From ab8d3554bb134fd96e622cd3c0a13406adbdff34 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 15:45:52 -0700
Subject: [PATCH 01/27] Bump vortex to tensor-decoupled

---
 src/main/resources/vsrc/vortex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index da54162..4dcbc31 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit da54162241da020807274bd4087844d379d8170e
+Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a

From 2ca2ee37b0fffeb7225940b03c206f3237f10b85 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 15:45:59 -0700
Subject: [PATCH 02/27] tensor: Fix writeback datawidth

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 617659d..65246f6 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -42,7 +42,7 @@ class TensorCoreDecoupled(
     val writeback = Decoupled(new Bundle {
       val last = Bool()
       val wid = UInt(numWarpBits.W)
-      val data = Vec(numLanes, UInt(wordSize.W))
+      val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
     val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -135,7 +135,7 @@ class TensorCoreDecoupled(
 
   // Execute stage
   // -------------
-  // Execute backend of the decoupled access/execute pipeline.
+  // Backend of the decoupled access/execute pipeline.
   //
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(io.respA, respQueueDepth)
@@ -144,7 +144,7 @@ class TensorCoreDecoupled(
   respQueueB.ready := io.writeback.ready // FIXME
 
   require(respQueueA.bits.data.widthOption.get ==
-          io.writeback.bits.data.widthOption.get * numLanes,
+          io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
   // FIXME: debug dummy: pipe A directly to writeback

From de393115cd97f812ceb91ef94598ca8d46570202 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 16:48:39 -0700
Subject: [PATCH 03/27] tensor: Translate TL response source to set/step tag

---
 .../radiance/core/TensorCoreDecoupled.scala   | 79 +++++++++++++------
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 65246f6..43dc1ca 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -32,8 +32,8 @@ class TensorCoreDecoupled(
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
-  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
+  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
 
   val io = IO(new Bundle {
     val initiate = Flipped(Decoupled(new Bundle {
@@ -51,6 +51,27 @@ class TensorCoreDecoupled(
   })
   dontTouch(io)
 
+  class TensorMemReq(
+    sourceWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val address = UInt(32.W)
+  }
+  class TensorMemResp(
+    sourceWidth: Int,
+    dataWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val data = UInt(dataWidth.W)
+  }
+  // mem response after translation from TL source to set/step tag
+  class TensorMemRespWithTag(
+    dataWidth: Int
+  ) extends Bundle {
+    val tag = new TensorMemTag
+    val data = UInt(dataWidth.W)
+  }
+
   // FSM
   // ---
   // This drives the overall pipeline of memory requests, dot-product unit
@@ -101,18 +122,39 @@ class TensorCoreDecoupled(
   //
   val genReq = (state === TensorState.run)
 
-  Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
-    case (req, resp) => {
-      val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds)))
+  class TensorMemTag extends Bundle {
+    val set = UInt(setBits.W)
+    val step = UInt(stepBits.W)
+  }
+  // use concatenation of set/step as the memory request source.  This will get
+  // translated to the actual TL sourcewidth in sourceGen.
+  val tag = Wire(new TensorMemTag)
+  tag.set := set
+  tag.step := step
+
+  val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach {
+    case (req, (resp, respTagged)) => {
+      val sourceGen = Module(new SourceGenerator(
+        log2Ceil(numSourceIds),
+        metadata = Some(tag)
+      ))
 
       sourceGen.io.gen := req.fire
-      sourceGen.io.meta := DontCare
+      sourceGen.io.meta := tag
       req.valid := genReq
       req.bits.address := 0.U // FIXME
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
       sourceGen.io.reclaim.bits := resp.bits.source
+
+      // translate source
+      respTagged.valid := resp.valid
+      respTagged.bits.tag := sourceGen.io.peek
+      respTagged.bits.data := resp.bits.data
+      resp.ready := respTagged.ready
     }
   }
 
@@ -130,16 +172,13 @@ class TensorCoreDecoupled(
     firedABReg := Seq(false.B, false.B)
   }
 
-  io.respA.ready := true.B // FIXME
-  io.respB.ready := true.B // FIXME
-
   // Execute stage
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
   val respQueueDepth = 4 // FIXME: parameterize
-  val respQueueA = Queue(io.respA, respQueueDepth)
-  val respQueueB = Queue(io.respB, respQueueDepth)
+  val respQueueA = Queue(respATagged, respQueueDepth)
+  val respQueueB = Queue(respBTagged, respQueueDepth)
   respQueueA.ready := io.writeback.ready // FIXME
   respQueueB.ready := io.writeback.ready // FIXME
 
@@ -149,9 +188,11 @@ class TensorCoreDecoupled(
 
   // FIXME: debug dummy: pipe A directly to writeback
   io.writeback.valid := respQueueA.valid
-  val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/)
+  val groupedRespA = respQueueA.bits.data
+                     .asBools.grouped(wordSize * 8/*bits*/)
+                     .map(VecInit(_).asUInt)
   (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) =>
-    wb := VecInit(data).asUInt
+    wb := data
   }
 
   // State transition
@@ -204,20 +245,6 @@ class TensorCoreDecoupled(
   // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 }
 
-class TensorMemReq(
-  sourceWidth: Int
-) extends Bundle {
-  val source = UInt(sourceWidth.W)
-  val address = UInt(32.W)
-}
-class TensorMemResp(
-  sourceWidth: Int,
-  dataWidth: Int
-) extends Bundle {
-  val source = UInt(sourceWidth.W)
-  val data = UInt(dataWidth.W)
-}
-
 // synthesizable unit tests
 
 // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy

From efaf599fbe679f0e5e7ef671522408f34984057e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 17:08:14 -0700
Subject: [PATCH 04/27] tensor: Assert alignment of A and B response queues

---
 .../radiance/core/TensorCoreDecoupled.scala   | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 43dc1ca..4f5ecb3 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -97,15 +97,16 @@ class TensorCoreDecoupled(
   // steps: i-j iteration
   val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
   val stepBits = log2Ceil(numSteps)
-  val set = RegInit(0.U(setBits.W))
-  val step = RegInit(0.U(stepBits.W))
+  // set and step being currently accessed in the acc/ex frontend
+  val setAccess = RegInit(0.U(setBits.W))
+  val stepAccess = RegInit(0.U(stepBits.W))
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
     busy := true.B
     warpReg := wid
-    set := 0.U
-    step := 0.U
+    setAccess := 0.U
+    stepAccess := 0.U
     when(io.writeback.fire) {
       assert(
         io.writeback.bits.wid =/= wid,
@@ -129,8 +130,8 @@ class TensorCoreDecoupled(
   // use concatenation of set/step as the memory request source.  This will get
   // translated to the actual TL sourcewidth in sourceGen.
   val tag = Wire(new TensorMemTag)
-  tag.set := set
-  tag.step := step
+  tag.set := setAccess
+  tag.step := stepAccess
 
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
@@ -176,16 +177,32 @@ class TensorCoreDecoupled(
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
+  // set and step being currently executed in the acc/ex backend
+  val setExecute = RegInit(0.U(setBits.W))
+  val stepExecute = RegInit(0.U(stepBits.W))
+
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
-  respQueueA.ready := io.writeback.ready // FIXME
-  respQueueB.ready := io.writeback.ready // FIXME
 
   require(respQueueA.bits.data.widthOption.get ==
           io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
+  val bothQueueValid = (respQueueA.valid && respQueueB.valid)
+  // assume in-order response and that A/B responses are always aligned; this
+  // might be too strong an assumption depending on the backing memory
+  when (bothQueueValid) {
+    assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
+           (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
+           "A and B response queue pointing to different set/steps. " ++
+           "This might indicate memory response coming back out-of-order.")
+  }
+  // synchronized dequeue
+  val deqResp = bothQueueValid && io.writeback.ready
+  respQueueA.ready := deqResp
+  respQueueB.ready := deqResp
+
   // FIXME: debug dummy: pipe A directly to writeback
   io.writeback.valid := respQueueA.valid
   val groupedRespA = respQueueA.bits.data
@@ -201,12 +218,12 @@ class TensorCoreDecoupled(
   // set/step sequencing logic
   val lastSet = ((1 << setBits) - 1)
   val lastStep = ((1 << stepBits) - 1)
-  val setDone = (set === lastSet.U)
-  val stepDone = (step === lastStep.U)
+  val setDone = (setAccess === lastSet.U)
+  val stepDone = (stepAccess === lastStep.U)
   when (nextStep) {
-    step := (step + 1.U) & lastStep.U
+    stepAccess := (stepAccess + 1.U) & lastStep.U
     when (stepDone) {
-      set := (set + 1.U) & lastSet.U
+      setAccess := (setAccess + 1.U) & lastSet.U
     }
   }
 

From e2abe1cffdc3a658b0acc5b2cb36a82d5a3450ec Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 19:12:15 -0700
Subject: [PATCH 05/27] tensor: Sequence set/step on the execute side

---
 .../radiance/core/TensorCoreDecoupled.scala   | 52 ++++++++++++-------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 4f5ecb3..7c07564 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -97,9 +97,16 @@ class TensorCoreDecoupled(
   // steps: i-j iteration
   val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
   val stepBits = log2Ceil(numSteps)
+  val lastSet = ((1 << setBits) - 1)
+  val lastStep = ((1 << stepBits) - 1)
+  def setDone(set: UInt) = (set === lastSet.U)
+  def stepDone(step: UInt) = (step === lastStep.U)
+
   // set and step being currently accessed in the acc/ex frontend
   val setAccess = RegInit(0.U(setBits.W))
   val stepAccess = RegInit(0.U(stepBits.W))
+  dontTouch(setAccess)
+  dontTouch(stepAccess)
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
@@ -118,6 +125,9 @@ class TensorCoreDecoupled(
     busy := false.B
   }
 
+  // serialize every HGMMA request
+  io.initiate.ready := !busy
+
   // Memory traffic generation
   // -------------------------
   //
@@ -166,10 +176,10 @@ class TensorCoreDecoupled(
     req.fire
   })
   val firedAB = (firedABNow.asUInt | firedABReg.asUInt)
-  val nextStep = firedAB.andR
+  val nextStepAccess = firedAB.andR
   // clear out firedABReg every step.  this will overwrite the previous fired
   // write upon the last fire out of A and B
-  when (nextStep) {
+  when (nextStepAccess) {
     firedABReg := Seq(false.B, false.B)
   }
 
@@ -180,6 +190,8 @@ class TensorCoreDecoupled(
   // set and step being currently executed in the acc/ex backend
   val setExecute = RegInit(0.U(setBits.W))
   val stepExecute = RegInit(0.U(stepBits.W))
+  dontTouch(setExecute)
+  dontTouch(stepExecute)
 
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
@@ -198,13 +210,19 @@ class TensorCoreDecoupled(
            "A and B response queue pointing to different set/steps. " ++
            "This might indicate memory response coming back out-of-order.")
   }
-  // synchronized dequeue
+  // dequeue is synchronized between A and B
+  // FIXME: this need to change to dpu_ready
   val deqResp = bothQueueValid && io.writeback.ready
   respQueueA.ready := deqResp
   respQueueB.ready := deqResp
+  // FIXME: this need to change to dpu_fire
+  val nextStepExecute = io.writeback.fire
+
+  io.writeback.valid := bothQueueValid
+  io.writeback.bits.wid := warpReg
+  io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
   // FIXME: debug dummy: pipe A directly to writeback
-  io.writeback.valid := respQueueA.valid
   val groupedRespA = respQueueA.bits.data
                      .asBools.grouped(wordSize * 8/*bits*/)
                      .map(VecInit(_).asUInt)
@@ -216,16 +234,17 @@ class TensorCoreDecoupled(
   // ----------------
   //
   // set/step sequencing logic
-  val lastSet = ((1 << setBits) - 1)
-  val lastStep = ((1 << stepBits) - 1)
-  val setDone = (setAccess === lastSet.U)
-  val stepDone = (stepAccess === lastStep.U)
-  when (nextStep) {
-    stepAccess := (stepAccess + 1.U) & lastStep.U
-    when (stepDone) {
-      setAccess := (setAccess + 1.U) & lastSet.U
+
+  def sequenceSetStep(set: UInt, step: UInt, nextStep: Bool) = {
+    when (nextStep) {
+      step := (step + 1.U) & lastStep.U
+      when (stepDone(step)) {
+        set := (set + 1.U) & lastSet.U
+      }
     }
   }
+  sequenceSetStep(setAccess, stepAccess, nextStepAccess)
+  sequenceSetStep(setExecute, stepExecute, nextStepExecute)
 
   switch(state) {
     is(TensorState.idle) {
@@ -234,7 +253,7 @@ class TensorCoreDecoupled(
       }
     }
     is(TensorState.run) {
-      when (setDone && stepDone && nextStep) {
+      when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) {
         when (state === TensorState.run) {
           state := TensorState.finish
         }
@@ -247,11 +266,6 @@ class TensorCoreDecoupled(
     }
   }
 
-  io.initiate.ready := !busy
-  io.writeback.valid := (state === TensorState.finish)
-  io.writeback.bits.wid := warpReg
-  io.writeback.bits.last := false.B // TODO
-
   // Writeback queues
   // ----------------
   // These queues hold the metadata necessary for register
@@ -328,7 +342,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
   tensor.io.initiate.bits.wid := 0.U // FIXME
   tensor.io.writeback.ready := true.B
 
-  io.finished := tensor.io.writeback.valid
+  io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
 }
 
 // a minimal Diplomacy graph with a tensor core and a TLRAM

From 444dd5d7e1c54ab78111fdcfac9ebf3145809f02 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 14:25:38 -0700
Subject: [PATCH 06/27] tensor: Add destination reg to IO

---
 .../scala/radiance/core/TensorCoreDecoupled.scala  | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 7c07564..92f98b7 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -28,12 +28,14 @@ class TensorCoreDecoupled(
     val numWarps: Int,
     val numLanes: Int,
     val numSourceIds: Int,
-    val tilingParams: TensorTilingParams
+    val tilingParams: TensorTilingParams,
+    val numFPRegs: Int = 32
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
   val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
+  val numFPRegBits = log2Ceil(numFPRegs)
 
   val io = IO(new Bundle {
     val initiate = Flipped(Decoupled(new Bundle {
@@ -42,6 +44,7 @@ class TensorCoreDecoupled(
     val writeback = Decoupled(new Bundle {
       val last = Bool()
       val wid = UInt(numWarpBits.W)
+      val rd = UInt(numFPRegBits.W)
       val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -218,8 +221,17 @@ class TensorCoreDecoupled(
   // FIXME: this need to change to dpu_fire
   val nextStepExecute = io.writeback.fire
 
+  def rdGen(set: UInt, step: UInt): UInt = {
+    // each step produces 4x4 output tile, written by 8 threads with 2 regs per
+    // thread
+    require(numLanes == 8, "currently assumes 8-wide warps")
+    (Cat(set, step) >> 1/*2 regs/thread*/)
+    // FIXME: add substep here
+  }
+
   io.writeback.valid := bothQueueValid
   io.writeback.bits.wid := warpReg
+  io.writeback.bits.rd := rdGen(setExecute, stepExecute)
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
   // FIXME: debug dummy: pipe A directly to writeback

From 77dae3e1f9941d15c213b19a43cd82bd0e00c81c Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 21:21:48 -0700
Subject: [PATCH 07/27] tensor: Write staging pipeline for A tile

---
 .../radiance/core/TensorCoreDecoupled.scala   | 103 ++++++++++++++----
 src/main/scala/radiance/core/TensorDPU.scala  |   1 +
 2 files changed, 83 insertions(+), 21 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 92f98b7..69b84f9 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -108,8 +108,12 @@ class TensorCoreDecoupled(
   // set and step being currently accessed in the acc/ex frontend
   val setAccess = RegInit(0.U(setBits.W))
   val stepAccess = RegInit(0.U(stepBits.W))
+  // we need full 4x4 A tile to fire DPU, but since the memory width is 8
+  // words, we need 2 cycles to read A.  `substep` tells which cycle we're at.
+  val substepAccess = RegInit(0.U(1.W))
   dontTouch(setAccess)
   dontTouch(stepAccess)
+  dontTouch(substepAccess)
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
@@ -139,16 +143,19 @@ class TensorCoreDecoupled(
   class TensorMemTag extends Bundle {
     val set = UInt(setBits.W)
     val step = UInt(stepBits.W)
+    val substep = UInt(1.W)
   }
   // use concatenation of set/step as the memory request source.  This will get
   // translated to the actual TL sourcewidth in sourceGen.
   val tag = Wire(new TensorMemTag)
   tag.set := setAccess
   tag.step := stepAccess
+  tag.substep := substepAccess
 
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
-  Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach {
+  Seq((io.reqA, (io.respA, respATagged)),
+      (io.reqB, (io.respB, respBTagged))).foreach {
     case (req, (resp, respTagged)) => {
       val sourceGen = Module(new SourceGenerator(
         log2Ceil(numSourceIds),
@@ -173,18 +180,22 @@ class TensorCoreDecoupled(
   }
 
   // only advance to the next step if we fired mem requests for both A and B
+  // TODO: @perf: too strict? should be able to have A and B progress
+  // separately
   val firedABReg = RegInit(VecInit(false.B, false.B))
   val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map {
     case (req, fired) => { when (req.fire) { fired := true.B } }
     req.fire
   })
   val firedAB = (firedABNow.asUInt | firedABReg.asUInt)
-  val nextStepAccess = firedAB.andR
-  // clear out firedABReg every step.  this will overwrite the previous fired
-  // write upon the last fire out of A and B
-  when (nextStepAccess) {
+  val nextSubstepAccess = firedAB.andR
+  val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
+  // clear out firedABReg every substep
+  when (nextSubstepAccess) {
     firedABReg := Seq(false.B, false.B)
+    substepAccess := substepAccess + 1.U
   }
+  require(substepAccess.widthOption.get == 1, "there should be only two substeps")
 
   // Execute stage
   // -------------
@@ -204,22 +215,72 @@ class TensorCoreDecoupled(
           io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
-  val bothQueueValid = (respQueueA.valid && respQueueB.valid)
-  // assume in-order response and that A/B responses are always aligned; this
-  // might be too strong an assumption depending on the backing memory
-  when (bothQueueValid) {
-    assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
-           (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
-           "A and B response queue pointing to different set/steps. " ++
-           "This might indicate memory response coming back out-of-order.")
-  }
-  // dequeue is synchronized between A and B
   // FIXME: this need to change to dpu_ready
-  val deqResp = bothQueueValid && io.writeback.ready
-  respQueueA.ready := deqResp
-  respQueueB.ready := deqResp
-  // FIXME: this need to change to dpu_fire
-  val nextStepExecute = io.writeback.fire
+  val dpuReady = io.writeback.ready // FIXME: this need be actual dpu
+
+  val substepExecute = RegInit(0.U(1.W))
+  when (respQueueA.fire) {
+    substepExecute := substepExecute + 1.U
+  }
+  dontTouch(substepExecute)
+
+  // note combinationally coupled ready with `pipe`
+  val halfAQueue = Module(new Queue(
+    chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true
+  ))
+  halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U)
+  halfAQueue.io.enq.bits := respQueueA.bits.data
+
+  // we need the full data for A because we divide the D tile by half along N;
+  // for B, the DPU can immediately start computing with a 4x2 tile.
+  //
+  // substep == 0 data goes to the LSB
+  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits)
+  val fullAQueue = Module(new Queue(
+    chiselTypeOf(fullAEnqData), entries = 1, pipe = true
+  ))
+  // hold first half A data for the first substep
+  halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) &&
+                             fullAQueue.io.enq.ready
+
+  require(fullAEnqData.widthOption.get == dataWidth * 2,
+          "assumes 2-cycle read for a full compute tile of A")
+  fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) &&
+                             halfAQueue.io.deq.valid
+  fullAQueue.io.enq.bits := fullAEnqData
+
+  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME?
+  val dpuFire = operandsValid && dpuReady
+  fullAQueue.io.deq.ready := dpuFire
+  val nextStepExecute = dpuFire
+
+  // FIXME: need to hold A for two cycles!!
+
+  // make sure to dequeue from response queues only when both A and B valid
+  respQueueA.ready := MuxCase(false.B,
+                              Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
+                                  (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
+  respQueueB.ready := dpuFire
+  dontTouch(respQueueA)
+  dontTouch(respQueueB)
+
+  // assert that the A and B response queue heads always point to the same
+  // set/step/substep
+  //
+  // this assumes that memory responses come back in-order.  this might be too
+  // strong an assumption depending on the backing memory
+  def assertAligned = {
+    val bothQueueValid = (respQueueA.valid && respQueueB.valid)
+    when (bothQueueValid && (substepExecute === 0.U)) {
+      assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
+        (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
+        "A and B response queue pointing to different set/steps. " ++
+        "This might indicate memory response coming back out-of-order.")
+    }
+    dontTouch(respQueueA.bits.tag)
+    dontTouch(respQueueB.bits.tag)
+  }
+  assertAligned
 
   def rdGen(set: UInt, step: UInt): UInt = {
     // each step produces 4x4 output tile, written by 8 threads with 2 regs per
@@ -229,7 +290,7 @@ class TensorCoreDecoupled(
     // FIXME: add substep here
   }
 
-  io.writeback.valid := bothQueueValid
+  io.writeback.valid := operandsValid // FIXME: bypass logic
   io.writeback.bits.wid := warpReg
   io.writeback.bits.rd := rdGen(setExecute, stepExecute)
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala
index 4e6cee7..a82bed7 100644
--- a/src/main/scala/radiance/core/TensorDPU.scala
+++ b/src/main/scala/radiance/core/TensorDPU.scala
@@ -27,6 +27,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
       val b = Vec(dotProductDim, Bits((inFLen).W))
       val c = Bits((outFLen).W) // note C has the out length for accumulation
     }))
+    // 'stall' is effectively out.ready, combinationally coupled to in.ready
     val stall = Input(Bool())
     val out = Valid(new Bundle {
       val data = Bits((outFLen).W)

From 6cad8edd1838642cbbb61ef6998c8318d96864e1 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 22:01:02 -0700
Subject: [PATCH 08/27] tensor: Fix operand alignment in pipelining

---
 .../radiance/core/TensorCoreDecoupled.scala   | 56 +++++++++++--------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 69b84f9..0654df3 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -224,37 +224,51 @@ class TensorCoreDecoupled(
   }
   dontTouch(substepExecute)
 
+  // Do pipelining for the A operand so that we obtain the full 4x4 A tile
+  // ready for compute.  The pipeline is two-stage:
+  //   - stage one (halfAQueue) for assembling the full A tile from half-tiles
+  //     coming from the resp queue, and
+  //   - stage two (fullAQueue) for holding the full A tile until it gets
+  //     matched with two 4x2 B tiles, and compute is complete.
+  //
+  // Note that the half-tile assembly is unnecessary for B since the B tile is
+  // only 4x2.
+  // Also send the set/step tag along the pipe for alignment check.
+
   // note combinationally coupled ready with `pipe`
   val halfAQueue = Module(new Queue(
-    chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true
+    chiselTypeOf(respQueueA.bits), entries = 1, pipe = true
   ))
   halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U)
-  halfAQueue.io.enq.bits := respQueueA.bits.data
+  halfAQueue.io.enq.bits := respQueueA.bits
 
-  // we need the full data for A because we divide the D tile by half along N;
-  // for B, the DPU can immediately start computing with a 4x2 tile.
-  //
   // substep == 0 data goes to the LSB
-  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits)
+  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data)
+  require(fullAEnqData.widthOption.get == dataWidth * 2,
+          "assumes 2-cycle read for a full compute tile of A")
+  // only use the lower halfA's tag.  substep will be incorrect.
+  val fullAEnqTag = halfAQueue.io.deq.bits.tag
   val fullAQueue = Module(new Queue(
-    chiselTypeOf(fullAEnqData), entries = 1, pipe = true
+    new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
   ))
   // hold first half A data for the first substep
   halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) &&
                              fullAQueue.io.enq.ready
-
-  require(fullAEnqData.widthOption.get == dataWidth * 2,
-          "assumes 2-cycle read for a full compute tile of A")
   fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) &&
                              halfAQueue.io.deq.valid
-  fullAQueue.io.enq.bits := fullAEnqData
+  fullAQueue.io.enq.bits.data := fullAEnqData
+  fullAQueue.io.enq.bits.tag := fullAEnqTag
 
   val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME?
   val dpuFire = operandsValid && dpuReady
-  fullAQueue.io.deq.ready := dpuFire
-  val nextStepExecute = dpuFire
+  val substepCompute = RegInit(0.U(1.W))
+  when (dpuFire) {
+    substepCompute := substepCompute + 1.U
+  }
 
-  // FIXME: need to hold A for two cycles!!
+  // hold full A until two-cycle compute is done
+  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
   // make sure to dequeue from response queues only when both A and B valid
   respQueueA.ready := MuxCase(false.B,
@@ -264,21 +278,17 @@ class TensorCoreDecoupled(
   dontTouch(respQueueA)
   dontTouch(respQueueB)
 
-  // assert that the A and B response queue heads always point to the same
-  // set/step/substep
+  // assert that the DPU is computing with operands of the same set/step
   //
   // this assumes that memory responses come back in-order.  this might be too
   // strong an assumption depending on the backing memory
   def assertAligned = {
-    val bothQueueValid = (respQueueA.valid && respQueueB.valid)
-    when (bothQueueValid && (substepExecute === 0.U)) {
-      assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
-        (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
-        "A and B response queue pointing to different set/steps. " ++
+    when (dpuFire) {
+      assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) &&
+             (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step),
+        "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
-    dontTouch(respQueueA.bits.tag)
-    dontTouch(respQueueB.bits.tag)
   }
   assertAligned
 

From 23edc34c7ebc28623a5961abc654d7f4049c4864 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 22:15:35 -0700
Subject: [PATCH 09/27] tensor: Add two-TLRAM config for full-throughput test

---
 .../radiance/core/TensorCoreDecoupled.scala   | 38 ++++++++++++++++---
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 0654df3..154a3cf 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -155,8 +155,8 @@ class TensorCoreDecoupled(
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
-      (io.reqB, (io.respB, respBTagged))).foreach {
-    case (req, (resp, respTagged)) => {
+      (io.reqB, (io.respB, respBTagged))).zipWithIndex.foreach {
+    case ((req, (resp, respTagged)), i) => {
       val sourceGen = Module(new SourceGenerator(
         log2Ceil(numSourceIds),
         metadata = Some(tag)
@@ -165,7 +165,9 @@ class TensorCoreDecoupled(
       sourceGen.io.gen := req.fire
       sourceGen.io.meta := tag
       req.valid := genReq
-      req.bits.address := 0.U // FIXME
+      // FIXME: bogus address
+      // req.bits.address := (if (i == 0) 0.U else 0x100.U) // avoids bank conflict for A and B
+      req.bits.address := 0.U
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
@@ -270,7 +272,8 @@ class TensorCoreDecoupled(
   fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
   val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
-  // make sure to dequeue from response queues only when both A and B valid
+  // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
+  // on the substep
   respQueueA.ready := MuxCase(false.B,
                               Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
                                   (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
@@ -446,10 +449,35 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
   }
 }
 
+// two separate TLRAMs for A and B for full throughput
+class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
+  val tensor = LazyModule(new TensorCoreDecoupledTL)
+  val xbar = LazyModule(new TLXbar)
+  val ramA = LazyModule(new TLRAM(
+    address = AddressSet(0x000, 0xfffeff),
+    beatBytes = 32 // FIXME: hardcoded
+  ))
+  val ramB = LazyModule(new TLRAM(
+    address = AddressSet(0x100, 0xfffeff),
+    beatBytes = 32 // FIXME: hardcoded
+  ))
+
+  xbar.node :=* tensor.node
+  ramA.node := xbar.node
+  ramB.node := xbar.node
+
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with UnitTestModule {
+    tensor.module.io.start := io.start
+    io.finished := tensor.module.io.finished
+  }
+}
+
 // unit test harness
 class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters)
     extends UnitTest(timeout) {
-  val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  // val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  val dut = Module(LazyModule(new TensorCoreDecoupledTwoTLRAM).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
 }

From e1e3ac8274bd02954ff4d64ad9462ef4a8bb2f1b Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 22:22:27 -0700
Subject: [PATCH 10/27] tensor: Fix busy state

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 154a3cf..652608b 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -128,7 +128,13 @@ class TensorCoreDecoupled(
       )
     }
   }
-  when(io.writeback.fire) {
+
+  // TODO: @perf: Instead of waiting until the last writeback, release busy as
+  // soon as the access frontend is complete so that there's a better chance to
+  // saturate the backend with back-to-back HGMMAs.  This would require sending
+  // the 'wid' register to backend instead of having it shared with the
+  // frontend.
+  when(io.writeback.fire && io.writeback.bits.last) {
     busy := false.B
   }
 

From 8847278ad1d54fe3167f01e0b9f70fcd3dd01096 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 14:37:33 -0700
Subject: [PATCH 11/27] tensor: Instantiate actual DPU

---
 .../radiance/core/TensorCoreDecoupled.scala   | 93 +++++++++++++++----
 1 file changed, 73 insertions(+), 20 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 652608b..b9695ad 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -33,8 +33,9 @@ class TensorCoreDecoupled(
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
+  val wordSizeInBits = wordSize * 8 // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
-  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
+  val dataWidth = numLanes * wordSizeInBits // TODO FP16
   val numFPRegBits = log2Ceil(numFPRegs)
 
   val io = IO(new Bundle {
@@ -45,7 +46,7 @@ class TensorCoreDecoupled(
       val last = Bool()
       val wid = UInt(numWarpBits.W)
       val rd = UInt(numFPRegBits.W)
-      val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W))
+      val data = Vec(numLanes, UInt((wordSizeInBits).W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
     val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -223,9 +224,6 @@ class TensorCoreDecoupled(
           io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
-  // FIXME: this need to change to dpu_ready
-  val dpuReady = io.writeback.ready // FIXME: this need be actual dpu
-
   val substepExecute = RegInit(0.U(1.W))
   when (respQueueA.fire) {
     substepExecute := substepExecute + 1.U
@@ -267,7 +265,10 @@ class TensorCoreDecoupled(
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
 
-  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME?
+  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid
+  val operandA = fullAQueue.io.deq.bits.data
+  val operandB = respQueueB.bits.data
+  val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val substepCompute = RegInit(0.U(1.W))
   when (dpuFire) {
@@ -301,6 +302,66 @@ class TensorCoreDecoupled(
   }
   assertAligned
 
+  // Dot-product unit
+  //
+  // 4x2 four-element DPUs summing up to 32 MACs in total
+  val dpus = Seq.fill(4)(Seq.fill(2)(
+    Module(new TensorDotProductUnit(half = false))
+  ))
+  // operandA is 4x4 in K-major
+  val operandADimensional =
+    operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    .grouped(4).toSeq
+  println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits")
+  println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}")
+  assert(operandADimensional.length == tilingParams.mc &&
+         operandADimensional(0).length == tilingParams.kc,
+         "operand width doesn't agree with tiling parameter")
+  // operandB is 2x4, i.e. 4x2 in N-major
+  val operandBDimensional =
+    operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    .grouped(4).toSeq
+  println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}")
+  val ncSubstep = tilingParams.nc / 2
+  assert(tilingParams.mc * ncSubstep == numLanes,
+         "substep tile size doesn't match writeback throughput")
+  assert(operandBDimensional.length == ncSubstep &&
+         operandBDimensional(0).length == tilingParams.kc,
+         "operand width doesn't agree with tiling parameter")
+
+  for (m <- 0 until tilingParams.mc) {
+    for (n <- 0 until ncSubstep) {
+      dpus(m)(n).io.in.valid := dpuFire
+      dpus(m)(n).io.in.bits.a := operandADimensional(m)
+      dpus(m)(n).io.in.bits.b := operandBDimensional(n)
+      dpus(m)(n).io.in.bits.c := 0.U // FIXME: bogus accum data
+      // dpu ready couples with writeback backpressure
+      dpus(m)(n).io.stall := !io.writeback.ready
+    }
+  }
+  dpuReady := !dpus(0)(0).io.stall
+  dontTouch(dpuFire)
+  dontTouch(dpuReady)
+
+  val dpuValids = dpus.flatMap(_.map(_.io.out.valid))
+  val dpuValid = dpuValids.reduce(_ && _)
+  def assertDPU = {
+    val dpuStalls = dpus.flatMap(_.map(_.io.stall))
+    assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _),
+      "stall signals of DPUs went unaligned")
+    assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _),
+      "valid signals of DPUs went unaligned")
+  }
+  assertDPU
+
+  // flatten DPU output into 1D array in M-major order
+  val flattenedDPUOut = (0 until ncSubstep).flatMap { n =>
+    (0 until tilingParams.mc).map { m =>
+      dpus(m)(n).io.out.bits.data
+    }
+  }
+  io.writeback.bits.data := flattenedDPUOut
+
   def rdGen(set: UInt, step: UInt): UInt = {
     // each step produces 4x4 output tile, written by 8 threads with 2 regs per
     // thread
@@ -309,19 +370,11 @@ class TensorCoreDecoupled(
     // FIXME: add substep here
   }
 
-  io.writeback.valid := operandsValid // FIXME: bypass logic
+  io.writeback.valid := dpuValid
   io.writeback.bits.wid := warpReg
   io.writeback.bits.rd := rdGen(setExecute, stepExecute)
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
-  // FIXME: debug dummy: pipe A directly to writeback
-  val groupedRespA = respQueueA.bits.data
-                     .asBools.grouped(wordSize * 8/*bits*/)
-                     .map(VecInit(_).asUInt)
-  (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) =>
-    wb := data
-  }
-
   // State transition
   // ----------------
   //
@@ -400,7 +453,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
 
   val tensor = Module(new TensorCoreDecoupled(
                       8, 8, outer.numSrcIds , TensorTilingParams()))
-  val wordSize = 4 // FIXME: hardcoded
+  val wordSize = 4 // @cleanup: hardcoded
 
   val zip = Seq((outer.node.out(0), tensor.io.reqA),
                 (outer.node.out(1), tensor.io.reqB))
@@ -431,7 +484,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
   tlOutB.d.ready := tensor.io.respB.ready
 
   tensor.io.initiate.valid := io.start
-  tensor.io.initiate.bits.wid := 0.U // FIXME
+  tensor.io.initiate.bits.wid := 0.U // TODO
   tensor.io.writeback.ready := true.B
 
   io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
@@ -443,7 +496,7 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
   val xbar = LazyModule(new TLXbar)
   val ram = LazyModule(new TLRAM(
     address = AddressSet(0x0000, 0xffffff),
-    beatBytes = 32 // FIXME: hardcoded
+    beatBytes = 32 // @cleanup: hardcoded
   ))
 
   ram.node :=* xbar.node :=* tensor.node
@@ -461,11 +514,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
   val xbar = LazyModule(new TLXbar)
   val ramA = LazyModule(new TLRAM(
     address = AddressSet(0x000, 0xfffeff),
-    beatBytes = 32 // FIXME: hardcoded
+    beatBytes = 32 // @cleanup: hardcoded
   ))
   val ramB = LazyModule(new TLRAM(
     address = AddressSet(0x100, 0xfffeff),
-    beatBytes = 32 // FIXME: hardcoded
+    beatBytes = 32 // @cleanup: hardcoded
   ))
 
   xbar.node :=* tensor.node

From 7de8e86d4f04712f90c4457940c02a341b721f76 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 15:18:47 -0700
Subject: [PATCH 12/27] tensor: Sync rd with DPU using a queue

---
 .../radiance/core/TensorCoreDecoupled.scala   | 44 ++++++++++++-------
 src/main/scala/radiance/core/TensorDPU.scala  |  2 +-
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index b9695ad..92a6596 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -270,6 +270,8 @@ class TensorCoreDecoupled(
   val operandB = respQueueB.bits.data
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
+  val setCompute = fullAQueue.io.deq.bits.tag.set
+  val stepCompute = fullAQueue.io.deq.bits.tag.step
   val substepCompute = RegInit(0.U(1.W))
   when (dpuFire) {
     substepCompute := substepCompute + 1.U
@@ -348,9 +350,9 @@ class TensorCoreDecoupled(
   def assertDPU = {
     val dpuStalls = dpus.flatMap(_.map(_.io.stall))
     assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _),
-      "stall signals of DPUs went unaligned")
+      "stall signals of DPUs went out of sync")
     assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _),
-      "valid signals of DPUs went unaligned")
+      "valid signals of DPUs went out of sync")
   }
   assertDPU
 
@@ -362,17 +364,36 @@ class TensorCoreDecoupled(
   }
   io.writeback.bits.data := flattenedDPUOut
 
-  def rdGen(set: UInt, step: UInt): UInt = {
+  // Writeback queues
+  // ----------------
+  // These queues hold metadata needed for writeback in sync with the DPU.
+
+  val queueDepth = 4 // needs to be at least the DPU latency
+  val rdQueue = Module(new Queue(
+    chiselTypeOf(io.writeback.bits.rd), queueDepth
+  ))
+  rdQueue.io.enq.valid := dpuFire
+  rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute)
+  rdQueue.io.deq.ready := io.writeback.fire
+  assert(rdQueue.io.enq.ready === true.B,
+         "rd queue full, throttling DPU operation")
+  assert(!dpuValid || rdQueue.io.deq.valid,
+         "rd queue and DPU went out of sync")
+
+  // TODO: decouple wid from frontend
+  // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
+
+  // note rd is independent to sets
+  def rdGen(step: UInt, substep: UInt): UInt = {
     // each step produces 4x4 output tile, written by 8 threads with 2 regs per
     // thread
-    require(numLanes == 8, "currently assumes 8-wide warps")
-    (Cat(set, step) >> 1/*2 regs/thread*/)
-    // FIXME: add substep here
+    (step << 1/*2 substeps*/) + substep
   }
 
   io.writeback.valid := dpuValid
   io.writeback.bits.wid := warpReg
-  io.writeback.bits.rd := rdGen(setExecute, stepExecute)
+  io.writeback.bits.rd := rdQueue.io.deq.bits
+  // FIXME: look at set/step of dpu output not setExecute
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
   // State transition
@@ -410,15 +431,6 @@ class TensorCoreDecoupled(
       }
     }
   }
-
-  // Writeback queues
-  // ----------------
-  // These queues hold the metadata necessary for register
-  // writeback.
-
-  // val queueDepth = 2
-  // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
-  // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 }
 
 // synthesizable unit tests
diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala
index a82bed7..515b1bf 100644
--- a/src/main/scala/radiance/core/TensorDPU.scala
+++ b/src/main/scala/radiance/core/TensorDPU.scala
@@ -53,7 +53,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
   io.out.bits.data := ieee(box(dpu.io.out.bits.data, S))
 }
 
-// Copied from chisel3.util.Pipe.
+// An implementation of chisel3.util.Pipe that supports stalls.
 class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module {
   /** A non-ambiguous name of this `StallingPipe` for use in generated Verilog
    *  names. Includes the latency cycle count in the name as well as the

From 2741af0b2b36026cfe57ca227eb469d6643d4c12 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 15:43:44 -0700
Subject: [PATCH 13/27] tensor: Keep set/step in the tag writeback queue

---
 .../radiance/core/TensorCoreDecoupled.scala   | 39 +++++++++++--------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 92a6596..3d00c35 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -267,6 +267,7 @@ class TensorCoreDecoupled(
 
   val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid
   val operandA = fullAQueue.io.deq.bits.data
+  val operandATag = fullAQueue.io.deq.bits.tag
   val operandB = respQueueB.bits.data
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
@@ -314,8 +315,6 @@ class TensorCoreDecoupled(
   val operandADimensional =
     operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4).toSeq
-  println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits")
-  println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}")
   assert(operandADimensional.length == tilingParams.mc &&
          operandADimensional(0).length == tilingParams.kc,
          "operand width doesn't agree with tiling parameter")
@@ -323,7 +322,6 @@ class TensorCoreDecoupled(
   val operandBDimensional =
     operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4).toSeq
-  println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}")
   val ncSubstep = tilingParams.nc / 2
   assert(tilingParams.mc * ncSubstep == numLanes,
          "substep tile size doesn't match writeback throughput")
@@ -369,18 +367,20 @@ class TensorCoreDecoupled(
   // These queues hold metadata needed for writeback in sync with the DPU.
 
   val queueDepth = 4 // needs to be at least the DPU latency
-  val rdQueue = Module(new Queue(
-    chiselTypeOf(io.writeback.bits.rd), queueDepth
+  val tagQueue = Module(new Queue(
+    chiselTypeOf(operandATag), queueDepth
   ))
-  rdQueue.io.enq.valid := dpuFire
-  rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute)
-  rdQueue.io.deq.ready := io.writeback.fire
-  assert(rdQueue.io.enq.ready === true.B,
-         "rd queue full, throttling DPU operation")
-  assert(!dpuValid || rdQueue.io.deq.valid,
-         "rd queue and DPU went out of sync")
+  tagQueue.io.enq.valid := dpuFire
+  // A and B should have the same tags
+  tagQueue.io.enq.bits := operandATag
+  // @cleanup: awkward
+  tagQueue.io.enq.bits.substep := substepCompute
+  tagQueue.io.deq.ready := io.writeback.fire
+  assert(tagQueue.io.enq.ready === true.B,
+         "tag queue full, DPU operation might be throttled")
+  assert(!dpuValid || tagQueue.io.deq.valid,
+         "tag queue and DPU went out of sync")
 
-  // TODO: decouple wid from frontend
   // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 
   // note rd is independent to sets
@@ -390,11 +390,14 @@ class TensorCoreDecoupled(
     (step << 1/*2 substeps*/) + substep
   }
 
+  val setWriteback = tagQueue.io.deq.bits.set
+  val stepWriteback = tagQueue.io.deq.bits.step
+  val substepWriteback = tagQueue.io.deq.bits.substep
   io.writeback.valid := dpuValid
+  // TODO: decouple wid from frontend
   io.writeback.bits.wid := warpReg
-  io.writeback.bits.rd := rdQueue.io.deq.bits
-  // FIXME: look at set/step of dpu output not setExecute
-  io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
+  io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback)
+  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback)
 
   // State transition
   // ----------------
@@ -500,6 +503,10 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
   tensor.io.writeback.ready := true.B
 
   io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
+  when (io.finished) {
+    // might be too strong
+    assert(tensor.io.writeback.bits.rd === 31.U)
+  }
 }
 
 // a minimal Diplomacy graph with a tensor core and a TLRAM

From a2519da58fe1397a7570656a4726f42693e8d845 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 16:36:18 -0700
Subject: [PATCH 14/27] tensor: SMEM address generation

---
 .../radiance/core/TensorCoreDecoupled.scala   | 51 +++++++++++++++++--
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 3d00c35..f7c8547 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -159,6 +159,48 @@ class TensorCoreDecoupled(
   tag.step := stepAccess
   tag.substep := substepAccess
 
+  // @cleanup: generalize in terms of M/N/K-majorness?
+  def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt)
+      : (UInt/*A*/, UInt/*B*/) = {
+    // note that step iterates along N first, then M
+    val numComputeTilesM = tilingParams.m / tilingParams.mc
+    val numComputeTilesN = tilingParams.n / tilingParams.nc
+    val tileM = step % numComputeTilesM.U
+    val tileN = step / numComputeTilesM.U
+    val mcSubstep  = tilingParams.mc / 2
+    val ncSubstep  = tilingParams.nc / 2
+
+    // note that both A and B are K-major to facilitate bank conflict-free SMEM
+    // accesses
+    //
+    // (row,col) coordinate of the compute tile
+    val tileRowA = tileM // M
+    val tileColA = set   // K
+    val tileRowB = tileN // N
+    val tileColB = set   // K
+    // (row,col) coordinate of the starting element of the compute tile
+    val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) +
+                   (substep << log2Ceil(mcSubstep))
+    val elemColA = tileColA << log2Ceil(tilingParams.kc)
+    val elemRowB = tileRowB << log2Ceil(tilingParams.nc)
+                   (substep << log2Ceil(ncSubstep))
+    val elemColB = tileColB << log2Ceil(tilingParams.kc)
+    val rowStrideA = wordSize * tilingParams.k
+    val rowStrideABits = log2Ceil(rowStrideA)
+    val rowStrideB = wordSize * tilingParams.k
+    val rowStrideBBits = log2Ceil(rowStrideB)
+    val wordStrideBits = log2Ceil(wordSize)
+
+    val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits)
+    val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits)
+
+    (baseA + tileOffsetA, baseB + tileOffsetB)
+  }
+
+  // FIXME: bogus base address
+  val (addressA, addressB) =
+    addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
+
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
@@ -172,9 +214,7 @@ class TensorCoreDecoupled(
       sourceGen.io.gen := req.fire
       sourceGen.io.meta := tag
       req.valid := genReq
-      // FIXME: bogus address
-      // req.bits.address := (if (i == 0) 0.U else 0x100.U) // avoids bank conflict for A and B
-      req.bits.address := 0.U
+      req.bits.address := (if (i == 0) addressA else addressB)
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
@@ -366,7 +406,7 @@ class TensorCoreDecoupled(
   // ----------------
   // These queues hold metadata needed for writeback in sync with the DPU.
 
-  val queueDepth = 4 // needs to be at least the DPU latency
+  val queueDepth = 6 // needs to be at least the DPU latency
   val tagQueue = Module(new Queue(
     chiselTypeOf(operandATag), queueDepth
   ))
@@ -397,7 +437,8 @@ class TensorCoreDecoupled(
   // TODO: decouple wid from frontend
   io.writeback.bits.wid := warpReg
   io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback)
-  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback)
+  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) &&
+                            (substepWriteback === 1.U)
 
   // State transition
   // ----------------

From 64ea48ace3681e0a74a732fb4da006717e62b873 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 13:46:04 -0700
Subject: [PATCH 15/27] tensor: Consider data reuse for B memory request

B is reused every 4 steps because of the k->i->j iteration order.
---
 .../radiance/core/TensorCoreDecoupled.scala   | 111 ++++++++++--------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index f7c8547..897edb2 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -145,8 +145,6 @@ class TensorCoreDecoupled(
   // Memory traffic generation
   // -------------------------
   //
-  val genReq = (state === TensorState.run)
-
   class TensorMemTag extends Bundle {
     val set = UInt(setBits.W)
     val step = UInt(stepBits.W)
@@ -159,16 +157,14 @@ class TensorCoreDecoupled(
   tag.step := stepAccess
   tag.substep := substepAccess
 
+  val numTilesM = tilingParams.m / tilingParams.mc
+  val numTilesN = tilingParams.n / tilingParams.nc
   // @cleanup: generalize in terms of M/N/K-majorness?
   def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt)
       : (UInt/*A*/, UInt/*B*/) = {
     // note that step iterates along N first, then M
-    val numComputeTilesM = tilingParams.m / tilingParams.mc
-    val numComputeTilesN = tilingParams.n / tilingParams.nc
-    val tileM = step % numComputeTilesM.U
-    val tileN = step / numComputeTilesM.U
-    val mcSubstep  = tilingParams.mc / 2
-    val ncSubstep  = tilingParams.nc / 2
+    val tileM = step % numTilesM.U
+    val tileN = step / numTilesM.U
 
     // note that both A and B are K-major to facilitate bank conflict-free SMEM
     // accesses
@@ -180,11 +176,11 @@ class TensorCoreDecoupled(
     val tileColB = set   // K
     // (row,col) coordinate of the starting element of the compute tile
     val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) +
-                   (substep << log2Ceil(mcSubstep))
-    val elemColA = tileColA << log2Ceil(tilingParams.kc)
-    val elemRowB = tileRowB << log2Ceil(tilingParams.nc)
-                   (substep << log2Ceil(ncSubstep))
-    val elemColB = tileColB << log2Ceil(tilingParams.kc)
+                    (substep << log2Ceil(tilingParams.mc / 2))
+    val elemColA =  tileColA << log2Ceil(tilingParams.kc)
+    val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) +
+                    (substep << log2Ceil(tilingParams.nc / 2))
+    val elemColB =  tileColB << log2Ceil(tilingParams.kc)
     val rowStrideA = wordSize * tilingParams.k
     val rowStrideABits = log2Ceil(rowStrideA)
     val rowStrideB = wordSize * tilingParams.k
@@ -201,6 +197,13 @@ class TensorCoreDecoupled(
   val (addressA, addressB) =
     addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
 
+  val genReqA = (state === TensorState.run)
+  val numTilesMBits = log2Ceil(numTilesM)
+  // generate B request at every 4 steps.  B achieves reuse through outer
+  // product so it doesn't require access at every step
+  val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U
+  val genReqB = (state === TensorState.run) && shouldFireB
+
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
@@ -213,7 +216,7 @@ class TensorCoreDecoupled(
 
       sourceGen.io.gen := req.fire
       sourceGen.io.meta := tag
-      req.valid := genReq
+      req.valid := (if (i == 0) genReqA else genReqB)
       req.bits.address := (if (i == 0) addressA else addressB)
       req.bits.source := sourceGen.io.id.bits
 
@@ -228,23 +231,27 @@ class TensorCoreDecoupled(
     }
   }
 
-  // only advance to the next step if we fired mem requests for both A and B
-  // TODO: @perf: too strict? should be able to have A and B progress
-  // separately
-  val firedABReg = RegInit(VecInit(false.B, false.B))
-  val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map {
-    case (req, fired) => { when (req.fire) { fired := true.B } }
-    req.fire
-  })
-  val firedAB = (firedABNow.asUInt | firedABReg.asUInt)
-  val nextSubstepAccess = firedAB.andR
+  // only advance to the next step if we fired mem requests for both A and B.
+  // also consider that B doesn't have to be fired every time due to reuse.
+  // @perf: too strict? should be able to have A and B progress separately
+  val firedAReg = RegInit(false.B)
+  val firedBReg = RegInit(false.B)
+  when (io.reqA.fire) { firedAReg := true.B }
+  when (io.reqB.fire) { firedBReg := true.B }
+  val firedANow = io.reqA.fire
+  val firedBNow = io.reqB.fire
+  val firedA = firedAReg || firedANow
+  val firedB = firedBReg || firedBNow
+  val nextSubstepAccess = firedA && (!shouldFireB || firedB)
   val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
   // clear out firedABReg every substep
   when (nextSubstepAccess) {
-    firedABReg := Seq(false.B, false.B)
+    firedAReg := false.B
+    firedBReg := false.B
     substepAccess := substepAccess + 1.U
   }
   require(substepAccess.widthOption.get == 1, "there should be only two substeps")
+  dontTouch(shouldFireB)
 
   // Execute stage
   // -------------
@@ -327,18 +334,26 @@ class TensorCoreDecoupled(
   respQueueA.ready := MuxCase(false.B,
                               Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
                                   (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
-  respQueueB.ready := dpuFire
+  // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
+  // we fully iterated a column (M-dimension).
+  val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
+  val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask
+  respQueueB.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
+  dontTouch(shouldDequeueB)
 
-  // assert that the DPU is computing with operands of the same set/step
+  // Assert that the DPU is computing with operands of the same set/step. Note
+  // that the B resp will only have step values multiple of 4 due to reuse.
   //
-  // this assumes that memory responses come back in-order.  this might be too
-  // strong an assumption depending on the backing memory
+  // This check assumes that memory responses come back in-order.  Might be too
+  // strong of an assumption depending on the backing memory.
   def assertAligned = {
+    val stepMask = ~((1 << numTilesMBits) - 1).U(stepBits.W) // compare only step bits above the M-tile index
     when (dpuFire) {
       assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) &&
-             (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step),
+             ((fullAQueue.io.deq.bits.tag.step & stepMask) ===
+              (respQueueB.bits.tag.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
@@ -348,26 +363,26 @@ class TensorCoreDecoupled(
   // Dot-product unit
   //
   // 4x2 four-element DPUs summing up to 32 MACs in total
-  val dpus = Seq.fill(4)(Seq.fill(2)(
+  val ncSubstep = tilingParams.nc / 2
+  val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)(
     Module(new TensorDotProductUnit(half = false))
   ))
   // operandA is 4x4 in K-major
   val operandADimensional =
     operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4).toSeq
-  assert(operandADimensional.length == tilingParams.mc &&
-         operandADimensional(0).length == tilingParams.kc,
-         "operand width doesn't agree with tiling parameter")
-  // operandB is 2x4, i.e. 4x2 in N-major
+    .grouped(4/*k-dim*/).toSeq
+  require(operandADimensional.length == tilingParams.mc &&
+          operandADimensional(0).length == tilingParams.kc,
+          "operand width doesn't agree with tiling parameter")
+  // operandB is 2x4 in K-major
   val operandBDimensional =
     operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4).toSeq
-  val ncSubstep = tilingParams.nc / 2
-  assert(tilingParams.mc * ncSubstep == numLanes,
-         "substep tile size doesn't match writeback throughput")
-  assert(operandBDimensional.length == ncSubstep &&
-         operandBDimensional(0).length == tilingParams.kc,
-         "operand width doesn't agree with tiling parameter")
+    .grouped(4/*k-dim*/).toSeq
+  require(tilingParams.mc * ncSubstep == numLanes,
+          "substep tile size doesn't match writeback throughput")
+  require(operandBDimensional.length == ncSubstep &&
+          operandBDimensional(0).length == tilingParams.kc,
+          "operand width doesn't agree with tiling parameter")
 
   for (m <- 0 until tilingParams.mc) {
     for (n <- 0 until ncSubstep) {
@@ -406,10 +421,8 @@ class TensorCoreDecoupled(
   // ----------------
   // These queues hold metadata needed for writeback in sync with the DPU.
 
-  val queueDepth = 6 // needs to be at least the DPU latency
-  val tagQueue = Module(new Queue(
-    chiselTypeOf(operandATag), queueDepth
-  ))
+  val queueDepth = 5 // needs to be at least the DPU latency
+  val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth))
   tagQueue.io.enq.valid := dpuFire
   // A and B should have the same tags
   tagQueue.io.enq.bits := operandATag
@@ -573,11 +586,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
   val tensor = LazyModule(new TensorCoreDecoupledTL)
   val xbar = LazyModule(new TLXbar)
   val ramA = LazyModule(new TLRAM(
-    address = AddressSet(0x000, 0xfffeff),
+    address = AddressSet(0x000, 0xfffbff),
     beatBytes = 32 // @cleanup: hardcoded
   ))
   val ramB = LazyModule(new TLRAM(
-    address = AddressSet(0x100, 0xfffeff),
+    address = AddressSet(0x400, 0xfffbff),
     beatBytes = 32 // @cleanup: hardcoded
   ))
 

From c2f39f74749df7fac8ba63d8900d6651eea72f71 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 16:21:43 -0700
Subject: [PATCH 16/27] tensor: Rename substepExecute

---
 .../radiance/core/TensorCoreDecoupled.scala    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 897edb2..f7c6c63 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -269,13 +269,13 @@ class TensorCoreDecoupled(
 
   require(respQueueA.bits.data.widthOption.get ==
           io.writeback.bits.data.widthOption.get,
-    "response data width does not match the writeback data width")
+          "response data width does not match the writeback data width")
 
-  val substepExecute = RegInit(0.U(1.W))
+  val substepDeqA = RegInit(0.U(1.W))
   when (respQueueA.fire) {
-    substepExecute := substepExecute + 1.U
+    substepDeqA := substepDeqA + 1.U
   }
-  dontTouch(substepExecute)
+  dontTouch(substepDeqA)
 
   // Do pipelining for the A operand so that we obtain the full 4x4 A tile
   // ready for compute.  The pipeline is two-stage:
@@ -292,7 +292,7 @@ class TensorCoreDecoupled(
   val halfAQueue = Module(new Queue(
     chiselTypeOf(respQueueA.bits), entries = 1, pipe = true
   ))
-  halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U)
+  halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U)
   halfAQueue.io.enq.bits := respQueueA.bits
 
   // substep == 0 data goes to the LSB
@@ -305,9 +305,9 @@ class TensorCoreDecoupled(
     new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
   ))
   // hold first half A data for the first substep
-  halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) &&
+  halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) &&
                              fullAQueue.io.enq.ready
-  fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) &&
+  fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) &&
                              halfAQueue.io.deq.valid
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
@@ -332,8 +332,8 @@ class TensorCoreDecoupled(
   // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
   // on the substep
   respQueueA.ready := MuxCase(false.B,
-                              Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
-                                  (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
+                              Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready,
+                                  (substepDeqA === 1.U) -> fullAQueue.io.enq.ready))
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U

From 91d9897c277a1f0ab4e678bdd91f21eef7ac380d Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 17:17:41 -0700
Subject: [PATCH 17/27] tensor: Write FillBuffer for tile buffering

---
 .../radiance/core/TensorCoreDecoupled.scala   | 50 +++++++++++++++++--
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index f7c6c63..e70e59f 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -5,6 +5,7 @@ package radiance.core
 
 import chisel3._
 import chisel3.util._
+import chisel3.experimental.requireIsChiselType
 import org.chipsalliance.cde.config.Parameters
 import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.tilelink._
@@ -312,10 +313,17 @@ class TensorCoreDecoupled(
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
 
-  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid
+  val fillBufB = Module(new FillBuffer(
+    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
+  ))
+  fillBufB.io.enq.valid := respQueueB.valid
+  fillBufB.io.enq.bits := respQueueB.bits.data
+  respQueueB.ready := fillBufB.io.enq.ready
+
+  val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid
   val operandA = fullAQueue.io.deq.bits.data
   val operandATag = fullAQueue.io.deq.bits.tag
-  val operandB = respQueueB.bits.data
+  val operandB = fillBufB.io.deq.bits
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val setCompute = fullAQueue.io.deq.bits.tag.set
@@ -338,7 +346,7 @@ class TensorCoreDecoupled(
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask
-  respQueueB.ready := dpuFire && shouldDequeueB
+  fillBufB.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
   dontTouch(shouldDequeueB)
@@ -375,8 +383,11 @@ class TensorCoreDecoupled(
           operandADimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
   // operandB is 2x4 in K-major
+  // val operandBDimensional =
+  //   operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+  //   .grouped(4/*k-dim*/).toSeq
   val operandBDimensional =
-    operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq
   require(tilingParams.mc * ncSubstep == numLanes,
           "substep tile size doesn't match writeback throughput")
@@ -490,6 +501,37 @@ class TensorCoreDecoupled(
   }
 }
 
+// A buffer that collects multiple entries of input data and exposes the
+// coalesced data as output.  Effectively acts as a width-widening
+// chisel3.util.Pipe.
+class FillBuffer[T <: Data](
+  gen: T,
+  entries: Int
+) extends Module {
+  require(entries > 0, "FillBuffer must have a positive number of entries")
+  requireIsChiselType(gen)
+
+  val io = IO(new Bundle {
+    val enq = Flipped(Decoupled(gen))
+    val deq = Decoupled(Vec(entries, gen))
+  })
+
+  val data = Reg(Vec(entries, gen))
+  val ptr = Counter(entries + 1)
+  val full = (ptr.value === entries.U)
+  io.enq.ready := !full
+  when (io.enq.fire) {
+    data(ptr.value) := io.enq.bits
+    ptr.inc()
+  }
+  io.deq.valid := full
+  (io.deq.bits zip data).foreach { case (io, d) => io := d }
+  when (io.deq.fire) {
+    assert(ptr.value === entries.U, "FillBuffer fired before buffer was full")
+    ptr.reset()
+  }
+}
+
 // synthesizable unit tests
 
 // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy

From 7fab6f89ad3e99de20e4aa5972be745c720b1e70 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 17:33:55 -0700
Subject: [PATCH 18/27] tensor: Properly route FillBuffer to DPU

---
 .../radiance/core/TensorCoreDecoupled.scala   | 40 ++++++++++++-------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index e70e59f..206250e 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -313,17 +313,24 @@ class TensorCoreDecoupled(
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
 
-  val fillBufB = Module(new FillBuffer(
+  // serialize every two B responses into one full 4x4 B tile
+  // FIXME: do the same for A
+  val fullB = Module(new FillBuffer(
     chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
   ))
-  fillBufB.io.enq.valid := respQueueB.valid
-  fillBufB.io.enq.bits := respQueueB.bits.data
-  respQueueB.ready := fillBufB.io.enq.ready
+  fullB.io.enq.valid := respQueueB.valid
+  fullB.io.enq.bits := respQueueB.bits.data
+  respQueueB.ready := fullB.io.enq.ready
+  val fullBTag = Module(new Queue(
+    new TensorMemTag, entries = 1, pipe = true
+  ))
+  fullBTag.io.enq.valid := respQueueB.valid
+  fullBTag.io.enq.bits := respQueueB.bits.tag
 
-  val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid
+  val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid
   val operandA = fullAQueue.io.deq.bits.data
   val operandATag = fullAQueue.io.deq.bits.tag
-  val operandB = fillBufB.io.deq.bits
+  val operandB = fullB.io.deq.bits
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val setCompute = fullAQueue.io.deq.bits.tag.set
@@ -333,10 +340,6 @@ class TensorCoreDecoupled(
     substepCompute := substepCompute + 1.U
   }
 
-  // hold full A until two-cycle compute is done
-  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
-  val nextStepExecute = dpuFire && (substepCompute === 1.U)
-
   // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
   // on the substep
   respQueueA.ready := MuxCase(false.B,
@@ -345,12 +348,19 @@ class TensorCoreDecoupled(
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
-  val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask
-  fillBufB.io.deq.ready := dpuFire && shouldDequeueB
+  val shouldDequeueB =
+    ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) &&
+    (substepCompute === 1.U)
+  fullB.io.deq.ready := dpuFire && shouldDequeueB
+  fullBTag.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
   dontTouch(shouldDequeueB)
 
+  // hold full A until two-cycle compute is done
+  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  val nextStepExecute = dpuFire && (substepCompute === 1.U)
+
   // Assert that the DPU is computing with operands of the same set/step. Note
   // that the B resp will only have step values multiple of 4 due to reuse.
   //
@@ -359,9 +369,9 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) &&
+      assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
              ((fullAQueue.io.deq.bits.tag.step & stepMask) ===
-              (respQueueB.bits.tag.step & stepMask)),
+              (fullBTag.io.deq.bits.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
@@ -387,7 +397,7 @@ class TensorCoreDecoupled(
   //   operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
   //   .grouped(4/*k-dim*/).toSeq
   val operandBDimensional =
-    operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq
   require(tilingParams.mc * ncSubstep == numLanes,
           "substep tile size doesn't match writeback throughput")

From c4b5a11fdefbbfbe73b765bb1feece25d2a1d3f1 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 19:54:20 -0700
Subject: [PATCH 19/27] tensor: Replace staging logic for A with FillBuffer

---
 .../radiance/core/TensorCoreDecoupled.scala   | 77 ++++++++-----------
 1 file changed, 34 insertions(+), 43 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 206250e..deb4dc1 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -272,46 +272,41 @@ class TensorCoreDecoupled(
           io.writeback.bits.data.widthOption.get,
           "response data width does not match the writeback data width")
 
+  // FIXME: unnecessary
   val substepDeqA = RegInit(0.U(1.W))
   when (respQueueA.fire) {
     substepDeqA := substepDeqA + 1.U
   }
   dontTouch(substepDeqA)
 
-  // Do pipelining for the A operand so that we obtain the full 4x4 A tile
-  // ready for compute.  The pipeline is two-stage:
-  //   - stage one (halfAQueue) for assembling the full A tile from half-tiles
-  //     coming from the resp queue, and
-  //   - stage two (fullAQueue) for holding the full A tile until it gets
-  //     matched with two 4x2 B tiles, and compute is complete.
-  //
-  // Note that the half-tile assembly is unnecessary for B since the B tile is
-  // only 4x2.
-  // Also send the set/step tag along the pipe for alignment check.
+  // Stage the operands in a pipeline so that we obtain the full 4x4 tiles
+  // ready for compute.  Also send the set/step tag along the pipe for
+  // alignment check.
 
-  // note combinationally coupled ready with `pipe`
-  val halfAQueue = Module(new Queue(
-    chiselTypeOf(respQueueA.bits), entries = 1, pipe = true
+  val fullA = Module(new FillBuffer(
+    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
   ))
-  halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U)
-  halfAQueue.io.enq.bits := respQueueA.bits
+  fullA.io.enq.valid := respQueueA.valid
+  fullA.io.enq.bits := respQueueA.bits.data
+  respQueueA.ready := fullA.io.enq.ready
+  // `pipe` combinationally couples enq-deq ready
+  val fullATag = Module(new Queue(
+    new TensorMemTag, entries = 1, pipe = true
+  ))
+  fullATag.io.enq.valid := respQueueA.valid
+  fullATag.io.enq.bits := respQueueA.bits.tag
 
-  // substep == 0 data goes to the LSB
-  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data)
-  require(fullAEnqData.widthOption.get == dataWidth * 2,
-          "assumes 2-cycle read for a full compute tile of A")
-  // only use the lower halfA's tag.  substep will be incorrect.
-  val fullAEnqTag = halfAQueue.io.deq.bits.tag
-  val fullAQueue = Module(new Queue(
+  // stage the full A tile once more so that FillBuffer can be filled up in the
+  // background while the tile is being used for compute.  This does come with
+  // capacity overhead.
+  val fullABuf = Module(new Queue(
     new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
   ))
-  // hold first half A data for the first substep
-  halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) &&
-                             fullAQueue.io.enq.ready
-  fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) &&
-                             halfAQueue.io.deq.valid
-  fullAQueue.io.enq.bits.data := fullAEnqData
-  fullAQueue.io.enq.bits.tag := fullAEnqTag
+  fullABuf.io.enq.valid := fullA.io.deq.valid
+  fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt
+  fullABuf.io.enq.bits.tag := fullATag.io.deq.bits
+  fullA.io.deq.ready := fullABuf.io.enq.ready
+  fullATag.io.deq.ready := fullABuf.io.enq.ready
 
   // serialize every two B responses into one full 4x4 B tile
   // FIXME: do the same for A
@@ -327,29 +322,24 @@ class TensorCoreDecoupled(
   fullBTag.io.enq.valid := respQueueB.valid
   fullBTag.io.enq.bits := respQueueB.bits.tag
 
-  val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid
-  val operandA = fullAQueue.io.deq.bits.data
-  val operandATag = fullAQueue.io.deq.bits.tag
+  val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid
+  val operandA = fullABuf.io.deq.bits.data
+  val operandATag = fullABuf.io.deq.bits.tag
   val operandB = fullB.io.deq.bits
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
-  val setCompute = fullAQueue.io.deq.bits.tag.set
-  val stepCompute = fullAQueue.io.deq.bits.tag.step
+  val setCompute = fullABuf.io.deq.bits.tag.set
+  val stepCompute = fullABuf.io.deq.bits.tag.step
   val substepCompute = RegInit(0.U(1.W))
   when (dpuFire) {
     substepCompute := substepCompute + 1.U
   }
 
-  // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
-  // on the substep
-  respQueueA.ready := MuxCase(false.B,
-                              Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready,
-                                  (substepDeqA === 1.U) -> fullAQueue.io.enq.ready))
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB =
-    ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) &&
+    ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
     (substepCompute === 1.U)
   fullB.io.deq.ready := dpuFire && shouldDequeueB
   fullBTag.io.deq.ready := dpuFire && shouldDequeueB
@@ -358,7 +348,8 @@ class TensorCoreDecoupled(
   dontTouch(shouldDequeueB)
 
   // hold full A until two-cycle compute is done
-  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  // FIXME: this should be nextStepCompute
   val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
   // Assert that the DPU is computing with operands of the same set/step. Note
@@ -369,8 +360,8 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
-             ((fullAQueue.io.deq.bits.tag.step & stepMask) ===
+      assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
+             ((fullABuf.io.deq.bits.tag.step & stepMask) ===
               (fullBTag.io.deq.bits.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")

From 93c9bcc32f5b516f3bd51990ff60e22e0348f409 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 20:12:15 -0700
Subject: [PATCH 20/27] tensor: Stage B as well for full throughput

---
 .../radiance/core/TensorCoreDecoupled.scala   | 41 ++++++++++++-------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index deb4dc1..90cb785 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -300,10 +300,13 @@ class TensorCoreDecoupled(
   // background while the tile is being used for compute.  This does come with
   // capacity overhead.
   val fullABuf = Module(new Queue(
-    new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
+    new Bundle {
+      val data = chiselTypeOf(fullA.io.deq.bits)
+      val tag = new TensorMemTag
+    }, entries = 1, pipe = true
   ))
   fullABuf.io.enq.valid := fullA.io.deq.valid
-  fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt
+  fullABuf.io.enq.bits.data := fullA.io.deq.bits
   fullABuf.io.enq.bits.tag := fullATag.io.deq.bits
   fullA.io.deq.ready := fullABuf.io.enq.ready
   fullATag.io.deq.ready := fullABuf.io.enq.ready
@@ -322,10 +325,22 @@ class TensorCoreDecoupled(
   fullBTag.io.enq.valid := respQueueB.valid
   fullBTag.io.enq.bits := respQueueB.bits.tag
 
-  val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid
+  val fullBBuf = Module(new Queue(
+    new Bundle {
+      val data = chiselTypeOf(fullB.io.deq.bits)
+      val tag = new TensorMemTag
+    }, entries = 1, pipe = true
+  ))
+  fullBBuf.io.enq.valid := fullB.io.deq.valid
+  fullBBuf.io.enq.bits.data := fullB.io.deq.bits
+  fullBBuf.io.enq.bits.tag := fullBTag.io.deq.bits
+  fullB.io.deq.ready := fullBBuf.io.enq.ready
+  fullBTag.io.deq.ready := fullBBuf.io.enq.ready
+
+  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
   val operandA = fullABuf.io.deq.bits.data
   val operandATag = fullABuf.io.deq.bits.tag
-  val operandB = fullB.io.deq.bits
+  val operandB = fullBBuf.io.deq.bits.data
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val setCompute = fullABuf.io.deq.bits.tag.set
@@ -335,20 +350,19 @@ class TensorCoreDecoupled(
     substepCompute := substepCompute + 1.U
   }
 
+  // hold full A until two-cycle compute is done
+  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB =
     ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
     (substepCompute === 1.U)
-  fullB.io.deq.ready := dpuFire && shouldDequeueB
-  fullBTag.io.deq.ready := dpuFire && shouldDequeueB
+  fullBBuf.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
   dontTouch(shouldDequeueB)
 
-  // hold full A until two-cycle compute is done
-  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
   // FIXME: this should be nextStepCompute
   val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
@@ -360,9 +374,9 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
+      assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) &&
              ((fullABuf.io.deq.bits.tag.step & stepMask) ===
-              (fullBTag.io.deq.bits.step & stepMask)),
+              (fullBBuf.io.deq.bits.tag.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
@@ -378,15 +392,12 @@ class TensorCoreDecoupled(
   ))
   // operandA is 4x4 in K-major
   val operandADimensional =
-    operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq
   require(operandADimensional.length == tilingParams.mc &&
           operandADimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
-  // operandB is 2x4 in K-major
-  // val operandBDimensional =
-  //   operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-  //   .grouped(4/*k-dim*/).toSeq
+  // select 2x4 subtile out of operandB that is 4x4 in K-major
   val operandBDimensional =
     operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq

From c0292dd0aa97a8cec3d034b20e2a167f78e54af8 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 21:51:34 -0700
Subject: [PATCH 21/27] tensor: Enlarge operand buffer for A for better SMEM
 reuse

---
 .../radiance/core/TensorCoreDecoupled.scala   | 159 +++++++++++-------
 1 file changed, 100 insertions(+), 59 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 90cb785..fa3f6e9 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -146,18 +146,6 @@ class TensorCoreDecoupled(
   // Memory traffic generation
   // -------------------------
   //
-  class TensorMemTag extends Bundle {
-    val set = UInt(setBits.W)
-    val step = UInt(stepBits.W)
-    val substep = UInt(1.W)
-  }
-  // use concatenation of set/step as the memory request source.  This will get
-  // translated to the actual TL sourcewidth in sourceGen.
-  val tag = Wire(new TensorMemTag)
-  tag.set := setAccess
-  tag.step := stepAccess
-  tag.substep := substepAccess
-
   val numTilesM = tilingParams.m / tilingParams.mc
   val numTilesN = tilingParams.n / tilingParams.nc
   // @cleanup: generalize in terms of M/N/K-majorness?
@@ -198,12 +186,41 @@ class TensorCoreDecoupled(
   val (addressA, addressB) =
     addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
 
+  // 'index' is the index of a memory request among the sequence of requests
+  // needed to read a full M-column of A or N-row of B.  Its range is [0,m/2)
+  // or [0,n/2), where 2 is the stride that can be read in a single request.
+  require(tilingParams.m == tilingParams.n,
+          "currently only supports square SMEM tile")
+  val numIndices = tilingParams.m / 2
+  val indexBits = log2Ceil(numIndices)
+  val lastIndex = (1 << indexBits) - 1
+
+  class TensorMemTag extends Bundle {
+    val set = UInt(setBits.W)
+    val index = UInt(indexBits.W)
+  }
+
+  val tagInit = Wire(new TensorMemTag)
+  tagInit.set := 0.U
+  tagInit.index := 0.U
+  val tagA = RegInit(tagInit)
+  val tagB = RegInit(tagInit)
+
+  when (io.reqA.fire) {
+    when (tagA.index === lastIndex.U) {
+      tagA.set := tagA.set + 1.U
+    }
+    tagA.index := tagA.index + 1.U
+  }
+  when (io.reqB.fire) {
+    when (tagB.index === lastIndex.U) {
+      tagB.set := tagB.set + 1.U
+    }
+    tagB.index := tagB.index + 1.U
+  }
+
   val genReqA = (state === TensorState.run)
-  val numTilesMBits = log2Ceil(numTilesM)
-  // generate B request at every 4 steps.  B achieves reuse through outer
-  // product so it doesn't require access at every step
-  val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U
-  val genReqB = (state === TensorState.run) && shouldFireB
+  val genReqB = (state === TensorState.run)
 
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
@@ -212,11 +229,11 @@ class TensorCoreDecoupled(
     case ((req, (resp, respTagged)), i) => {
       val sourceGen = Module(new SourceGenerator(
         log2Ceil(numSourceIds),
-        metadata = Some(tag)
+        metadata = Some(new TensorMemTag)
       ))
 
       sourceGen.io.gen := req.fire
-      sourceGen.io.meta := tag
+      sourceGen.io.meta := (if (i == 0) tagA else tagB)
       req.valid := (if (i == 0) genReqA else genReqB)
       req.bits.address := (if (i == 0) addressA else addressB)
       req.bits.source := sourceGen.io.id.bits
@@ -243,7 +260,7 @@ class TensorCoreDecoupled(
   val firedBNow = io.reqB.fire
   val firedA = firedAReg || firedANow
   val firedB = firedBReg || firedBNow
-  val nextSubstepAccess = firedA && (!shouldFireB || firedB)
+  val nextSubstepAccess = firedA && firedB
   val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
   // clear out firedABReg every substep
   when (nextSubstepAccess) {
@@ -252,17 +269,12 @@ class TensorCoreDecoupled(
     substepAccess := substepAccess + 1.U
   }
   require(substepAccess.widthOption.get == 1, "there should be only two substeps")
-  dontTouch(shouldFireB)
 
   // Execute stage
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
   // set and step being currently executed in the acc/ex backend
-  val setExecute = RegInit(0.U(setBits.W))
-  val stepExecute = RegInit(0.U(stepBits.W))
-  dontTouch(setExecute)
-  dontTouch(stepExecute)
 
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
@@ -283,8 +295,10 @@ class TensorCoreDecoupled(
   // ready for compute.  Also send the set/step tag along the pipe for
   // alignment check.
 
+  // @cleanup: dedup A and B below
+
   val fullA = Module(new FillBuffer(
-    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
+    chiselTypeOf(respQueueB.bits.data), numIndices
   ))
   fullA.io.enq.valid := respQueueA.valid
   fullA.io.enq.bits := respQueueA.bits.data
@@ -337,23 +351,48 @@ class TensorCoreDecoupled(
   fullB.io.deq.ready := fullBBuf.io.enq.ready
   fullBTag.io.deq.ready := fullBBuf.io.enq.ready
 
-  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
-  val operandA = fullABuf.io.deq.bits.data
-  val operandATag = fullABuf.io.deq.bits.tag
-  val operandB = fullBBuf.io.deq.bits.data
   val dpuReady = Wire(Bool())
+  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
   val dpuFire = operandsValid && dpuReady
-  val setCompute = fullABuf.io.deq.bits.tag.set
-  val stepCompute = fullABuf.io.deq.bits.tag.step
+
+  val setCompute = RegInit(0.U(setBits.W))
+  val stepCompute = RegInit(0.U(stepBits.W))
   val substepCompute = RegInit(0.U(1.W))
+  val nextStepCompute = dpuFire && (substepCompute === 1.U)
+  dontTouch(setCompute)
+  dontTouch(stepCompute)
+  dontTouch(substepCompute)
   when (dpuFire) {
     substepCompute := substepCompute + 1.U
   }
 
-  // hold full A until two-cycle compute is done
-  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
-  // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
-  // we fully iterated a column (M-dimension).
+  // Operand selection
+  //
+  // select the correct 4x4 tile from A operand buffer
+  val numTilesMBits = log2Ceil(numTilesM)
+  def selectOperandA(buf: Vec[UInt]): UInt = {
+    require(buf.length == numIndices)
+    val stepM = stepCompute & ((1 << numTilesMBits) - 1).U
+    Cat(buf((stepM << 1) + 1.U), buf(stepM << 1))
+  }
+  val operandA = selectOperandA(fullABuf.io.deq.bits.data)
+  val operandATag = fullABuf.io.deq.bits.tag
+  // select the correct 2x4 tile from B operand buffer
+  val operandB = fullBBuf.io.deq.bits.data(substepCompute)
+  val operandBTag = fullBBuf.io.deq.bits.tag
+  dontTouch(operandATag)
+  dontTouch(operandBTag)
+
+  // Operand buffer dequeue logic
+  //
+  // hold A data until the entire set is done
+  val shouldDequeueAMask = ((1 << stepBits) - 1).U
+  val shouldDequeueA =
+    ((stepCompute & shouldDequeueAMask) === shouldDequeueAMask) &&
+    (substepCompute === 1.U)
+  fullABuf.io.deq.ready := dpuFire && shouldDequeueA
+  // hold B tile at respQueueB for multiple steps for reuse, only dequeue when
+  // we fully iterated a column (M-dimension)
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB =
     ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
@@ -361,11 +400,9 @@ class TensorCoreDecoupled(
   fullBBuf.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
+  dontTouch(shouldDequeueA)
   dontTouch(shouldDequeueB)
 
-  // FIXME: this should be nextStepCompute
-  val nextStepExecute = dpuFire && (substepCompute === 1.U)
-
   // Assert that the DPU is computing with operands of the same set/step. Note
   // that the B resp will only have step values multiple of 4 due to reuse.
   //
@@ -374,11 +411,9 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) &&
-             ((fullABuf.io.deq.bits.tag.step & stepMask) ===
-              (fullBBuf.io.deq.bits.tag.step & stepMask)),
-        "A and B operands are pointing to different set/steps. " ++
-        "This might indicate memory response coming back out-of-order.")
+      assert(fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set,
+             "A and B operands are pointing to different sets. " ++
+             "This might indicate memory response coming back out-of-order.")
     }
   }
   assertAligned
@@ -386,23 +421,24 @@ class TensorCoreDecoupled(
   // Dot-product unit
   //
   // 4x2 four-element DPUs summing up to 32 MACs in total
+  //
   val ncSubstep = tilingParams.nc / 2
+  require(tilingParams.mc * ncSubstep == numLanes,
+          "substep tile size doesn't match writeback throughput")
   val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)(
     Module(new TensorDotProductUnit(half = false))
   ))
-  // operandA is 4x4 in K-major
-  val operandADimensional =
-    operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4/*k-dim*/).toSeq
+
+  // reshape operands for easier routing to DPU
+  def reshapeByFourWords(x: UInt): Seq[Seq[UInt]] = {
+    x.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+     .grouped(4/*k-dim*/).toSeq
+  }
+  val operandADimensional = reshapeByFourWords(operandA)
   require(operandADimensional.length == tilingParams.mc &&
           operandADimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
-  // select 2x4 subtile out of operandB that is 4x4 in K-major
-  val operandBDimensional =
-    operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4/*k-dim*/).toSeq
-  require(tilingParams.mc * ncSubstep == numLanes,
-          "substep tile size doesn't match writeback throughput")
+  val operandBDimensional = reshapeByFourWords(operandB)
   require(operandBDimensional.length == ncSubstep &&
           operandBDimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
@@ -444,12 +480,17 @@ class TensorCoreDecoupled(
   // ----------------
   // These queues hold metadata needed for writeback in sync with the DPU.
 
+  class TensorComputeTag extends Bundle {
+    val set = UInt(setBits.W)
+    val step = UInt(stepBits.W)
+    val substep = UInt(1.W)
+  }
+
   val queueDepth = 5 // needs to be at least the DPU latency
-  val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth))
+  val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth))
   tagQueue.io.enq.valid := dpuFire
-  // A and B should have the same tags
-  tagQueue.io.enq.bits := operandATag
-  // @cleanup: awkward
+  tagQueue.io.enq.bits.set := setCompute
+  tagQueue.io.enq.bits.step := stepCompute
   tagQueue.io.enq.bits.substep := substepCompute
   tagQueue.io.deq.ready := io.writeback.fire
   assert(tagQueue.io.enq.ready === true.B,
@@ -490,7 +531,7 @@ class TensorCoreDecoupled(
     }
   }
   sequenceSetStep(setAccess, stepAccess, nextStepAccess)
-  sequenceSetStep(setExecute, stepExecute, nextStepExecute)
+  sequenceSetStep(setCompute, stepCompute, nextStepCompute)
 
   switch(state) {
     is(TensorState.idle) {

From 0aadc6074ad32fccb13118f8e0915d8b76f2a267 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 22:42:41 -0700
Subject: [PATCH 22/27] tensor: Decouple A and B access states

Get rid of set/stepAccess states and let A and B access progress
independently.
---
 .../radiance/core/TensorCoreDecoupled.scala   | 200 ++++++++----------
 1 file changed, 88 insertions(+), 112 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index fa3f6e9..ed241b5 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -82,15 +82,6 @@ class TensorCoreDecoupled(
   // This drives the overall pipeline of memory requests, dot-product unit
   // operations and regfile writeback.
 
-  object TensorState extends ChiselEnum {
-    val idle = Value(0.U)
-    val run = Value(1.U)
-    // All set/step sequencing is complete and the tensor core is holding the
-    // result data until downstream writeback is ready.
-    // FIXME: is this necessary if writeback is decoupled with queues?
-    val finish = Value(2.U)
-  }
-  val state = RegInit(TensorState.idle)
   val busy = RegInit(false.B)
   // Holds the warp id the core is currently working on.  Note that we only
   // support one outstanding warp request
@@ -107,22 +98,10 @@ class TensorCoreDecoupled(
   def setDone(set: UInt) = (set === lastSet.U)
   def stepDone(step: UInt) = (step === lastStep.U)
 
-  // set and step being currently accessed in the acc/ex frontend
-  val setAccess = RegInit(0.U(setBits.W))
-  val stepAccess = RegInit(0.U(stepBits.W))
-  // we need full 4x4 A tile to fire DPU, but since the memory width is 8
-  // words, we need 2 cycles to read A.  `substep` tells which cycle we're at.
-  val substepAccess = RegInit(0.U(1.W))
-  dontTouch(setAccess)
-  dontTouch(stepAccess)
-  dontTouch(substepAccess)
-
-  when(io.initiate.fire) {
+  when (io.initiate.fire) {
     val wid = io.initiate.bits.wid
     busy := true.B
     warpReg := wid
-    setAccess := 0.U
-    stepAccess := 0.U
     when(io.writeback.fire) {
       assert(
         io.writeback.bits.wid =/= wid,
@@ -143,55 +122,51 @@ class TensorCoreDecoupled(
   // serialize every HGMMA request
   io.initiate.ready := !busy
 
-  // Memory traffic generation
-  // -------------------------
+  // ===========================================================================
+  // Access stage
+  // ===========================================================================
   //
-  val numTilesM = tilingParams.m / tilingParams.mc
-  val numTilesN = tilingParams.n / tilingParams.nc
-  // @cleanup: generalize in terms of M/N/K-majorness?
-  def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt)
-      : (UInt/*A*/, UInt/*B*/) = {
-    // note that step iterates along N first, then M
-    val tileM = step % numTilesM.U
-    val tileN = step / numTilesM.U
+  // Frontend of the decoupled access/execute pipeline.
 
-    // note that both A and B are K-major to facilitate bank conflict-free SMEM
-    // accesses
-    //
-    // (row,col) coordinate of the compute tile
-    val tileRowA = tileM // M
-    val tileColA = set   // K
-    val tileRowB = tileN // N
-    val tileColB = set   // K
-    // (row,col) coordinate of the starting element of the compute tile
-    val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) +
-                    (substep << log2Ceil(tilingParams.mc / 2))
-    val elemColA =  tileColA << log2Ceil(tilingParams.kc)
-    val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) +
-                    (substep << log2Ceil(tilingParams.nc / 2))
-    val elemColB =  tileColB << log2Ceil(tilingParams.kc)
-    val rowStrideA = wordSize * tilingParams.k
-    val rowStrideABits = log2Ceil(rowStrideA)
-    val rowStrideB = wordSize * tilingParams.k
-    val rowStrideBBits = log2Ceil(rowStrideB)
-    val wordStrideBits = log2Ceil(wordSize)
-
-    val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits)
-    val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits)
-
-    (baseA + tileOffsetA, baseB + tileOffsetB)
+  // States
+  //
+  object AccessorState extends ChiselEnum {
+    val idle = Value(0.U)
+    val access = Value(1.U)
+    // All set/step sequencing is complete and the tensor core is holding the
+    // result data until downstream writeback is ready.
+    // FIXME: is this necessary if writeback is decoupled with queues?
+    val finish = Value(2.U)
   }
+  val state = RegInit(AccessorState.idle)
+  val allReqsDone = WireInit(false.B)
+  dontTouch(allReqsDone)
 
-  // FIXME: bogus base address
-  val (addressA, addressB) =
-    addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
+  switch(state) {
+    is(AccessorState.idle) {
+      when(io.initiate.fire) {
+        state := AccessorState.access
+      }
+    }
+    is(AccessorState.access) {
+      when (allReqsDone) {
+        state := AccessorState.finish
+      }
+    }
+    is(AccessorState.finish) {
+      // FIXME: decouple writeback
+      when(io.writeback.fire) {
+        state := AccessorState.idle
+      }
+    }
+  }
 
   // 'index' is the index of a memory request among the sequence of requests
   // needed to read a full M-column of A or N-row of B.  Its range is [0,m/2)
   // or [0,n/2), where 2 is the stride can be read in a single request size.
   require(tilingParams.m == tilingParams.n,
           "currently only supports square SMEM tile")
-  val numIndices = tilingParams.m / 2
+  val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/
   val indexBits = log2Ceil(numIndices)
   val lastIndex = (1 << indexBits) - 1
 
@@ -219,9 +194,51 @@ class TensorCoreDecoupled(
     tagB.index := tagB.index + 1.U
   }
 
-  val genReqA = (state === TensorState.run)
-  val genReqB = (state === TensorState.run)
+  // Address generation
+  //
+  def addressGen(base: UInt, set: UInt, index: UInt): UInt = {
+    // note that both A and B are K-major to facilitate bank conflict-free SMEM
+    // accesses, so the code below applies to both.
+    //
+    // (row,col) coordinate of the compute tile
+    val tileRow = index
+    val tileCol = set
+    // (row,col) coordinate of the starting element of the compute tile
+    val elemRow = index << 1
+    val elemCol =  tileCol << log2Ceil(tilingParams.kc)
+    val rowStride = tilingParams.k * wordSize
+    val rowStrideBits = log2Ceil(rowStride)
+    val wordStrideBits = log2Ceil(wordSize)
+    val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits)
 
+    base + tileOffset
+  }
+
+  // FIXME: bogus base address
+  val addressA = addressGen(0.U, tagA.set, tagA.index)
+  val addressB = addressGen(0.U, tagB.set, tagB.index)
+
+  val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U)
+  val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U)
+  val doneReqA = RegInit(false.B)
+  val doneReqB = RegInit(false.B)
+  when (lastReqA && io.reqA.fire) { doneReqA := true.B }
+  when (lastReqB && io.reqB.fire) { doneReqB := true.B }
+  val genReqA = (state === AccessorState.access) && !doneReqA
+  val genReqB = (state === AccessorState.access) && !doneReqA
+  when (state === AccessorState.finish) {
+    doneReqA := false.B
+    doneReqB := false.B
+    tagA.set := 0.U
+    tagA.index := 0.U
+    tagB.set := 0.U
+    tagB.index := 0.U
+  }
+
+  allReqsDone := doneReqA && doneReqB
+
+  // Request generation
+  //
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
@@ -249,34 +266,13 @@ class TensorCoreDecoupled(
     }
   }
 
-  // only advance to the next step if we fired mem requests for both A and B.
-  // also consider that B doesn't have to be fired every time due to reuse.
-  // @perf: too strict? should be able to have A and B progress separately
-  val firedAReg = RegInit(false.B)
-  val firedBReg = RegInit(false.B)
-  when (io.reqA.fire) { firedAReg := true.B }
-  when (io.reqB.fire) { firedBReg := true.B }
-  val firedANow = io.reqA.fire
-  val firedBNow = io.reqB.fire
-  val firedA = firedAReg || firedANow
-  val firedB = firedBReg || firedBNow
-  val nextSubstepAccess = firedA && firedB
-  val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
-  // clear out firedABReg every substep
-  when (nextSubstepAccess) {
-    firedAReg := false.B
-    firedBReg := false.B
-    substepAccess := substepAccess + 1.U
-  }
-  require(substepAccess.widthOption.get == 1, "there should be only two substeps")
-
+  // ===========================================================================
   // Execute stage
-  // -------------
+  // ===========================================================================
+  //
   // Backend of the decoupled access/execute pipeline.
   //
-  // set and step being currently executed in the acc/ex backend
-
-  val respQueueDepth = 4 // FIXME: parameterize
+  val respQueueDepth = 8 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
 
@@ -369,6 +365,7 @@ class TensorCoreDecoupled(
   // Operand selection
   //
   // select the correct 4x4 tile from A operand buffer
+  val numTilesM = tilingParams.m / tilingParams.mc
   val numTilesMBits = log2Ceil(numTilesM)
   def selectOperandA(buf: Vec[UInt]): UInt = {
     require(buf.length == numIndices)
@@ -383,7 +380,7 @@ class TensorCoreDecoupled(
   dontTouch(operandATag)
   dontTouch(operandBTag)
 
-  // Operand buffer dequeue logic
+  // Operand buffer logic
   //
   // hold A data until the entire set is done
   val shouldDequeueAMask = ((1 << stepBits) - 1).U
@@ -476,8 +473,8 @@ class TensorCoreDecoupled(
   }
   io.writeback.bits.data := flattenedDPUOut
 
-  // Writeback queues
-  // ----------------
+  // Writeback logic
+  //
   // These queues hold metadata needed for writeback in sync with the DPU.
 
   class TensorComputeTag extends Bundle {
@@ -530,28 +527,7 @@ class TensorCoreDecoupled(
       }
     }
   }
-  sequenceSetStep(setAccess, stepAccess, nextStepAccess)
   sequenceSetStep(setCompute, stepCompute, nextStepCompute)
-
-  switch(state) {
-    is(TensorState.idle) {
-      when(io.initiate.fire) {
-        state := TensorState.run
-      }
-    }
-    is(TensorState.run) {
-      when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) {
-        when (state === TensorState.run) {
-          state := TensorState.finish
-        }
-      }
-    }
-    is(TensorState.finish) {
-      when(io.writeback.fire) {
-        state := TensorState.idle
-      }
-    }
-  }
 }
 
 // A buffer that collects multiple entries of input data and exposes the

From e946403d7863abaec61b4baa92da467617d3fe66 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 22:54:48 -0700
Subject: [PATCH 23/27] tensor: Fix typo, reduce resp queue depth

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index ed241b5..b899ce9 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -225,7 +225,7 @@ class TensorCoreDecoupled(
   when (lastReqA && io.reqA.fire) { doneReqA := true.B }
   when (lastReqB && io.reqB.fire) { doneReqB := true.B }
   val genReqA = (state === AccessorState.access) && !doneReqA
-  val genReqB = (state === AccessorState.access) && !doneReqA
+  val genReqB = (state === AccessorState.access) && !doneReqB
   when (state === AccessorState.finish) {
     doneReqA := false.B
     doneReqB := false.B
@@ -272,7 +272,7 @@ class TensorCoreDecoupled(
   //
   // Backend of the decoupled access/execute pipeline.
   //
-  val respQueueDepth = 8 // FIXME: parameterize
+  val respQueueDepth = 2 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
 

From b3c328b1be7bf924fdddc285fcf5181d8f55c6cf Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 23:11:19 -0700
Subject: [PATCH 24/27] tensor: Assert minimum response queue depth with doc

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index b899ce9..cd3bfa4 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -272,7 +272,13 @@ class TensorCoreDecoupled(
   //
   // Backend of the decoupled access/execute pipeline.
   //
-  val respQueueDepth = 2 // FIXME: parameterize
+  val respQueueDepth = 4 // FIXME: parameterize
+  require(respQueueDepth >= 4,
+    "respQueueDepth must be at least 4.  This is because the B operand buffer " ++
+    "is shallower than A's, so the B response queue has to be deep enough to " ++
+    "hold younger requests until A operand buffer becomes valid and the first DPU " ++
+    "fire can happen.  FIXME: make operand buffer report per-subtile valid so " ++
+    "the first compute can happen earlier.")
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
 
@@ -547,6 +553,7 @@ class FillBuffer[T <: Data](
 
   val data = Reg(Vec(entries, gen))
   val ptr = Counter(entries + 1)
+  dontTouch(ptr.value)
   val full = (ptr.value === entries.U)
   io.enq.ready := !full
   when (io.enq.fire) {

From a98cb32343810994737e60446c7b0c5d975a6f37 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 21 Oct 2024 21:56:36 -0700
Subject: [PATCH 25/27] tensor: Inject stalls to A ram for fuzzing

---
 .../radiance/core/TensorCoreDecoupled.scala   | 26 +++++++++++++++++--
 .../scala/radiance/memory/Coalescing.scala    |  3 ++-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index cd3bfa4..c53ab81 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -216,7 +216,7 @@ class TensorCoreDecoupled(
 
   // FIXME: bogus base address
   val addressA = addressGen(0.U, tagA.set, tagA.index)
-  val addressB = addressGen(0.U, tagB.set, tagB.index)
+  val addressB = addressGen(0x400.U, tagB.set, tagB.index)
 
   val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U)
   val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U)
@@ -672,14 +672,36 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
     beatBytes = 32 // @cleanup: hardcoded
   ))
 
+  val stutter = new TLIdentityNode
   xbar.node :=* tensor.node
-  ramA.node := xbar.node
+  ramA.node := stutter := xbar.node
   ramB.node := xbar.node
 
+  val fuzz = true
+
   lazy val module = new Impl
   class Impl extends LazyModuleImp(this) with UnitTestModule {
     tensor.module.io.start := io.start
     io.finished := tensor.module.io.finished
+
+    val (tlIn, _) = stutter.in(0)
+    val (tlOut, _) = stutter.out(0)
+    require(stutter.in.length == 1)
+    require(stutter.out.length == 1)
+
+    // inject stalls for fuzzing
+    val incr = Wire(Bool())
+    val (count, _) = Counter(incr, 0x1000)
+    def cond(x: UInt) = (x & ((1 << 3) - 1).U) =/= 0.U
+    val stall = if (fuzz) cond(count) else false.B
+
+    tlOut.a <> tlIn.a
+    tlIn.d <> tlOut.d
+    incr := tlIn.a.fire || stall
+    when (stall) {
+      tlIn.a.ready := false.B
+      tlOut.a.valid := false.B
+    }
   }
 }
 
diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala
index cac5e95..a21daee 100644
--- a/src/main/scala/radiance/memory/Coalescing.scala
+++ b/src/main/scala/radiance/memory/Coalescing.scala
@@ -372,7 +372,8 @@ class SourceGenerator[T <: Data](
       outstanding := outstanding + 1.U
     }
   }.elsewhen(io.reclaim.valid) {
-    assert(outstanding > 0.U)
+    assert(outstanding > 0.U,
+           "Over-reclaim. Did some responses get dropped?")
     outstanding := outstanding - 1.U
   }
   dontTouch(outstanding)

From 408888ae8f0f05364412ddba8246d9adf7502f87 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 21 Oct 2024 22:38:29 -0700
Subject: [PATCH 26/27] tensor: addPath()s for hopper generated chisel

FIXME: SourceGenerator has a name-clash.
---
 src/main/scala/radiance/tile/VortexCore.scala | 30 ++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala
index d6561e3..26c6989 100644
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -128,7 +128,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
         "NUM_THREADS" -> tile.numLsuLanes
       )
     )
-    with HasBlackBoxResource {
+    with HasBlackBoxResource with HasBlackBoxPath {
   // addResource("/vsrc/vortex/hw/unit_tests/generic_queue/testbench.v")
   // addResource("/vsrc/vortex/hw/unit_tests/VX_divide_tb.v")
   // addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0_rtl.v")
@@ -398,6 +398,34 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
   addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv")
 //  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh")
+  def addHopperTensorCore = {
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRawFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRecFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/DotProductPipe.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/metadataTable_4x5.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/MulFullRawFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/occupancyTable_4x1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorMemTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue4_TensorMemRespWithTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue5_TensorComputeTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_4x261.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_5x7.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is26_oe8_os24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is47_oe8_os24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundRawFNToRecFN_e8_s24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SimpleTimer.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SourceGenerator.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_2.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorCoreDecoupled.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorDotProductUnit.sv")
+  }
+  addHopperTensorCore
   addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv")
   addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv")

From 0fe2b3b07e5a5210cdb1cb5f92f68596b92ff6fb Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 21 Oct 2024 22:39:28 -0700
Subject: [PATCH 27/27] Bump vortex

---
 src/main/resources/vsrc/vortex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index 4dcbc31..0f06afc 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a
+Subproject commit 0f06afc3ef7350e82c008f5f25395abf89879213