From 447977bd89c04fd894135076741aefc868c73f39 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 15:02:08 -0700
Subject: [PATCH 01/47] Add Hopper tensor core RTL via addResource

---
 src/main/scala/radiance/tile/VortexCore.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala
index d202aaa..d6561e3 100644
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -396,6 +396,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
 
   // tensor core
   addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv")
+  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv")
 //  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh")
   addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv")

From 6a3aa549d34ffbafd4154081fb063be847452114 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 15:02:25 -0700
Subject: [PATCH 02/47] Add skeleton for Hopper Tensor Core

---
 .../radiance/core/TensorCoreDecoupled.scala   | 70 +++++++++++++++++++
 .../radiance/TensorCoreDecoupledTest.scala    | 23 ++++++
 2 files changed, 93 insertions(+)
 create mode 100644 src/main/scala/radiance/core/TensorCoreDecoupled.scala
 create mode 100644 src/test/scala/radiance/TensorCoreDecoupledTest.scala

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
new file mode 100644
index 0000000..10bfedb
--- /dev/null
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -0,0 +1,70 @@
+// See LICENSE.SiFive for license details.
+// See LICENSE.Berkeley for license details.
+
+package radiance.core
+
+import chisel3._
+import chisel3.util._
+
+class TensorCoreDecoupled(val numWarps: Int, val numLanes: Int) extends Module {
+  val numWarpBits = log2Ceil(numWarps)
+  val wordSize = 4 // TODO FP16
+  val dataWidth = numLanes * wordSize // TODO FP16
+
+  val io = IO(new Bundle{
+    val initiate = Flipped(Decoupled(new Bundle{
+      val wid = UInt(numWarpBits.W)
+    }))
+    val dataA = Flipped(Decoupled(new TensorMemResp(dataWidth)))
+    val dataB = Flipped(Decoupled(new TensorMemResp(dataWidth)))
+    val addressA = Decoupled(new TensorMemReq)
+    val addressB = Decoupled(new TensorMemReq)
+    val writeback = Decoupled(new Bundle{
+      val wid = UInt(numWarpBits.W)
+      val last = Bool()
+    })
+  })
+
+  // FSM
+  //
+  val state = RegInit(TensorState.idle)
+  // TODO: just transition every cycle for now
+  state := (state match {
+    case TensorState.idle => Mux(io.initiate.fire, TensorState.smemRead, state)
+    case TensorState.smemRead => TensorState.compute
+    case TensorState.compute => TensorState.writeback
+    case TensorState.writeback => {
+      // hold until writeback is cleared
+      Mux(io.writeback.ready, TensorState.idle, state)
+    }
+    case _ => TensorState.idle
+  })
+
+  // TODO
+  io.dataA.ready := true.B
+  io.dataB.ready := true.B
+  io.addressA.valid := false.B
+  io.addressB.valid := false.B
+  io.addressA.bits := DontCare
+  io.addressB.bits := DontCare
+  io.initiate.ready := true.B
+  io.writeback.valid := true.B
+  io.writeback.bits := DontCare
+}
+
+class TensorMemReq extends Bundle {
+  // TODO: tag
+  val address = UInt(32.W)
+}
+class TensorMemResp(val dataWidth: Int) extends Bundle {
+  // TODO: tag
+  val data = UInt(32.W)
+}
+
+
+object TensorState extends ChiselEnum {
+  val idle      = Value(0.U)
+  val smemRead  = Value(1.U)
+  val compute   = Value(2.U)
+  val writeback = Value(3.U)
+}
diff --git a/src/test/scala/radiance/TensorCoreDecoupledTest.scala b/src/test/scala/radiance/TensorCoreDecoupledTest.scala
new file mode 100644
index 0000000..5dd734a
--- /dev/null
+++ b/src/test/scala/radiance/TensorCoreDecoupledTest.scala
@@ -0,0 +1,23 @@
+package radiance.core
+
+import chisel3._
+import chisel3.util._
+import chiseltest._
+import org.scalatest.flatspec.AnyFlatSpec
+
+class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester {
+  behavior of "TensorCoreDecoupled"
+
+  it should "do the right thing" in {
+    test(new TensorCoreDecoupled(8, 8))
+      { c =>
+        c.io.initiate.valid.poke(true.B)
+        c.io.dataA.valid.poke(false.B)
+        c.io.dataA.bits.data.poke(0.U)
+        c.io.dataB.valid.poke(false.B)
+        c.io.dataB.bits.data.poke(0.U)
+        c.clock.step()
+        c.io.writeback.valid.expect(true.B)
+      }
+  }
+}

From 327615e330a83558af4445b79510d656e24add0a Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 17:28:51 -0700
Subject: [PATCH 03/47] Add state regs and init/writeback transition

---
 .../radiance/core/TensorCoreDecoupled.scala   | 134 +++++++++++++-----
 .../radiance/TensorCoreDecoupledTest.scala    |  13 +-
 2 files changed, 109 insertions(+), 38 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 10bfedb..87657d5 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -6,50 +6,126 @@ package radiance.core
 import chisel3._
 import chisel3.util._
 
-class TensorCoreDecoupled(val numWarps: Int, val numLanes: Int) extends Module {
+case class TensorTilingParams(
+  // Dimension of the SMEM tile
+  m: Int = 16,
+  n: Int = 16,
+  k: Int = 16,
+  // Dimension of the compute tile.  This is determined by the number of MAC
+  // units
+  mc: Int = 4,
+  nc: Int = 4,
+  kc: Int = 4
+)
+
+class TensorCoreDecoupled(
+    val numWarps: Int,
+    val numLanes: Int,
+    val tilingParams: TensorTilingParams
+) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
   val dataWidth = numLanes * wordSize // TODO FP16
 
-  val io = IO(new Bundle{
-    val initiate = Flipped(Decoupled(new Bundle{
+  val io = IO(new Bundle {
+    val initiate = Flipped(Decoupled(new Bundle {
       val wid = UInt(numWarpBits.W)
     }))
-    val dataA = Flipped(Decoupled(new TensorMemResp(dataWidth)))
-    val dataB = Flipped(Decoupled(new TensorMemResp(dataWidth)))
-    val addressA = Decoupled(new TensorMemReq)
-    val addressB = Decoupled(new TensorMemReq)
-    val writeback = Decoupled(new Bundle{
+    val writeback = Decoupled(new Bundle {
       val wid = UInt(numWarpBits.W)
       val last = Bool()
     })
+    val respA = Flipped(Decoupled(new TensorMemResp(dataWidth)))
+    val respB = Flipped(Decoupled(new TensorMemResp(dataWidth)))
+    val reqA = Decoupled(new TensorMemReq)
+    val reqB = Decoupled(new TensorMemReq)
   })
 
   // FSM
-  //
+  // ---
+  // This drives the overall pipeline of memory requests, dot-product unit
+  // operations and regfile writeback.
+
+  object TensorState extends ChiselEnum {
+    val idle = Value(0.U)
+    val run = Value(1.U)
+    // All set/step sequencing is complete and the tensor core is holding the
+    // result data until downstream writeback is ready.
+    // FIXME: is this necessary if writeback is decoupled with queues?
+    val finish = Value(2.U)
+  }
   val state = RegInit(TensorState.idle)
+  val busy = RegInit(false.B)
+  // Holds the warp id the core is currently working on.  Note that we only
+  // support one outstanding warp request
+  val warpReg = RegInit(0.U(numWarpBits.W))
+
   // TODO: just transition every cycle for now
-  state := (state match {
-    case TensorState.idle => Mux(io.initiate.fire, TensorState.smemRead, state)
-    case TensorState.smemRead => TensorState.compute
-    case TensorState.compute => TensorState.writeback
-    case TensorState.writeback => {
+  def nextState(state: TensorState.Type) = state match {
+    case TensorState.idle      => Mux(io.initiate.fire, TensorState.run, state)
+    case TensorState.run  => TensorState.finish
+    case TensorState.finish => {
       // hold until writeback is cleared
       Mux(io.writeback.ready, TensorState.idle, state)
     }
     case _ => TensorState.idle
-  })
+  }
+  state := nextState(state)
 
-  // TODO
-  io.dataA.ready := true.B
-  io.dataB.ready := true.B
-  io.addressA.valid := false.B
-  io.addressB.valid := false.B
-  io.addressA.bits := DontCare
-  io.addressB.bits := DontCare
-  io.initiate.ready := true.B
-  io.writeback.valid := true.B
-  io.writeback.bits := DontCare
+  // state table for every warp id
+  // sets: k iteration
+  val numSets = (tilingParams.k / tilingParams.kc)
+  val setBits = log2Ceil(numSets)
+  // steps: i-j iteration
+  val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
+  val stepBits = log2Ceil(numSteps)
+  val setReg = RegInit(0.U(setBits.W))
+  val stepReg = RegInit(0.U(setBits.W))
+  // val tableRow = Valid(new Bundle {
+  //   val set = UInt(setBits.W)
+  //   val step = UInt(stepBits.W)
+  // })
+
+  when(io.initiate.fire) {
+    val wid = io.initiate.bits.wid
+    busy := true.B
+    warpReg := wid
+    setReg := 0.U
+    stepReg := 0.U
+    when(io.writeback.fire) {
+      assert(io.writeback.bits.wid =/= wid,
+        "unsupported concurrent initiate and writeback to the same warp")
+    }
+  }
+  when (io.writeback.fire) {
+    busy := false.B
+  }
+
+  io.initiate.ready := !busy
+
+  // Writeback queues
+  // ----------------
+  // These queues hold the metadata necessary for register
+  // writeback.
+
+  // val queueDepth = 2
+  // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
+  // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
+
+  // Output logic
+  // ------------
+
+  io.writeback.valid := (state === TensorState.finish)
+  io.writeback.bits.wid := warpReg
+  io.writeback.bits.last := false.B // TODO
+
+  // FIXME
+  io.respA.ready := true.B
+  io.respB.ready := true.B
+  io.reqA.valid := false.B
+  io.reqB.valid := false.B
+  io.reqA.bits := DontCare
+  io.reqB.bits := DontCare
 }
 
 class TensorMemReq extends Bundle {
@@ -60,11 +136,3 @@ class TensorMemResp(val dataWidth: Int) extends Bundle {
   // TODO: tag
   val data = UInt(32.W)
 }
-
-
-object TensorState extends ChiselEnum {
-  val idle      = Value(0.U)
-  val smemRead  = Value(1.U)
-  val compute   = Value(2.U)
-  val writeback = Value(3.U)
-}
diff --git a/src/test/scala/radiance/TensorCoreDecoupledTest.scala b/src/test/scala/radiance/TensorCoreDecoupledTest.scala
index 5dd734a..b1e0e9a 100644
--- a/src/test/scala/radiance/TensorCoreDecoupledTest.scala
+++ b/src/test/scala/radiance/TensorCoreDecoupledTest.scala
@@ -9,13 +9,16 @@ class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester {
   behavior of "TensorCoreDecoupled"
 
   it should "do the right thing" in {
-    test(new TensorCoreDecoupled(8, 8))
+    test(new TensorCoreDecoupled(8, 8, tilingParams = TensorTilingParams()))
       { c =>
         c.io.initiate.valid.poke(true.B)
-        c.io.dataA.valid.poke(false.B)
-        c.io.dataA.bits.data.poke(0.U)
-        c.io.dataB.valid.poke(false.B)
-        c.io.dataB.bits.data.poke(0.U)
+        c.io.initiate.bits.wid.poke(0.U)
+
+        c.io.respA.valid.poke(false.B)
+        c.io.respA.bits.data.poke(0.U)
+        c.io.respB.valid.poke(false.B)
+        c.io.respB.bits.data.poke(0.U)
+
         c.clock.step()
         c.io.writeback.valid.expect(true.B)
       }

From 3165108c8bad0f968278c82ede73f850ae4deaa0 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 19:47:00 -0700
Subject: [PATCH 04/47] Add synthesizable unit test for tensor core

---
 .../radiance/core/TensorCoreDecoupled.scala   | 23 +++++++++-
 .../UnitTest.scala => unittest/Configs.scala} | 46 ++++++++++++++++---
 2 files changed, 62 insertions(+), 7 deletions(-)
 rename src/main/scala/radiance/{memory/UnitTest.scala => unittest/Configs.scala} (64%)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 87657d5..4744266 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -5,6 +5,8 @@ package radiance.core
 
 import chisel3._
 import chisel3.util._
+import org.chipsalliance.cde.config.Parameters
+import freechips.rocketchip.unittest.UnitTest
 
 case class TensorTilingParams(
   // Dimension of the SMEM tile
@@ -62,7 +64,7 @@ class TensorCoreDecoupled(
 
   // TODO: just transition every cycle for now
   def nextState(state: TensorState.Type) = state match {
-    case TensorState.idle      => Mux(io.initiate.fire, TensorState.run, state)
+    case TensorState.idle => Mux(io.initiate.fire, TensorState.run, state)
     case TensorState.run  => TensorState.finish
     case TensorState.finish => {
       // hold until writeback is cleared
@@ -136,3 +138,22 @@ class TensorMemResp(val dataWidth: Int) extends Bundle {
   // TODO: tag
   val data = UInt(32.W)
 }
+
+// synthesizable unit tests
+
+class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters)
+    extends UnitTest(timeout) {
+  val dut = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams()))
+  dut.io.initiate.valid := io.start
+  dut.io.initiate.bits.wid := 0.U
+  // TODO
+  dut.io.respA.valid := false.B
+  dut.io.respA.bits := DontCare
+  dut.io.respB.valid := false.B
+  dut.io.respB.bits := DontCare
+  dut.io.reqA.ready := true.B
+  dut.io.reqB.ready := true.B
+  dut.io.writeback.ready := true.B
+
+  io.finished := dut.io.writeback.valid
+}
diff --git a/src/main/scala/radiance/memory/UnitTest.scala b/src/main/scala/radiance/unittest/Configs.scala
similarity index 64%
rename from src/main/scala/radiance/memory/UnitTest.scala
rename to src/main/scala/radiance/unittest/Configs.scala
index c070ef4..065045c 100644
--- a/src/main/scala/radiance/memory/UnitTest.scala
+++ b/src/main/scala/radiance/unittest/Configs.scala
@@ -1,6 +1,6 @@
 // See LICENSE.SiFive for license details.
 
-package radiance.memory
+package radiance.unittest
 
 import chisel3._
 import org.chipsalliance.cde.config._
@@ -8,6 +8,8 @@ import freechips.rocketchip.subsystem.{BaseSubsystemConfig}
 import freechips.rocketchip.devices.tilelink._
 import freechips.rocketchip.tilelink._
 import freechips.rocketchip.util._
+import radiance.core.TensorCoreDecoupledTest
+import radiance.memory._
 import radiance.subsystem.WithSimtConfig
 import freechips.rocketchip.unittest._
 //import rocket.VortexFatBankTest
@@ -17,6 +19,16 @@ case object TestDurationMultiplier extends Field[Int]
 class WithTestDuration(x: Int) extends Config((site, here, up) => {
   case TestDurationMultiplier => x
 })
+
+class WithTensorUnitTests extends Config((site, _, _) => {
+  case UnitTests => (q: Parameters) => {
+    implicit val p = q
+    val timeout = 50000 * site(TestDurationMultiplier)
+    Seq(
+      Module(new TensorCoreDecoupledTest(timeout=timeout)),
+    ) }
+})
+
 class WithCoalescingUnitTests extends Config((site, _, _) => {
   case UnitTests => (q: Parameters) => {
     implicit val p = q
@@ -52,12 +64,34 @@ class WithCoalescingUnitSynthesisDummy(nLanes: Int) extends Config((site, _, _)
     ) }
 })
 
-class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nMemLanes=4) ++ new BaseSubsystemConfig)
+class TensorUnitTestConfig extends Config(
+  new WithTensorUnitTests ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+
+class CoalescingUnitTestConfig extends Config(
+  new WithCoalescingUnitTests ++
+  new WithTestDuration(10) ++
+  new WithSimtConfig(nMemLanes=4) ++
+  new BaseSubsystemConfig)
+
 //class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nLanes=4) ++ new BaseSubsystemConfig)
 
 // Dummy configs of various sizes for synthesis
-class CoalescingSynthesisDummyLane4Config extends Config(new WithCoalescingUnitSynthesisDummy(4) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
-class CoalescingSynthesisDummyLane8Config extends Config(new WithCoalescingUnitSynthesisDummy(8) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
-class CoalescingSynthesisDummyLane16Config extends Config(new WithCoalescingUnitSynthesisDummy(16) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
-class CoalescingSynthesisDummyLane32Config extends Config(new WithCoalescingUnitSynthesisDummy(32) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane4Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(4) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane8Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(8) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane16Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(16) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane32Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(32) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
 

From 01f53a8be1b2ce9a46a5e7941b21feca2e20df8f Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 20:20:30 -0700
Subject: [PATCH 05/47] tensor: Sequence through set/steps

---
 .../radiance/core/TensorCoreDecoupled.scala   | 78 +++++++++++--------
 1 file changed, 47 insertions(+), 31 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 4744266..935ed40 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -42,6 +42,7 @@ class TensorCoreDecoupled(
     val reqA = Decoupled(new TensorMemReq)
     val reqB = Decoupled(new TensorMemReq)
   })
+  dontTouch(io)
 
   // FSM
   // ---
@@ -62,48 +63,70 @@ class TensorCoreDecoupled(
   // support one outstanding warp request
   val warpReg = RegInit(0.U(numWarpBits.W))
 
-  // TODO: just transition every cycle for now
-  def nextState(state: TensorState.Type) = state match {
-    case TensorState.idle => Mux(io.initiate.fire, TensorState.run, state)
-    case TensorState.run  => TensorState.finish
-    case TensorState.finish => {
-      // hold until writeback is cleared
-      Mux(io.writeback.ready, TensorState.idle, state)
-    }
-    case _ => TensorState.idle
-  }
-  state := nextState(state)
-
-  // state table for every warp id
   // sets: k iteration
   val numSets = (tilingParams.k / tilingParams.kc)
   val setBits = log2Ceil(numSets)
   // steps: i-j iteration
   val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
   val stepBits = log2Ceil(numSteps)
-  val setReg = RegInit(0.U(setBits.W))
-  val stepReg = RegInit(0.U(setBits.W))
-  // val tableRow = Valid(new Bundle {
-  //   val set = UInt(setBits.W)
-  //   val step = UInt(stepBits.W)
-  // })
+  val set = RegInit(0.U(setBits.W))
+  val step = RegInit(0.U(stepBits.W))
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
     busy := true.B
     warpReg := wid
-    setReg := 0.U
-    stepReg := 0.U
+    set := 0.U
+    step := 0.U
     when(io.writeback.fire) {
-      assert(io.writeback.bits.wid =/= wid,
-        "unsupported concurrent initiate and writeback to the same warp")
+      assert(
+        io.writeback.bits.wid =/= wid,
+        "unsupported concurrent initiate and writeback to the same warp"
+      )
     }
   }
-  when (io.writeback.fire) {
+  when(io.writeback.fire) {
     busy := false.B
   }
 
+  // set/step sequencing logic
+  val nextStep = true.B // TODO
+  val lastSet = ((1 << setBits) - 1)
+  val lastStep = ((1 << stepBits) - 1)
+  val setDone = (set === lastSet.U)
+  val stepDone = (step === lastStep.U)
+  when (nextStep) {
+    step := (step + 1.U) & lastStep.U
+    when (stepDone) {
+      set  := (set + 1.U)  & lastSet.U
+    }
+  }
+
+  // state transition logic
+  switch(state) {
+    is(TensorState.idle) {
+      when(io.initiate.fire) {
+        state := TensorState.run
+      }
+    }
+    is(TensorState.run) {
+      when (setDone && stepDone && nextStep) {
+        when (state === TensorState.run) {
+          state := TensorState.finish
+        }
+      }
+    }
+    is(TensorState.finish) {
+      when(io.writeback.fire) {
+        state := TensorState.idle
+      }
+    }
+  }
+
   io.initiate.ready := !busy
+  io.writeback.valid := (state === TensorState.finish)
+  io.writeback.bits.wid := warpReg
+  io.writeback.bits.last := false.B // TODO
 
   // Writeback queues
   // ----------------
@@ -114,13 +137,6 @@ class TensorCoreDecoupled(
   // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
   // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 
-  // Output logic
-  // ------------
-
-  io.writeback.valid := (state === TensorState.finish)
-  io.writeback.bits.wid := warpReg
-  io.writeback.bits.last := false.B // TODO
-
   // FIXME
   io.respA.ready := true.B
   io.respB.ready := true.B

From 9ac8f2492cf89d73c3ac8f8612bb4d03d9871e35 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 20:54:24 -0700
Subject: [PATCH 06/47] tensor: Minimal diplomacy config for unittest

---
 .../radiance/core/TensorCoreDecoupled.scala   | 80 +++++++++++++++----
 1 file changed, 66 insertions(+), 14 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 935ed40..67d1c31 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -6,7 +6,10 @@ package radiance.core
 import chisel3._
 import chisel3.util._
 import org.chipsalliance.cde.config.Parameters
-import freechips.rocketchip.unittest.UnitTest
+import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
+import freechips.rocketchip.tilelink._
+import freechips.rocketchip.diplomacy.AddressSet
+import freechips.rocketchip.unittest.{UnitTest, UnitTestModule}
 
 case class TensorTilingParams(
   // Dimension of the SMEM tile
@@ -157,19 +160,68 @@ class TensorMemResp(val dataWidth: Int) extends Bundle {
 
 // synthesizable unit tests
 
+// wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy
+// network.
+class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
+  val node = TLClientNode(Seq(TLMasterPortParameters.v2(
+    Seq(TLMasterParameters.v2(
+      name = "TensorCoreDecoupledClientNode",
+      // sourceId : TODO
+    ))
+  )))
+
+  lazy val module = new TensorCoreDecoupledTLImp(this)
+}
+
+class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
+    extends LazyModuleImp(outer) with UnitTestModule {
+  val tensor = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams()))
+
+  require(outer.node.out.length == 1)
+
+  val (tlOut, edge) = outer.node.out(0)
+  tlOut.a.valid := tensor.io.reqA.valid
+  tlOut.a.bits.address := tensor.io.reqA.bits.address
+  tlOut.a.bits.source := 0.U // TODO: tensor.io.reqA.bits.source
+  tensor.io.respA.valid := tlOut.d.valid
+  tensor.io.respA.bits.data := tlOut.d.bits.data
+  // TODO: tensor.io.respA.bits.source := tlOut.d.bits.source
+
+  tensor.io.initiate.valid := io.start
+  tensor.io.initiate.bits.wid := 0.U
+  // TODO
+  tensor.io.respA.valid := false.B
+  tensor.io.respA.bits := DontCare
+  tensor.io.respB.valid := false.B
+  tensor.io.respB.bits := DontCare
+  tensor.io.reqA.ready := true.B
+  tensor.io.reqB.ready := true.B
+  tensor.io.writeback.ready := true.B
+
+  io.finished := tensor.io.writeback.valid
+}
+
+// a minimal Diplomacy graph with a tensor core and a TLRAM
+class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
+  val tensor = LazyModule(new TensorCoreDecoupledTL)
+  val ram = LazyModule(new TLRAM(
+    address = AddressSet(0x0000, 0xffffff),
+    beatBytes = 32 // FIXME: hardcoded
+  ))
+
+  ram.node :=* tensor.node
+
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with UnitTestModule {
+    tensor.module.io.start := io.start
+    io.finished := tensor.module.io.finished
+  }
+}
+
+// unit test harness
 class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters)
     extends UnitTest(timeout) {
-  val dut = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams()))
-  dut.io.initiate.valid := io.start
-  dut.io.initiate.bits.wid := 0.U
-  // TODO
-  dut.io.respA.valid := false.B
-  dut.io.respA.bits := DontCare
-  dut.io.respB.valid := false.B
-  dut.io.respB.bits := DontCare
-  dut.io.reqA.ready := true.B
-  dut.io.reqB.ready := true.B
-  dut.io.writeback.ready := true.B
-
-  io.finished := dut.io.writeback.valid
+  val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  dut.io.start := io.start
+  io.finished := dut.io.finished
 }

From bf6f7210b7cf591c118c779727fce73142c21be6 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 21:14:51 -0700
Subject: [PATCH 07/47] tensor: Generate TL traffic, separate edges for A and B

---
 .../radiance/core/TensorCoreDecoupled.scala   | 66 +++++++++++++------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 67d1c31..e61b542 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -101,10 +101,19 @@ class TensorCoreDecoupled(
   when (nextStep) {
     step := (step + 1.U) & lastStep.U
     when (stepDone) {
-      set  := (set + 1.U)  & lastSet.U
+      set := (set + 1.U) & lastSet.U
     }
   }
 
+  // memory traffic generation
+  io.reqA.valid := (state === TensorState.run) // FIXME
+  io.reqA.bits.address := 0.U // FIXME
+  io.respA.ready := true.B
+  io.respB.ready := true.B
+  // FIXME
+  io.reqB.valid := false.B
+  io.reqB.bits := DontCare
+
   // state transition logic
   switch(state) {
     is(TensorState.idle) {
@@ -139,14 +148,6 @@ class TensorCoreDecoupled(
   // val queueDepth = 2
   // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
   // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
-
-  // FIXME
-  io.respA.ready := true.B
-  io.respB.ready := true.B
-  io.reqA.valid := false.B
-  io.reqB.valid := false.B
-  io.reqA.bits := DontCare
-  io.reqB.bits := DontCare
 }
 
 class TensorMemReq extends Bundle {
@@ -163,12 +164,21 @@ class TensorMemResp(val dataWidth: Int) extends Bundle {
 // wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy
 // network.
 class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
-  val node = TLClientNode(Seq(TLMasterPortParameters.v2(
-    Seq(TLMasterParameters.v2(
-      name = "TensorCoreDecoupledClientNode",
-      // sourceId : TODO
-    ))
-  )))
+  // node with two edges; one for A and one for B matrix
+  val node = TLClientNode(Seq(
+    TLMasterPortParameters.v2(
+      Seq(TLMasterParameters.v2(
+        name = "TensorCoreDecoupledMatrixANode",
+        // sourceId : TODO
+      ))
+    ),
+    TLMasterPortParameters.v2(
+      Seq(TLMasterParameters.v2(
+        name = "TensorCoreDecoupledMatrixBNode",
+        // sourceId : TODO
+      ))
+    )
+  ))
 
   lazy val module = new TensorCoreDecoupledTLImp(this)
 }
@@ -176,13 +186,28 @@ class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
 class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
     extends LazyModuleImp(outer) with UnitTestModule {
   val tensor = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams()))
+  val wordSize = 4 // FIXME: hardcoded
 
-  require(outer.node.out.length == 1)
+  require(outer.node.out.length == 2/*A and B*/)
 
   val (tlOut, edge) = outer.node.out(0)
-  tlOut.a.valid := tensor.io.reqA.valid
-  tlOut.a.bits.address := tensor.io.reqA.bits.address
-  tlOut.a.bits.source := 0.U // TODO: tensor.io.reqA.bits.source
+  val (tlOutB, edgeB) = outer.node.out(1)
+
+  val zip = List((outer.node.out(0), tensor.io.reqA),
+                 (outer.node.out(1), tensor.io.reqB))
+  zip.foreach { case ((tl, edge), req) =>
+    tl.a.valid := req.valid
+    val (legal, bits) = edge.Get(
+      fromSource = 0.U, // TODO: sourceGen.io.id.bits,
+      toAddress = req.bits.address,
+      lgSize = log2Ceil(wordSize).U
+    )
+    tl.a.bits := bits
+    when(tl.a.fire) {
+      assert(legal, "illegal TL req gen")
+    }
+  }
+
   tensor.io.respA.valid := tlOut.d.valid
   tensor.io.respA.bits.data := tlOut.d.bits.data
   // TODO: tensor.io.respA.bits.source := tlOut.d.bits.source
@@ -204,12 +229,13 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
 // a minimal Diplomacy graph with a tensor core and a TLRAM
 class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
   val tensor = LazyModule(new TensorCoreDecoupledTL)
+  val xbar = LazyModule(new TLXbar)
   val ram = LazyModule(new TLRAM(
     address = AddressSet(0x0000, 0xffffff),
     beatBytes = 32 // FIXME: hardcoded
   ))
 
-  ram.node :=* tensor.node
+  ram.node :=* xbar.node :=* tensor.node
 
   lazy val module = new Impl
   class Impl extends LazyModuleImp(this) with UnitTestModule {

From 14a640bf2d9a29be63666b9a56c0342ee1e717b7 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 21:38:54 -0700
Subject: [PATCH 08/47] tensor: Do proper source generation

SourceGenerator keeps on givin'
---
 .../radiance/core/TensorCoreDecoupled.scala   | 88 ++++++++++++-------
 1 file changed, 55 insertions(+), 33 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index e61b542..7fa05ee 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -8,8 +8,9 @@ import chisel3.util._
 import org.chipsalliance.cde.config.Parameters
 import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.tilelink._
-import freechips.rocketchip.diplomacy.AddressSet
+import freechips.rocketchip.diplomacy.{IdRange, AddressSet}
 import freechips.rocketchip.unittest.{UnitTest, UnitTestModule}
+import radiance.memory.SourceGenerator
 
 case class TensorTilingParams(
   // Dimension of the SMEM tile
@@ -26,11 +27,13 @@ case class TensorTilingParams(
 class TensorCoreDecoupled(
     val numWarps: Int,
     val numLanes: Int,
+    val numSourceIds: Int,
     val tilingParams: TensorTilingParams
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
   val dataWidth = numLanes * wordSize // TODO FP16
+  val sourceWidth = log2Ceil(numSourceIds)
 
   val io = IO(new Bundle {
     val initiate = Flipped(Decoupled(new Bundle {
@@ -40,10 +43,10 @@ class TensorCoreDecoupled(
       val wid = UInt(numWarpBits.W)
       val last = Bool()
     })
-    val respA = Flipped(Decoupled(new TensorMemResp(dataWidth)))
-    val respB = Flipped(Decoupled(new TensorMemResp(dataWidth)))
-    val reqA = Decoupled(new TensorMemReq)
-    val reqB = Decoupled(new TensorMemReq)
+    val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
+    val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
+    val reqA = Decoupled(new TensorMemReq(sourceWidth))
+    val reqB = Decoupled(new TensorMemReq(sourceWidth))
   })
   dontTouch(io)
 
@@ -106,13 +109,25 @@ class TensorCoreDecoupled(
   }
 
   // memory traffic generation
-  io.reqA.valid := (state === TensorState.run) // FIXME
-  io.reqA.bits.address := 0.U // FIXME
+  val genReq = (state === TensorState.run)
+
+  List((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
+    case (req, resp) => {
+      val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds)))
+
+      sourceGen.io.gen := req.fire
+      sourceGen.io.meta := DontCare
+      req.valid := genReq
+      req.bits.address := 0.U // FIXME
+      req.bits.source := sourceGen.io.id.bits
+
+      sourceGen.io.reclaim.valid := resp.fire
+      sourceGen.io.reclaim.bits := resp.bits.source
+    }
+  }
+
   io.respA.ready := true.B
   io.respB.ready := true.B
-  // FIXME
-  io.reqB.valid := false.B
-  io.reqB.bits := DontCare
 
   // state transition logic
   switch(state) {
@@ -150,12 +165,17 @@ class TensorCoreDecoupled(
   // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 }
 
-class TensorMemReq extends Bundle {
-  // TODO: tag
+class TensorMemReq(
+  sourceWidth: Int
+) extends Bundle {
+  val source = UInt(sourceWidth.W)
   val address = UInt(32.W)
 }
-class TensorMemResp(val dataWidth: Int) extends Bundle {
-  // TODO: tag
+class TensorMemResp(
+  sourceWidth: Int,
+  dataWidth: Int
+) extends Bundle {
+  val source = UInt(sourceWidth.W)
   val data = UInt(32.W)
 }
 
@@ -164,18 +184,20 @@ class TensorMemResp(val dataWidth: Int) extends Bundle {
 // wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy
 // network.
 class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
+  val numSrcIds = 4
+
   // node with two edges; one for A and one for B matrix
   val node = TLClientNode(Seq(
     TLMasterPortParameters.v2(
       Seq(TLMasterParameters.v2(
         name = "TensorCoreDecoupledMatrixANode",
-        // sourceId : TODO
+        sourceId = IdRange(0, numSrcIds)
       ))
     ),
     TLMasterPortParameters.v2(
       Seq(TLMasterParameters.v2(
         name = "TensorCoreDecoupledMatrixBNode",
-        // sourceId : TODO
+        sourceId = IdRange(0, numSrcIds)
       ))
     )
   ))
@@ -185,42 +207,42 @@ class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
 
 class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
     extends LazyModuleImp(outer) with UnitTestModule {
-  val tensor = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams()))
-  val wordSize = 4 // FIXME: hardcoded
-
   require(outer.node.out.length == 2/*A and B*/)
 
-  val (tlOut, edge) = outer.node.out(0)
-  val (tlOutB, edgeB) = outer.node.out(1)
+  val tensor = Module(new TensorCoreDecoupled(
+                      8, 8, outer.numSrcIds , TensorTilingParams()))
+  val wordSize = 4 // FIXME: hardcoded
 
   val zip = List((outer.node.out(0), tensor.io.reqA),
                  (outer.node.out(1), tensor.io.reqB))
   zip.foreach { case ((tl, edge), req) =>
     tl.a.valid := req.valid
     val (legal, bits) = edge.Get(
-      fromSource = 0.U, // TODO: sourceGen.io.id.bits,
+      fromSource = req.bits.source,
       toAddress = req.bits.address,
       lgSize = log2Ceil(wordSize).U
     )
     tl.a.bits := bits
+    req.ready := tl.a.ready
     when(tl.a.fire) {
       assert(legal, "illegal TL req gen")
     }
   }
 
-  tensor.io.respA.valid := tlOut.d.valid
-  tensor.io.respA.bits.data := tlOut.d.bits.data
-  // TODO: tensor.io.respA.bits.source := tlOut.d.bits.source
+  // TODO: dedup A and B
+  val (tlOutA, _) = outer.node.out(0)
+  val (tlOutB, _) = outer.node.out(1)
+  tensor.io.respA.valid := tlOutA.d.valid
+  tensor.io.respA.bits.data := tlOutA.d.bits.data
+  tensor.io.respA.bits.source := tlOutA.d.bits.source
+  tlOutA.d.ready := tensor.io.respA.ready
+  tensor.io.respB.valid := tlOutB.d.valid
+  tensor.io.respB.bits.data := tlOutB.d.bits.data
+  tensor.io.respB.bits.source := tlOutB.d.bits.source
+  tlOutB.d.ready := tensor.io.respB.ready
 
   tensor.io.initiate.valid := io.start
-  tensor.io.initiate.bits.wid := 0.U
-  // TODO
-  tensor.io.respA.valid := false.B
-  tensor.io.respA.bits := DontCare
-  tensor.io.respB.valid := false.B
-  tensor.io.respB.bits := DontCare
-  tensor.io.reqA.ready := true.B
-  tensor.io.reqB.ready := true.B
+  tensor.io.initiate.bits.wid := 0.U // FIXME
   tensor.io.writeback.ready := true.B
 
   io.finished := tensor.io.writeback.valid

From 8d2e13b4ee660e1df22cb7838d1ad36d2f16966d Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 22:06:58 -0700
Subject: [PATCH 09/47] tensor: Hold step until req fired for both A and B

---
 .../radiance/core/TensorCoreDecoupled.scala   | 45 ++++++++++++-------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 7fa05ee..05fe576 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -95,23 +95,10 @@ class TensorCoreDecoupled(
     busy := false.B
   }
 
-  // set/step sequencing logic
-  val nextStep = true.B // TODO
-  val lastSet = ((1 << setBits) - 1)
-  val lastStep = ((1 << stepBits) - 1)
-  val setDone = (set === lastSet.U)
-  val stepDone = (step === lastStep.U)
-  when (nextStep) {
-    step := (step + 1.U) & lastStep.U
-    when (stepDone) {
-      set := (set + 1.U) & lastSet.U
-    }
-  }
-
   // memory traffic generation
   val genReq = (state === TensorState.run)
 
-  List((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
+  Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
     case (req, resp) => {
       val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds)))
 
@@ -126,9 +113,35 @@ class TensorCoreDecoupled(
     }
   }
 
+  // only advance to the next step if we fired mem requests for both A and B
+  val firedABReg = RegInit(VecInit(false.B, false.B))
+  val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map {
+    case (req, fired) => { when (req.fire) { fired := true.B } }
+    req.fire
+  })
+  val firedAB = (firedABNow.asUInt | firedABReg.asUInt)
+  val nextStep = firedAB.andR
+  // clear out firedABReg every step.  this will overwrite the previous fired
+  // write upon the last fire out of A and B
+  when (nextStep) {
+    firedABReg := Seq(false.B, false.B)
+  }
+
   io.respA.ready := true.B
   io.respB.ready := true.B
 
+  // set/step sequencing logic
+  val lastSet = ((1 << setBits) - 1)
+  val lastStep = ((1 << stepBits) - 1)
+  val setDone = (set === lastSet.U)
+  val stepDone = (step === lastStep.U)
+  when (nextStep) {
+    step := (step + 1.U) & lastStep.U
+    when (stepDone) {
+      set := (set + 1.U) & lastSet.U
+    }
+  }
+
   // state transition logic
   switch(state) {
     is(TensorState.idle) {
@@ -213,8 +226,8 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
                       8, 8, outer.numSrcIds , TensorTilingParams()))
   val wordSize = 4 // FIXME: hardcoded
 
-  val zip = List((outer.node.out(0), tensor.io.reqA),
-                 (outer.node.out(1), tensor.io.reqB))
+  val zip = Seq((outer.node.out(0), tensor.io.reqA),
+                (outer.node.out(1), tensor.io.reqB))
   zip.foreach { case ((tl, edge), req) =>
     tl.a.valid := req.valid
     val (legal, bits) = edge.Get(

From 90949f488bda6c65f4e23cea496983ba5ec53923 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 22:34:11 -0700
Subject: [PATCH 10/47] tensor: Add memory response queue

---
 .../radiance/core/TensorCoreDecoupled.scala   | 44 +++++++++++++++----
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 05fe576..617659d 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -32,7 +32,7 @@ class TensorCoreDecoupled(
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
-  val dataWidth = numLanes * wordSize // TODO FP16
+  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
 
   val io = IO(new Bundle {
@@ -40,8 +40,9 @@ class TensorCoreDecoupled(
       val wid = UInt(numWarpBits.W)
     }))
     val writeback = Decoupled(new Bundle {
-      val wid = UInt(numWarpBits.W)
       val last = Bool()
+      val wid = UInt(numWarpBits.W)
+      val data = Vec(numLanes, UInt(wordSize.W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
     val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -95,7 +96,9 @@ class TensorCoreDecoupled(
     busy := false.B
   }
 
-  // memory traffic generation
+  // Memory traffic generation
+  // -------------------------
+  //
   val genReq = (state === TensorState.run)
 
   Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
@@ -127,9 +130,33 @@ class TensorCoreDecoupled(
     firedABReg := Seq(false.B, false.B)
   }
 
-  io.respA.ready := true.B
-  io.respB.ready := true.B
+  io.respA.ready := true.B // FIXME
+  io.respB.ready := true.B // FIXME
 
+  // Execute stage
+  // -------------
+  // Execute backend of the decoupled access/execute pipeline.
+  //
+  val respQueueDepth = 4 // FIXME: parameterize
+  val respQueueA = Queue(io.respA, respQueueDepth)
+  val respQueueB = Queue(io.respB, respQueueDepth)
+  respQueueA.ready := io.writeback.ready // FIXME
+  respQueueB.ready := io.writeback.ready // FIXME
+
+  require(respQueueA.bits.data.widthOption.get ==
+          io.writeback.bits.data.widthOption.get * numLanes,
+    "response data width does not match the writeback data width")
+
+  // FIXME: debug dummy: pipe A directly to writeback
+  io.writeback.valid := respQueueA.valid
+  val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/)
+  (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) =>
+    wb := VecInit(data).asUInt
+  }
+
+  // State transition
+  // ----------------
+  //
   // set/step sequencing logic
   val lastSet = ((1 << setBits) - 1)
   val lastStep = ((1 << stepBits) - 1)
@@ -142,7 +169,6 @@ class TensorCoreDecoupled(
     }
   }
 
-  // state transition logic
   switch(state) {
     is(TensorState.idle) {
       when(io.initiate.fire) {
@@ -189,13 +215,13 @@ class TensorMemResp(
   dataWidth: Int
 ) extends Bundle {
   val source = UInt(sourceWidth.W)
-  val data = UInt(32.W)
+  val data = UInt(dataWidth.W)
 }
 
 // synthesizable unit tests
 
-// wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy
-// network.
+// wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy
+// graph.
 class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
   val numSrcIds = 4
 

From ab8d3554bb134fd96e622cd3c0a13406adbdff34 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 15:45:52 -0700
Subject: [PATCH 11/47] Bump vortex to tensor-decoupled

---
 src/main/resources/vsrc/vortex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index da54162..4dcbc31 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit da54162241da020807274bd4087844d379d8170e
+Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a

From 2ca2ee37b0fffeb7225940b03c206f3237f10b85 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 15:45:59 -0700
Subject: [PATCH 12/47] tensor: Fix writeback datawidth

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 617659d..65246f6 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -42,7 +42,7 @@ class TensorCoreDecoupled(
     val writeback = Decoupled(new Bundle {
       val last = Bool()
       val wid = UInt(numWarpBits.W)
-      val data = Vec(numLanes, UInt(wordSize.W))
+      val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
     val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -135,7 +135,7 @@ class TensorCoreDecoupled(
 
   // Execute stage
   // -------------
-  // Execute backend of the decoupled access/execute pipeline.
+  // Backend of the decoupled access/execute pipeline.
   //
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(io.respA, respQueueDepth)
@@ -144,7 +144,7 @@ class TensorCoreDecoupled(
   respQueueB.ready := io.writeback.ready // FIXME
 
   require(respQueueA.bits.data.widthOption.get ==
-          io.writeback.bits.data.widthOption.get * numLanes,
+          io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
   // FIXME: debug dummy: pipe A directly to writeback

From de393115cd97f812ceb91ef94598ca8d46570202 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 16:48:39 -0700
Subject: [PATCH 13/47] tensor: Translate TL response source to set/step tag

---
 .../radiance/core/TensorCoreDecoupled.scala   | 79 +++++++++++++------
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 65246f6..43dc1ca 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -32,8 +32,8 @@ class TensorCoreDecoupled(
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
-  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
+  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
 
   val io = IO(new Bundle {
     val initiate = Flipped(Decoupled(new Bundle {
@@ -51,6 +51,27 @@ class TensorCoreDecoupled(
   })
   dontTouch(io)
 
+  class TensorMemReq(
+    sourceWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val address = UInt(32.W)
+  }
+  class TensorMemResp(
+    sourceWidth: Int,
+    dataWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val data = UInt(dataWidth.W)
+  }
+  // mem response after translation from TL source to set/step tag
+  class TensorMemRespWithTag(
+    dataWidth: Int
+  ) extends Bundle {
+    val tag = new TensorMemTag
+    val data = UInt(dataWidth.W)
+  }
+
   // FSM
   // ---
   // This drives the overall pipeline of memory requests, dot-product unit
@@ -101,18 +122,39 @@ class TensorCoreDecoupled(
   //
   val genReq = (state === TensorState.run)
 
-  Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
-    case (req, resp) => {
-      val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds)))
+  class TensorMemTag extends Bundle {
+    val set = UInt(setBits.W)
+    val step = UInt(stepBits.W)
+  }
+  // use concatenation of set/step as the memory request source.  This will get
+  // translated to the actual TL sourcewidth in sourceGen.
+  val tag = Wire(new TensorMemTag)
+  tag.set := set
+  tag.step := step
+
+  val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach {
+    case (req, (resp, respTagged)) => {
+      val sourceGen = Module(new SourceGenerator(
+        log2Ceil(numSourceIds),
+        metadata = Some(tag)
+      ))
 
       sourceGen.io.gen := req.fire
-      sourceGen.io.meta := DontCare
+      sourceGen.io.meta := tag
       req.valid := genReq
       req.bits.address := 0.U // FIXME
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
       sourceGen.io.reclaim.bits := resp.bits.source
+
+      // translate source
+      respTagged.valid := resp.valid
+      respTagged.bits.tag := sourceGen.io.peek
+      respTagged.bits.data := resp.bits.data
+      resp.ready := respTagged.ready
     }
   }
 
@@ -130,16 +172,13 @@ class TensorCoreDecoupled(
     firedABReg := Seq(false.B, false.B)
   }
 
-  io.respA.ready := true.B // FIXME
-  io.respB.ready := true.B // FIXME
-
   // Execute stage
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
   val respQueueDepth = 4 // FIXME: parameterize
-  val respQueueA = Queue(io.respA, respQueueDepth)
-  val respQueueB = Queue(io.respB, respQueueDepth)
+  val respQueueA = Queue(respATagged, respQueueDepth)
+  val respQueueB = Queue(respBTagged, respQueueDepth)
   respQueueA.ready := io.writeback.ready // FIXME
   respQueueB.ready := io.writeback.ready // FIXME
 
@@ -149,9 +188,11 @@ class TensorCoreDecoupled(
 
   // FIXME: debug dummy: pipe A directly to writeback
   io.writeback.valid := respQueueA.valid
-  val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/)
+  val groupedRespA = respQueueA.bits.data
+                     .asBools.grouped(wordSize * 8/*bits*/)
+                     .map(VecInit(_).asUInt)
   (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) =>
-    wb := VecInit(data).asUInt
+    wb := data
   }
 
   // State transition
@@ -204,20 +245,6 @@ class TensorCoreDecoupled(
   // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 }
 
-class TensorMemReq(
-  sourceWidth: Int
-) extends Bundle {
-  val source = UInt(sourceWidth.W)
-  val address = UInt(32.W)
-}
-class TensorMemResp(
-  sourceWidth: Int,
-  dataWidth: Int
-) extends Bundle {
-  val source = UInt(sourceWidth.W)
-  val data = UInt(dataWidth.W)
-}
-
 // synthesizable unit tests
 
 // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy

From efaf599fbe679f0e5e7ef671522408f34984057e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 17:08:14 -0700
Subject: [PATCH 14/47] tensor: Assert alignment of A and B response queues

---
 .../radiance/core/TensorCoreDecoupled.scala   | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 43dc1ca..4f5ecb3 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -97,15 +97,16 @@ class TensorCoreDecoupled(
   // steps: i-j iteration
   val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
   val stepBits = log2Ceil(numSteps)
-  val set = RegInit(0.U(setBits.W))
-  val step = RegInit(0.U(stepBits.W))
+  // set and step being currently accessed in the acc/ex frontend
+  val setAccess = RegInit(0.U(setBits.W))
+  val stepAccess = RegInit(0.U(stepBits.W))
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
     busy := true.B
     warpReg := wid
-    set := 0.U
-    step := 0.U
+    setAccess := 0.U
+    stepAccess := 0.U
     when(io.writeback.fire) {
       assert(
         io.writeback.bits.wid =/= wid,
@@ -129,8 +130,8 @@ class TensorCoreDecoupled(
   // use concatenation of set/step as the memory request source.  This will get
   // translated to the actual TL sourcewidth in sourceGen.
   val tag = Wire(new TensorMemTag)
-  tag.set := set
-  tag.step := step
+  tag.set := setAccess
+  tag.step := stepAccess
 
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
@@ -176,16 +177,32 @@ class TensorCoreDecoupled(
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
+  // set and step being currently executed in the acc/ex backend
+  val setExecute = RegInit(0.U(setBits.W))
+  val stepExecute = RegInit(0.U(stepBits.W))
+
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
-  respQueueA.ready := io.writeback.ready // FIXME
-  respQueueB.ready := io.writeback.ready // FIXME
 
   require(respQueueA.bits.data.widthOption.get ==
           io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
+  val bothQueueValid = (respQueueA.valid && respQueueB.valid)
+  // assume in-order response and that A/B responses are always aligned; this
+  // might be too strong an assumption depending on the backing memory
+  when (bothQueueValid) {
+    assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
+           (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
+           "A and B response queue pointing to different set/steps. " ++
+           "This might indicate memory response coming back out-of-order.")
+  }
+  // synchronized dequeue
+  val deqResp = bothQueueValid && io.writeback.ready
+  respQueueA.ready := deqResp
+  respQueueB.ready := deqResp
+
   // FIXME: debug dummy: pipe A directly to writeback
   io.writeback.valid := respQueueA.valid
   val groupedRespA = respQueueA.bits.data
@@ -201,12 +218,12 @@ class TensorCoreDecoupled(
   // set/step sequencing logic
   val lastSet = ((1 << setBits) - 1)
   val lastStep = ((1 << stepBits) - 1)
-  val setDone = (set === lastSet.U)
-  val stepDone = (step === lastStep.U)
+  val setDone = (setAccess === lastSet.U)
+  val stepDone = (stepAccess === lastStep.U)
   when (nextStep) {
-    step := (step + 1.U) & lastStep.U
+    stepAccess := (stepAccess + 1.U) & lastStep.U
     when (stepDone) {
-      set := (set + 1.U) & lastSet.U
+      setAccess := (setAccess + 1.U) & lastSet.U
     }
   }
 

From e2abe1cffdc3a658b0acc5b2cb36a82d5a3450ec Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 19:12:15 -0700
Subject: [PATCH 15/47] tensor: Sequence set/steps on the execute side

---
 .../radiance/core/TensorCoreDecoupled.scala   | 52 ++++++++++++-------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 4f5ecb3..7c07564 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -97,9 +97,16 @@ class TensorCoreDecoupled(
   // steps: i-j iteration
   val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
   val stepBits = log2Ceil(numSteps)
+  val lastSet = ((1 << setBits) - 1)
+  val lastStep = ((1 << stepBits) - 1)
+  def setDone(set: UInt) = (set === lastSet.U)
+  def stepDone(step: UInt) = (step === lastStep.U)
+
   // set and step being currently accessed in the acc/ex frontend
   val setAccess = RegInit(0.U(setBits.W))
   val stepAccess = RegInit(0.U(stepBits.W))
+  dontTouch(setAccess)
+  dontTouch(stepAccess)
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
@@ -118,6 +125,9 @@ class TensorCoreDecoupled(
     busy := false.B
   }
 
+  // serialize every HGMMA request
+  io.initiate.ready := !busy
+
   // Memory traffic generation
   // -------------------------
   //
@@ -166,10 +176,10 @@ class TensorCoreDecoupled(
     req.fire
   })
   val firedAB = (firedABNow.asUInt | firedABReg.asUInt)
-  val nextStep = firedAB.andR
+  val nextStepAccess = firedAB.andR
   // clear out firedABReg every step.  this will overwrite the previous fired
   // write upon the last fire out of A and B
-  when (nextStep) {
+  when (nextStepAccess) {
     firedABReg := Seq(false.B, false.B)
   }
 
@@ -180,6 +190,8 @@ class TensorCoreDecoupled(
   // set and step being currently executed in the acc/ex backend
   val setExecute = RegInit(0.U(setBits.W))
   val stepExecute = RegInit(0.U(stepBits.W))
+  dontTouch(setExecute)
+  dontTouch(stepExecute)
 
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
@@ -198,13 +210,19 @@ class TensorCoreDecoupled(
            "A and B response queue pointing to different set/steps. " ++
            "This might indicate memory response coming back out-of-order.")
   }
-  // synchronized dequeue
+  // dequeue is synchronized between A and B
+  // FIXME: this need to change to dpu_ready
   val deqResp = bothQueueValid && io.writeback.ready
   respQueueA.ready := deqResp
   respQueueB.ready := deqResp
+  // FIXME: this need to change to dpu_fire
+  val nextStepExecute = io.writeback.fire
+
+  io.writeback.valid := bothQueueValid
+  io.writeback.bits.wid := warpReg
+  io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
   // FIXME: debug dummy: pipe A directly to writeback
-  io.writeback.valid := respQueueA.valid
   val groupedRespA = respQueueA.bits.data
                      .asBools.grouped(wordSize * 8/*bits*/)
                      .map(VecInit(_).asUInt)
@@ -216,16 +234,17 @@ class TensorCoreDecoupled(
   // ----------------
   //
   // set/step sequencing logic
-  val lastSet = ((1 << setBits) - 1)
-  val lastStep = ((1 << stepBits) - 1)
-  val setDone = (setAccess === lastSet.U)
-  val stepDone = (stepAccess === lastStep.U)
-  when (nextStep) {
-    stepAccess := (stepAccess + 1.U) & lastStep.U
-    when (stepDone) {
-      setAccess := (setAccess + 1.U) & lastSet.U
+
+  def sequenceSetStep(set: UInt, step: UInt, nextStep: Bool) = {
+    when (nextStep) {
+      step := (step + 1.U) & lastStep.U
+      when (stepDone(step)) {
+        set := (set + 1.U) & lastSet.U
+      }
     }
   }
+  sequenceSetStep(setAccess, stepAccess, nextStepAccess)
+  sequenceSetStep(setExecute, stepExecute, nextStepExecute)
 
   switch(state) {
     is(TensorState.idle) {
@@ -234,7 +253,7 @@ class TensorCoreDecoupled(
       }
     }
     is(TensorState.run) {
-      when (setDone && stepDone && nextStep) {
+      when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) {
         when (state === TensorState.run) {
           state := TensorState.finish
         }
@@ -247,11 +266,6 @@ class TensorCoreDecoupled(
     }
   }
 
-  io.initiate.ready := !busy
-  io.writeback.valid := (state === TensorState.finish)
-  io.writeback.bits.wid := warpReg
-  io.writeback.bits.last := false.B // TODO
-
   // Writeback queues
   // ----------------
   // These queues hold the metadata necessary for register
@@ -328,7 +342,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
   tensor.io.initiate.bits.wid := 0.U // FIXME
   tensor.io.writeback.ready := true.B
 
-  io.finished := tensor.io.writeback.valid
+  io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
 }
 
 // a minimal Diplomacy graph with a tensor core and a TLRAM

From 444dd5d7e1c54ab78111fdcfac9ebf3145809f02 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 14:25:38 -0700
Subject: [PATCH 16/47] tensor: Add destination reg to IO

---
 .../scala/radiance/core/TensorCoreDecoupled.scala  | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 7c07564..92f98b7 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -28,12 +28,14 @@ class TensorCoreDecoupled(
     val numWarps: Int,
     val numLanes: Int,
     val numSourceIds: Int,
-    val tilingParams: TensorTilingParams
+    val tilingParams: TensorTilingParams,
+    val numFPRegs: Int = 32
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
   val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
+  val numFPRegBits = log2Ceil(numFPRegs)
 
   val io = IO(new Bundle {
     val initiate = Flipped(Decoupled(new Bundle {
@@ -42,6 +44,7 @@ class TensorCoreDecoupled(
     val writeback = Decoupled(new Bundle {
       val last = Bool()
       val wid = UInt(numWarpBits.W)
+      val rd = UInt(numFPRegBits.W)
       val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -218,8 +221,17 @@ class TensorCoreDecoupled(
   // FIXME: this need to change to dpu_fire
   val nextStepExecute = io.writeback.fire
 
+  def rdGen(set: UInt, step: UInt): UInt = {
+    // each step produces 4x4 output tile, written by 8 threads with 2 regs per
+    // thread
+    require(numLanes == 8, "currently assumes 8-wide warps")
+    (Cat(set, step) >> 1/*2 regs/thread*/)
+    // FIXME: add substep here
+  }
+
   io.writeback.valid := bothQueueValid
   io.writeback.bits.wid := warpReg
+  io.writeback.bits.rd := rdGen(setExecute, stepExecute)
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
   // FIXME: debug dummy: pipe A directly to writeback

From 77dae3e1f9941d15c213b19a43cd82bd0e00c81c Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 21:21:48 -0700
Subject: [PATCH 17/47] tensor: Write staging pipeline for A tile

---
 .../radiance/core/TensorCoreDecoupled.scala   | 103 ++++++++++++++----
 src/main/scala/radiance/core/TensorDPU.scala  |   1 +
 2 files changed, 83 insertions(+), 21 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 92f98b7..69b84f9 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -108,8 +108,12 @@ class TensorCoreDecoupled(
   // set and step being currently accessed in the acc/ex frontend
   val setAccess = RegInit(0.U(setBits.W))
   val stepAccess = RegInit(0.U(stepBits.W))
+  // we need full 4x4 A tile to fire DPU, but since the memory width is 8
+  // words, we need 2 cycles to read A.  `substep` tells which cycle we're at.
+  val substepAccess = RegInit(0.U(1.W))
   dontTouch(setAccess)
   dontTouch(stepAccess)
+  dontTouch(substepAccess)
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
@@ -139,16 +143,19 @@ class TensorCoreDecoupled(
   class TensorMemTag extends Bundle {
     val set = UInt(setBits.W)
     val step = UInt(stepBits.W)
+    val substep = UInt(1.W)
   }
   // use concatenation of set/step as the memory request source.  This will get
   // translated to the actual TL sourcewidth in sourceGen.
   val tag = Wire(new TensorMemTag)
   tag.set := setAccess
   tag.step := stepAccess
+  tag.substep := substepAccess
 
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
-  Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach {
+  Seq((io.reqA, (io.respA, respATagged)),
+      (io.reqB, (io.respB, respBTagged))).foreach {
     case (req, (resp, respTagged)) => {
       val sourceGen = Module(new SourceGenerator(
         log2Ceil(numSourceIds),
@@ -173,18 +180,22 @@ class TensorCoreDecoupled(
   }
 
   // only advance to the next step if we fired mem requests for both A and B
+  // TODO: @perf: too strict? should be able to have A and B progress
+  // separately
   val firedABReg = RegInit(VecInit(false.B, false.B))
   val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map {
     case (req, fired) => { when (req.fire) { fired := true.B } }
     req.fire
   })
   val firedAB = (firedABNow.asUInt | firedABReg.asUInt)
-  val nextStepAccess = firedAB.andR
-  // clear out firedABReg every step.  this will overwrite the previous fired
-  // write upon the last fire out of A and B
-  when (nextStepAccess) {
+  val nextSubstepAccess = firedAB.andR
+  val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
+  // clear out firedABReg every substep
+  when (nextSubstepAccess) {
     firedABReg := Seq(false.B, false.B)
+    substepAccess := substepAccess + 1.U
   }
+  require(substepAccess.widthOption.get == 1, "there should be only two substeps")
 
   // Execute stage
   // -------------
@@ -204,22 +215,72 @@ class TensorCoreDecoupled(
           io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
-  val bothQueueValid = (respQueueA.valid && respQueueB.valid)
-  // assume in-order response and that A/B responses are always aligned; this
-  // might be too strong an assumption depending on the backing memory
-  when (bothQueueValid) {
-    assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
-           (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
-           "A and B response queue pointing to different set/steps. " ++
-           "This might indicate memory response coming back out-of-order.")
-  }
-  // dequeue is synchronized between A and B
   // FIXME: this need to change to dpu_ready
-  val deqResp = bothQueueValid && io.writeback.ready
-  respQueueA.ready := deqResp
-  respQueueB.ready := deqResp
-  // FIXME: this need to change to dpu_fire
-  val nextStepExecute = io.writeback.fire
+  val dpuReady = io.writeback.ready // FIXME: this need be actual dpu
+
+  val substepExecute = RegInit(0.U(1.W))
+  when (respQueueA.fire) {
+    substepExecute := substepExecute + 1.U
+  }
+  dontTouch(substepExecute)
+
+  // note combinationally coupled ready with `pipe`
+  val halfAQueue = Module(new Queue(
+    chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true
+  ))
+  halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U)
+  halfAQueue.io.enq.bits := respQueueA.bits.data
+
+  // we need the full data for A because we divide the D tile by half along N;
+  // for B, the DPU can immediately start computing with a 4x2 tile.
+  //
+  // substep == 0 data goes to the LSB
+  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits)
+  val fullAQueue = Module(new Queue(
+    chiselTypeOf(fullAEnqData), entries = 1, pipe = true
+  ))
+  // hold first half A data for the first substep
+  halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) &&
+                             fullAQueue.io.enq.ready
+
+  require(fullAEnqData.widthOption.get == dataWidth * 2,
+          "assumes 2-cycle read for a full compute tile of A")
+  fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) &&
+                             halfAQueue.io.deq.valid
+  fullAQueue.io.enq.bits := fullAEnqData
+
+  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME?
+  val dpuFire = operandsValid && dpuReady
+  fullAQueue.io.deq.ready := dpuFire
+  val nextStepExecute = dpuFire
+
+  // FIXME: need to hold A for two cycles!!
+
+  // make sure to dequeue from response queues only when both A and B valid
+  respQueueA.ready := MuxCase(false.B,
+                              Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
+                                  (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
+  respQueueB.ready := dpuFire
+  dontTouch(respQueueA)
+  dontTouch(respQueueB)
+
+  // assert that the A and B response queue heads always point to the same
+  // set/step/substep
+  //
+  // this assumes that memory responses come back in-order.  this might be too
+  // strong an assumption depending on the backing memory
+  def assertAligned = {
+    val bothQueueValid = (respQueueA.valid && respQueueB.valid)
+    when (bothQueueValid && (substepExecute === 0.U)) {
+      assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
+        (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
+        "A and B response queue pointing to different set/steps. " ++
+        "This might indicate memory response coming back out-of-order.")
+    }
+    dontTouch(respQueueA.bits.tag)
+    dontTouch(respQueueB.bits.tag)
+  }
+  assertAligned
 
   def rdGen(set: UInt, step: UInt): UInt = {
     // each step produces 4x4 output tile, written by 8 threads with 2 regs per
@@ -229,7 +290,7 @@ class TensorCoreDecoupled(
     // FIXME: add substep here
   }
 
-  io.writeback.valid := bothQueueValid
+  io.writeback.valid := operandsValid // FIXME: bypass logic
   io.writeback.bits.wid := warpReg
   io.writeback.bits.rd := rdGen(setExecute, stepExecute)
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala
index 4e6cee7..a82bed7 100644
--- a/src/main/scala/radiance/core/TensorDPU.scala
+++ b/src/main/scala/radiance/core/TensorDPU.scala
@@ -27,6 +27,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
       val b = Vec(dotProductDim, Bits((inFLen).W))
       val c = Bits((outFLen).W) // note C has the out length for accumulation
     }))
+    // 'stall' is effectively out.ready, combinationally coupled to in.ready
     val stall = Input(Bool())
     val out = Valid(new Bundle {
       val data = Bits((outFLen).W)

From 6cad8edd1838642cbbb61ef6998c8318d96864e1 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 22:01:02 -0700
Subject: [PATCH 18/47] tensor: Fix operand alignment in pipelining

---
 .../radiance/core/TensorCoreDecoupled.scala   | 56 +++++++++++--------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 69b84f9..0654df3 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -224,37 +224,51 @@ class TensorCoreDecoupled(
   }
   dontTouch(substepExecute)
 
+  // Do pipelining for the A operand so that we obtain the full 4x4 A tile
+  // ready for compute.  The pipeline is two-stage:
+  //   - stage one (halfAQueue) for assembling the full A tile from half-tiles
+  //     coming from the resp queue, and
+  //   - stage two (fullAQueue) for holding the full A tile until it gets
+  //     matched with two 4x2 B tiles, and compute is complete.
+  //
+  // Note that the half-tile assembly is unnecessary for B since the B tile is
+  // only 4x2.
+  // Also send the set/step tag along the pipe for alignment check.
+
   // note combinationally coupled ready with `pipe`
   val halfAQueue = Module(new Queue(
-    chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true
+    chiselTypeOf(respQueueA.bits), entries = 1, pipe = true
   ))
   halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U)
-  halfAQueue.io.enq.bits := respQueueA.bits.data
+  halfAQueue.io.enq.bits := respQueueA.bits
 
-  // we need the full data for A because we divide the D tile by half along N;
-  // for B, the DPU can immediately start computing with a 4x2 tile.
-  //
   // substep == 0 data goes to the LSB
-  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits)
+  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data)
+  require(fullAEnqData.widthOption.get == dataWidth * 2,
+          "assumes 2-cycle read for a full compute tile of A")
+  // only use the lower halfA's tag.  substep will be incorrect.
+  val fullAEnqTag = halfAQueue.io.deq.bits.tag
   val fullAQueue = Module(new Queue(
-    chiselTypeOf(fullAEnqData), entries = 1, pipe = true
+    new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
   ))
   // hold first half A data for the first substep
   halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) &&
                              fullAQueue.io.enq.ready
-
-  require(fullAEnqData.widthOption.get == dataWidth * 2,
-          "assumes 2-cycle read for a full compute tile of A")
   fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) &&
                              halfAQueue.io.deq.valid
-  fullAQueue.io.enq.bits := fullAEnqData
+  fullAQueue.io.enq.bits.data := fullAEnqData
+  fullAQueue.io.enq.bits.tag := fullAEnqTag
 
   val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME?
   val dpuFire = operandsValid && dpuReady
-  fullAQueue.io.deq.ready := dpuFire
-  val nextStepExecute = dpuFire
+  val substepCompute = RegInit(0.U(1.W))
+  when (dpuFire) {
+    substepCompute := substepCompute + 1.U
+  }
 
-  // FIXME: need to hold A for two cycles!!
+  // hold full A until two-cycle compute is done
+  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
   // make sure to dequeue from response queues only when both A and B valid
   respQueueA.ready := MuxCase(false.B,
@@ -264,21 +278,17 @@ class TensorCoreDecoupled(
   dontTouch(respQueueA)
   dontTouch(respQueueB)
 
-  // assert that the A and B response queue heads always point to the same
-  // set/step/substep
+  // assert that the DPU is computing with operands of the same set/step
   //
   // this assumes that memory responses come back in-order.  this might be too
   // strong an assumption depending on the backing memory
   def assertAligned = {
-    val bothQueueValid = (respQueueA.valid && respQueueB.valid)
-    when (bothQueueValid && (substepExecute === 0.U)) {
-      assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
-        (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
-        "A and B response queue pointing to different set/steps. " ++
+    when (dpuFire) {
+      assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) &&
+             (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step),
+        "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
-    dontTouch(respQueueA.bits.tag)
-    dontTouch(respQueueB.bits.tag)
   }
   assertAligned
 

From 23edc34c7ebc28623a5961abc654d7f4049c4864 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 22:15:35 -0700
Subject: [PATCH 19/47] tensor: Add two TLRAM config for full throughput test

---
 .../radiance/core/TensorCoreDecoupled.scala   | 38 ++++++++++++++++---
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 0654df3..154a3cf 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -155,8 +155,8 @@ class TensorCoreDecoupled(
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
-      (io.reqB, (io.respB, respBTagged))).foreach {
-    case (req, (resp, respTagged)) => {
+      (io.reqB, (io.respB, respBTagged))).zipWithIndex.foreach {
+    case ((req, (resp, respTagged)), i) => {
       val sourceGen = Module(new SourceGenerator(
         log2Ceil(numSourceIds),
         metadata = Some(tag)
@@ -165,7 +165,9 @@ class TensorCoreDecoupled(
       sourceGen.io.gen := req.fire
       sourceGen.io.meta := tag
       req.valid := genReq
-      req.bits.address := 0.U // FIXME
+      // FIXME: bogus address
+      // req.bits.address := (if (i == 0) 0.U else 0x100.U) // avoids bank conflict for A and B
+      req.bits.address := 0.U
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
@@ -270,7 +272,8 @@ class TensorCoreDecoupled(
   fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
   val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
-  // make sure to dequeue from response queues only when both A and B valid
+  // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
+  // on the substep
   respQueueA.ready := MuxCase(false.B,
                               Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
                                   (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
@@ -446,10 +449,35 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
   }
 }
 
+// two separate TLRAMs for A and B for full throughput
+class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
+  val tensor = LazyModule(new TensorCoreDecoupledTL)
+  val xbar = LazyModule(new TLXbar)
+  val ramA = LazyModule(new TLRAM(
+    address = AddressSet(0x000, 0xfffeff),
+    beatBytes = 32 // FIXME: hardcoded
+  ))
+  val ramB = LazyModule(new TLRAM(
+    address = AddressSet(0x100, 0xfffeff),
+    beatBytes = 32 // FIXME: hardcoded
+  ))
+
+  xbar.node :=* tensor.node
+  ramA.node := xbar.node
+  ramB.node := xbar.node
+
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with UnitTestModule {
+    tensor.module.io.start := io.start
+    io.finished := tensor.module.io.finished
+  }
+}
+
 // unit test harness
 class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters)
     extends UnitTest(timeout) {
-  val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  // val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  val dut = Module(LazyModule(new TensorCoreDecoupledTwoTLRAM).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
 }

From e1e3ac8274bd02954ff4d64ad9462ef4a8bb2f1b Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 16 Oct 2024 22:22:27 -0700
Subject: [PATCH 20/47] tensor: Fix busy state

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 154a3cf..652608b 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -128,7 +128,13 @@ class TensorCoreDecoupled(
       )
     }
   }
-  when(io.writeback.fire) {
+
+  // TODO: @perf: Instead of waiting until the last writeback, release busy as
+  // soon as the access frontend is complete so that there's a better chance to
+  // saturate the backend with back-to-back HGMMAs.  This would require sending
+  // the 'wid' register to backend instead of having it shared with the
+  // frontend.
+  when(io.writeback.fire && io.writeback.bits.last) {
     busy := false.B
   }
 

From 8847278ad1d54fe3167f01e0b9f70fcd3dd01096 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 14:37:33 -0700
Subject: [PATCH 21/47] tensor: Instantiate actual DPU

---
 .../radiance/core/TensorCoreDecoupled.scala   | 93 +++++++++++++++----
 1 file changed, 73 insertions(+), 20 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 652608b..b9695ad 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -33,8 +33,9 @@ class TensorCoreDecoupled(
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
+  val wordSizeInBits = wordSize * 8 // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
-  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
+  val dataWidth = numLanes * wordSizeInBits // TODO FP16
   val numFPRegBits = log2Ceil(numFPRegs)
 
   val io = IO(new Bundle {
@@ -45,7 +46,7 @@ class TensorCoreDecoupled(
       val last = Bool()
       val wid = UInt(numWarpBits.W)
       val rd = UInt(numFPRegBits.W)
-      val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W))
+      val data = Vec(numLanes, UInt((wordSizeInBits).W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
     val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -223,9 +224,6 @@ class TensorCoreDecoupled(
           io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
-  // FIXME: this need to change to dpu_ready
-  val dpuReady = io.writeback.ready // FIXME: this need be actual dpu
-
   val substepExecute = RegInit(0.U(1.W))
   when (respQueueA.fire) {
     substepExecute := substepExecute + 1.U
@@ -267,7 +265,10 @@ class TensorCoreDecoupled(
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
 
-  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME?
+  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid
+  val operandA = fullAQueue.io.deq.bits.data
+  val operandB = respQueueB.bits.data
+  val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val substepCompute = RegInit(0.U(1.W))
   when (dpuFire) {
@@ -301,6 +302,66 @@ class TensorCoreDecoupled(
   }
   assertAligned
 
+  // Dot-product unit
+  //
+  // 4x2 four-element DPUs summing up to 32 MACs in total
+  val dpus = Seq.fill(4)(Seq.fill(2)(
+    Module(new TensorDotProductUnit(half = false))
+  ))
+  // operandA is 4x4 in K-major
+  val operandADimensional =
+    operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    .grouped(4).toSeq
+  println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits")
+  println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}")
+  assert(operandADimensional.length == tilingParams.mc &&
+         operandADimensional(0).length == tilingParams.kc,
+         "operand width doesn't agree with tiling parameter")
+  // operandB is 2x4, i.e. 4x2 in N-major
+  val operandBDimensional =
+    operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    .grouped(4).toSeq
+  println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}")
+  val ncSubstep = tilingParams.nc / 2
+  assert(tilingParams.mc * ncSubstep == numLanes,
+         "substep tile size doesn't match writeback throughput")
+  assert(operandBDimensional.length == ncSubstep &&
+         operandBDimensional(0).length == tilingParams.kc,
+         "operand width doesn't agree with tiling parameter")
+
+  for (m <- 0 until tilingParams.mc) {
+    for (n <- 0 until ncSubstep) {
+      dpus(m)(n).io.in.valid := dpuFire
+      dpus(m)(n).io.in.bits.a := operandADimensional(m)
+      dpus(m)(n).io.in.bits.b := operandBDimensional(n)
+      dpus(m)(n).io.in.bits.c := 0.U // FIXME: bogus accum data
+      // dpu ready couples with writeback backpressure
+      dpus(m)(n).io.stall := !io.writeback.ready
+    }
+  }
+  dpuReady := !dpus(0)(0).io.stall
+  dontTouch(dpuFire)
+  dontTouch(dpuReady)
+
+  val dpuValids = dpus.flatMap(_.map(_.io.out.valid))
+  val dpuValid = dpuValids.reduce(_ && _)
+  def assertDPU = {
+    val dpuStalls = dpus.flatMap(_.map(_.io.stall))
+    assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _),
+      "stall signals of DPUs went unaligned")
+    assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _),
+      "valid signals of DPUs went unaligned")
+  }
+  assertDPU
+
+  // flatten DPU output into 1D array in M-major order
+  val flattenedDPUOut = (0 until ncSubstep).flatMap { n =>
+    (0 until tilingParams.mc).map { m =>
+      dpus(m)(n).io.out.bits.data
+    }
+  }
+  io.writeback.bits.data := flattenedDPUOut
+
   def rdGen(set: UInt, step: UInt): UInt = {
     // each step produces 4x4 output tile, written by 8 threads with 2 regs per
     // thread
@@ -309,19 +370,11 @@ class TensorCoreDecoupled(
     // FIXME: add substep here
   }
 
-  io.writeback.valid := operandsValid // FIXME: bypass logic
+  io.writeback.valid := dpuValid
   io.writeback.bits.wid := warpReg
   io.writeback.bits.rd := rdGen(setExecute, stepExecute)
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
-  // FIXME: debug dummy: pipe A directly to writeback
-  val groupedRespA = respQueueA.bits.data
-                     .asBools.grouped(wordSize * 8/*bits*/)
-                     .map(VecInit(_).asUInt)
-  (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) =>
-    wb := data
-  }
-
   // State transition
   // ----------------
   //
@@ -400,7 +453,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
 
   val tensor = Module(new TensorCoreDecoupled(
                       8, 8, outer.numSrcIds , TensorTilingParams()))
-  val wordSize = 4 // FIXME: hardcoded
+  val wordSize = 4 // @cleanup: hardcoded
 
   val zip = Seq((outer.node.out(0), tensor.io.reqA),
                 (outer.node.out(1), tensor.io.reqB))
@@ -431,7 +484,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
   tlOutB.d.ready := tensor.io.respB.ready
 
   tensor.io.initiate.valid := io.start
-  tensor.io.initiate.bits.wid := 0.U // FIXME
+  tensor.io.initiate.bits.wid := 0.U // TODO
   tensor.io.writeback.ready := true.B
 
   io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
@@ -443,7 +496,7 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
   val xbar = LazyModule(new TLXbar)
   val ram = LazyModule(new TLRAM(
     address = AddressSet(0x0000, 0xffffff),
-    beatBytes = 32 // FIXME: hardcoded
+    beatBytes = 32 // @cleanup: hardcoded
   ))
 
   ram.node :=* xbar.node :=* tensor.node
@@ -461,11 +514,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
   val xbar = LazyModule(new TLXbar)
   val ramA = LazyModule(new TLRAM(
     address = AddressSet(0x000, 0xfffeff),
-    beatBytes = 32 // FIXME: hardcoded
+    beatBytes = 32 // @cleanup: hardcoded
   ))
   val ramB = LazyModule(new TLRAM(
     address = AddressSet(0x100, 0xfffeff),
-    beatBytes = 32 // FIXME: hardcoded
+    beatBytes = 32 // @cleanup: hardcoded
   ))
 
   xbar.node :=* tensor.node

From 7de8e86d4f04712f90c4457940c02a341b721f76 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 15:18:47 -0700
Subject: [PATCH 22/47] tensor: Sync rd with DPU using a queue

---
 .../radiance/core/TensorCoreDecoupled.scala   | 44 ++++++++++++-------
 src/main/scala/radiance/core/TensorDPU.scala  |  2 +-
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index b9695ad..92a6596 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -270,6 +270,8 @@ class TensorCoreDecoupled(
   val operandB = respQueueB.bits.data
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
+  val setCompute = fullAQueue.io.deq.bits.tag.set
+  val stepCompute = fullAQueue.io.deq.bits.tag.step
   val substepCompute = RegInit(0.U(1.W))
   when (dpuFire) {
     substepCompute := substepCompute + 1.U
@@ -348,9 +350,9 @@ class TensorCoreDecoupled(
   def assertDPU = {
     val dpuStalls = dpus.flatMap(_.map(_.io.stall))
     assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _),
-      "stall signals of DPUs went unaligned")
+      "stall signals of DPUs went out of sync")
     assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _),
-      "valid signals of DPUs went unaligned")
+      "valid signals of DPUs went out of sync")
   }
   assertDPU
 
@@ -362,17 +364,36 @@ class TensorCoreDecoupled(
   }
   io.writeback.bits.data := flattenedDPUOut
 
-  def rdGen(set: UInt, step: UInt): UInt = {
+  // Writeback queues
+  // ----------------
+  // These queues hold metadata needed for writeback in sync with the DPU.
+
+  val queueDepth = 4 // needs to be at least the DPU latency
+  val rdQueue = Module(new Queue(
+    chiselTypeOf(io.writeback.bits.rd), queueDepth
+  ))
+  rdQueue.io.enq.valid := dpuFire
+  rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute)
+  rdQueue.io.deq.ready := io.writeback.fire
+  assert(rdQueue.io.enq.ready === true.B,
+         "rd queue full, throttling DPU operation")
+  assert(!dpuValid || rdQueue.io.deq.valid,
+         "rd queue and DPU went out of sync")
+
+  // TODO: decouple wid from frontend
+  // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
+
+  // note rd is independent to sets
+  def rdGen(step: UInt, substep: UInt): UInt = {
     // each step produces 4x4 output tile, written by 8 threads with 2 regs per
     // thread
-    require(numLanes == 8, "currently assumes 8-wide warps")
-    (Cat(set, step) >> 1/*2 regs/thread*/)
-    // FIXME: add substep here
+    (step << 1/*2 substeps*/) + substep
   }
 
   io.writeback.valid := dpuValid
   io.writeback.bits.wid := warpReg
-  io.writeback.bits.rd := rdGen(setExecute, stepExecute)
+  io.writeback.bits.rd := rdQueue.io.deq.bits
+  // FIXME: look at set/step of dpu output not setExecute
   io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
 
   // State transition
@@ -410,15 +431,6 @@ class TensorCoreDecoupled(
       }
     }
   }
-
-  // Writeback queues
-  // ----------------
-  // These queues hold the metadata necessary for register
-  // writeback.
-
-  // val queueDepth = 2
-  // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
-  // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 }
 
 // synthesizable unit tests
diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala
index a82bed7..515b1bf 100644
--- a/src/main/scala/radiance/core/TensorDPU.scala
+++ b/src/main/scala/radiance/core/TensorDPU.scala
@@ -53,7 +53,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
   io.out.bits.data := ieee(box(dpu.io.out.bits.data, S))
 }
 
-// Copied from chisel3.util.Pipe.
+// An implementation of chisel3.util.Pipe that supports stalls.
 class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module {
   /** A non-ambiguous name of this `StallingPipe` for use in generated Verilog
    *  names. Includes the latency cycle count in the name as well as the

From 2741af0b2b36026cfe57ca227eb469d6643d4c12 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 15:43:44 -0700
Subject: [PATCH 23/47] tensor: Keep set/step in the tag writeback queue

---
 .../radiance/core/TensorCoreDecoupled.scala   | 39 +++++++++++--------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 92a6596..3d00c35 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -267,6 +267,7 @@ class TensorCoreDecoupled(
 
   val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid
   val operandA = fullAQueue.io.deq.bits.data
+  val operandATag = fullAQueue.io.deq.bits.tag
   val operandB = respQueueB.bits.data
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
@@ -314,8 +315,6 @@ class TensorCoreDecoupled(
   val operandADimensional =
     operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4).toSeq
-  println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits")
-  println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}")
   assert(operandADimensional.length == tilingParams.mc &&
          operandADimensional(0).length == tilingParams.kc,
          "operand width doesn't agree with tiling parameter")
@@ -323,7 +322,6 @@ class TensorCoreDecoupled(
   val operandBDimensional =
     operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4).toSeq
-  println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}")
   val ncSubstep = tilingParams.nc / 2
   assert(tilingParams.mc * ncSubstep == numLanes,
          "substep tile size doesn't match writeback throughput")
@@ -369,18 +367,20 @@ class TensorCoreDecoupled(
   // These queues hold metadata needed for writeback in sync with the DPU.
 
   val queueDepth = 4 // needs to be at least the DPU latency
-  val rdQueue = Module(new Queue(
-    chiselTypeOf(io.writeback.bits.rd), queueDepth
+  val tagQueue = Module(new Queue(
+    chiselTypeOf(operandATag), queueDepth
   ))
-  rdQueue.io.enq.valid := dpuFire
-  rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute)
-  rdQueue.io.deq.ready := io.writeback.fire
-  assert(rdQueue.io.enq.ready === true.B,
-         "rd queue full, throttling DPU operation")
-  assert(!dpuValid || rdQueue.io.deq.valid,
-         "rd queue and DPU went out of sync")
+  tagQueue.io.enq.valid := dpuFire
+  // A and B should have the same tags
+  tagQueue.io.enq.bits := operandATag
+  // @cleanup: awkward
+  tagQueue.io.enq.bits.substep := substepCompute
+  tagQueue.io.deq.ready := io.writeback.fire
+  assert(tagQueue.io.enq.ready === true.B,
+         "tag queue full, DPU operation might be throttled")
+  assert(!dpuValid || tagQueue.io.deq.valid,
+         "tag queue and DPU went out of sync")
 
-  // TODO: decouple wid from frontend
   // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 
   // note rd is independent to sets
@@ -390,11 +390,14 @@ class TensorCoreDecoupled(
     (step << 1/*2 substeps*/) + substep
   }
 
+  val setWriteback = tagQueue.io.deq.bits.set
+  val stepWriteback = tagQueue.io.deq.bits.step
+  val substepWriteback = tagQueue.io.deq.bits.substep
   io.writeback.valid := dpuValid
+  // TODO: decouple wid from frontend
   io.writeback.bits.wid := warpReg
-  io.writeback.bits.rd := rdQueue.io.deq.bits
-  // FIXME: look at set/step of dpu output not setExecute
-  io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute)
+  io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback)
+  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback)
 
   // State transition
   // ----------------
@@ -500,6 +503,10 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
   tensor.io.writeback.ready := true.B
 
   io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
+  when (io.finished) {
+    // might be too strong
+    assert(tensor.io.writeback.bits.rd === 31.U)
+  }
 }
 
 // a minimal Diplomacy graph with a tensor core and a TLRAM

From a2519da58fe1397a7570656a4726f42693e8d845 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 17 Oct 2024 16:36:18 -0700
Subject: [PATCH 24/47] tensor: SMEM address generation

---
 .../radiance/core/TensorCoreDecoupled.scala   | 51 +++++++++++++++++--
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 3d00c35..f7c8547 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -159,6 +159,48 @@ class TensorCoreDecoupled(
   tag.step := stepAccess
   tag.substep := substepAccess
 
+  // @cleanup: generalize in terms of M/N/K-majorness?
+  def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt)
+      : (UInt/*A*/, UInt/*B*/) = {
+    // note that step iterates along N first, then M
+    val numComputeTilesM = tilingParams.m / tilingParams.mc
+    val numComputeTilesN = tilingParams.n / tilingParams.nc
+    val tileM = step % numComputeTilesM.U
+    val tileN = step / numComputeTilesM.U
+    val mcSubstep  = tilingParams.mc / 2
+    val ncSubstep  = tilingParams.nc / 2
+
+    // note that both A and B are K-major to facilitate bank conflict-free SMEM
+    // accesses
+    //
+    // (row,col) coordinate of the compute tile
+    val tileRowA = tileM // M
+    val tileColA = set   // K
+    val tileRowB = tileN // N
+    val tileColB = set   // K
+    // (row,col) coordinate of the starting element of the compute tile
+    val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) +
+                   (substep << log2Ceil(mcSubstep))
+    val elemColA = tileColA << log2Ceil(tilingParams.kc)
+    val elemRowB = tileRowB << log2Ceil(tilingParams.nc)
+                   (substep << log2Ceil(ncSubstep))
+    val elemColB = tileColB << log2Ceil(tilingParams.kc)
+    val rowStrideA = wordSize * tilingParams.k
+    val rowStrideABits = log2Ceil(rowStrideA)
+    val rowStrideB = wordSize * tilingParams.k
+    val rowStrideBBits = log2Ceil(rowStrideB)
+    val wordStrideBits = log2Ceil(wordSize)
+
+    val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits)
+    val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits)
+
+    (baseA + tileOffsetA, baseB + tileOffsetB)
+  }
+
+  // FIXME: bogus base address
+  val (addressA, addressB) =
+    addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
+
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
@@ -172,9 +214,7 @@ class TensorCoreDecoupled(
       sourceGen.io.gen := req.fire
       sourceGen.io.meta := tag
       req.valid := genReq
-      // FIXME: bogus address
-      // req.bits.address := (if (i == 0) 0.U else 0x100.U) // avoids bank conflict for A and B
-      req.bits.address := 0.U
+      req.bits.address := (if (i == 0) addressA else addressB)
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
@@ -366,7 +406,7 @@ class TensorCoreDecoupled(
   // ----------------
   // These queues hold metadata needed for writeback in sync with the DPU.
 
-  val queueDepth = 4 // needs to be at least the DPU latency
+  val queueDepth = 6 // needs to be at least the DPU latency
   val tagQueue = Module(new Queue(
     chiselTypeOf(operandATag), queueDepth
   ))
@@ -397,7 +437,8 @@ class TensorCoreDecoupled(
   // TODO: decouple wid from frontend
   io.writeback.bits.wid := warpReg
   io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback)
-  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback)
+  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) &&
+                            (substepWriteback === 1.U)
 
   // State transition
   // ----------------

From 64ea48ace3681e0a74a732fb4da006717e62b873 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 13:46:04 -0700
Subject: [PATCH 25/47] tensor: Consider data reuse for B memory request

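With the k->i->j iteration order, the same B tile is reused across the
numTilesM consecutive steps (4 with the current tiling) that sweep the M
dimension, so a B memory request is only issued on the first step of each
such group.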
---
 .../radiance/core/TensorCoreDecoupled.scala   | 111 ++++++++++--------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index f7c8547..897edb2 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -145,8 +145,6 @@ class TensorCoreDecoupled(
   // Memory traffic generation
   // -------------------------
   //
-  val genReq = (state === TensorState.run)
-
   class TensorMemTag extends Bundle {
     val set = UInt(setBits.W)
     val step = UInt(stepBits.W)
@@ -159,16 +157,14 @@ class TensorCoreDecoupled(
   tag.step := stepAccess
   tag.substep := substepAccess
 
+  val numTilesM = tilingParams.m / tilingParams.mc
+  val numTilesN = tilingParams.n / tilingParams.nc
   // @cleanup: generalize in terms of M/N/K-majorness?
   def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt)
       : (UInt/*A*/, UInt/*B*/) = {
     // note that step iterates along N first, then M
-    val numComputeTilesM = tilingParams.m / tilingParams.mc
-    val numComputeTilesN = tilingParams.n / tilingParams.nc
-    val tileM = step % numComputeTilesM.U
-    val tileN = step / numComputeTilesM.U
-    val mcSubstep  = tilingParams.mc / 2
-    val ncSubstep  = tilingParams.nc / 2
+    val tileM = step % numTilesM.U
+    val tileN = step / numTilesM.U
 
     // note that both A and B are K-major to facilitate bank conflict-free SMEM
     // accesses
@@ -180,11 +176,11 @@ class TensorCoreDecoupled(
     val tileColB = set   // K
     // (row,col) coordinate of the starting element of the compute tile
     val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) +
-                   (substep << log2Ceil(mcSubstep))
-    val elemColA = tileColA << log2Ceil(tilingParams.kc)
-    val elemRowB = tileRowB << log2Ceil(tilingParams.nc)
-                   (substep << log2Ceil(ncSubstep))
-    val elemColB = tileColB << log2Ceil(tilingParams.kc)
+                    (substep << log2Ceil(tilingParams.mc / 2))
+    val elemColA =  tileColA << log2Ceil(tilingParams.kc)
+    val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) +
+                    (substep << log2Ceil(tilingParams.nc / 2))
+    val elemColB =  tileColB << log2Ceil(tilingParams.kc)
     val rowStrideA = wordSize * tilingParams.k
     val rowStrideABits = log2Ceil(rowStrideA)
     val rowStrideB = wordSize * tilingParams.k
@@ -201,6 +197,13 @@ class TensorCoreDecoupled(
   val (addressA, addressB) =
     addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
 
+  val genReqA = (state === TensorState.run)
+  val numTilesMBits = log2Ceil(numTilesM)
+  // generate B request at every 4 steps.  B achieves reuse through outer
+  // product so it doesn't require access at every step
+  val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U
+  val genReqB = (state === TensorState.run) && shouldFireB
+
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
@@ -213,7 +216,7 @@ class TensorCoreDecoupled(
 
       sourceGen.io.gen := req.fire
       sourceGen.io.meta := tag
-      req.valid := genReq
+      req.valid := (if (i == 0) genReqA else genReqB)
       req.bits.address := (if (i == 0) addressA else addressB)
       req.bits.source := sourceGen.io.id.bits
 
@@ -228,23 +231,27 @@ class TensorCoreDecoupled(
     }
   }
 
-  // only advance to the next step if we fired mem requests for both A and B
-  // TODO: @perf: too strict? should be able to have A and B progress
-  // separately
-  val firedABReg = RegInit(VecInit(false.B, false.B))
-  val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map {
-    case (req, fired) => { when (req.fire) { fired := true.B } }
-    req.fire
-  })
-  val firedAB = (firedABNow.asUInt | firedABReg.asUInt)
-  val nextSubstepAccess = firedAB.andR
+  // only advance to the next step if we fired mem requests for both A and B.
+  // also consider that B doesn't have to be fired every time due to reuse.
+  // @perf: too strict? should be able to have A and B progress separately
+  val firedAReg = RegInit(false.B)
+  val firedBReg = RegInit(false.B)
+  when (io.reqA.fire) { firedAReg := true.B }
+  when (io.reqB.fire) { firedBReg := true.B }
+  val firedANow = io.reqA.fire
+  val firedBNow = io.reqB.fire
+  val firedA = firedAReg || firedANow
+  val firedB = firedBReg || firedBNow
+  val nextSubstepAccess = firedA && (!shouldFireB || firedB)
   val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
   // clear out firedABReg every substep
   when (nextSubstepAccess) {
-    firedABReg := Seq(false.B, false.B)
+    firedAReg := false.B
+    firedBReg := false.B
     substepAccess := substepAccess + 1.U
   }
   require(substepAccess.widthOption.get == 1, "there should be only two substeps")
+  dontTouch(shouldFireB)
 
   // Execute stage
   // -------------
@@ -327,18 +334,26 @@ class TensorCoreDecoupled(
   respQueueA.ready := MuxCase(false.B,
                               Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
                                   (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
-  respQueueB.ready := dpuFire
+  // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
+  // we fully iterated a column (M-dimension).
+  val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
+  val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask
+  respQueueB.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
+  dontTouch(shouldDequeueB)
 
-  // assert that the DPU is computing with operands of the same set/step
+  // Assert that the DPU is computing with operands of the same set/step. Note
+  // that the B resp will only have step values multiple of 4 due to reuse.
   //
-  // this assumes that memory responses come back in-order.  this might be too
-  // strong an assumption depending on the backing memory
+  // This check assumes that memory responses come back in-order.  Might be too
+  // strong of an assumption depending on the backing memory.
   def assertAligned = {
+    val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
       assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) &&
-             (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step),
+             ((fullAQueue.io.deq.bits.tag.step & stepMask) ===
+              (respQueueB.bits.tag.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
@@ -348,26 +363,26 @@ class TensorCoreDecoupled(
   // Dot-product unit
   //
   // 4x2 four-element DPUs summing up to 32 MACs in total
-  val dpus = Seq.fill(4)(Seq.fill(2)(
+  val ncSubstep = tilingParams.nc / 2
+  val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)(
     Module(new TensorDotProductUnit(half = false))
   ))
   // operandA is 4x4 in K-major
   val operandADimensional =
     operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4).toSeq
-  assert(operandADimensional.length == tilingParams.mc &&
-         operandADimensional(0).length == tilingParams.kc,
-         "operand width doesn't agree with tiling parameter")
-  // operandB is 2x4, i.e. 4x2 in N-major
+    .grouped(4/*k-dim*/).toSeq
+  require(operandADimensional.length == tilingParams.mc &&
+          operandADimensional(0).length == tilingParams.kc,
+          "operand width doesn't agree with tiling parameter")
+  // operandB is 2x4 in K-major
   val operandBDimensional =
     operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4).toSeq
-  val ncSubstep = tilingParams.nc / 2
-  assert(tilingParams.mc * ncSubstep == numLanes,
-         "substep tile size doesn't match writeback throughput")
-  assert(operandBDimensional.length == ncSubstep &&
-         operandBDimensional(0).length == tilingParams.kc,
-         "operand width doesn't agree with tiling parameter")
+    .grouped(4/*k-dim*/).toSeq
+  require(tilingParams.mc * ncSubstep == numLanes,
+          "substep tile size doesn't match writeback throughput")
+  require(operandBDimensional.length == ncSubstep &&
+          operandBDimensional(0).length == tilingParams.kc,
+          "operand width doesn't agree with tiling parameter")
 
   for (m <- 0 until tilingParams.mc) {
     for (n <- 0 until ncSubstep) {
@@ -406,10 +421,8 @@ class TensorCoreDecoupled(
   // ----------------
   // These queues hold metadata needed for writeback in sync with the DPU.
 
-  val queueDepth = 6 // needs to be at least the DPU latency
-  val tagQueue = Module(new Queue(
-    chiselTypeOf(operandATag), queueDepth
-  ))
+  val queueDepth = 5 // needs to be at least the DPU latency
+  val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth))
   tagQueue.io.enq.valid := dpuFire
   // A and B should have the same tags
   tagQueue.io.enq.bits := operandATag
@@ -573,11 +586,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
   val tensor = LazyModule(new TensorCoreDecoupledTL)
   val xbar = LazyModule(new TLXbar)
   val ramA = LazyModule(new TLRAM(
-    address = AddressSet(0x000, 0xfffeff),
+    address = AddressSet(0x000, 0xfffbff),
     beatBytes = 32 // @cleanup: hardcoded
   ))
   val ramB = LazyModule(new TLRAM(
-    address = AddressSet(0x100, 0xfffeff),
+    address = AddressSet(0x400, 0xfffbff),
     beatBytes = 32 // @cleanup: hardcoded
   ))
 

From c2f39f74749df7fac8ba63d8900d6651eea72f71 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 16:21:43 -0700
Subject: [PATCH 26/47] tensor: Rename substepExecute

---
 .../radiance/core/TensorCoreDecoupled.scala    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 897edb2..f7c6c63 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -269,13 +269,13 @@ class TensorCoreDecoupled(
 
   require(respQueueA.bits.data.widthOption.get ==
           io.writeback.bits.data.widthOption.get,
-    "response data width does not match the writeback data width")
+          "response data width does not match the writeback data width")
 
-  val substepExecute = RegInit(0.U(1.W))
+  val substepDeqA = RegInit(0.U(1.W))
   when (respQueueA.fire) {
-    substepExecute := substepExecute + 1.U
+    substepDeqA := substepDeqA + 1.U
   }
-  dontTouch(substepExecute)
+  dontTouch(substepDeqA)
 
   // Do pipelining for the A operand so that we obtain the full 4x4 A tile
   // ready for compute.  The pipeline is two-stage:
@@ -292,7 +292,7 @@ class TensorCoreDecoupled(
   val halfAQueue = Module(new Queue(
     chiselTypeOf(respQueueA.bits), entries = 1, pipe = true
   ))
-  halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U)
+  halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U)
   halfAQueue.io.enq.bits := respQueueA.bits
 
   // substep == 0 data goes to the LSB
@@ -305,9 +305,9 @@ class TensorCoreDecoupled(
     new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
   ))
   // hold first half A data for the first substep
-  halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) &&
+  halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) &&
                              fullAQueue.io.enq.ready
-  fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) &&
+  fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) &&
                              halfAQueue.io.deq.valid
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
@@ -332,8 +332,8 @@ class TensorCoreDecoupled(
   // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
   // on the substep
   respQueueA.ready := MuxCase(false.B,
-                              Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
-                                  (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
+                              Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready,
+                                  (substepDeqA === 1.U) -> fullAQueue.io.enq.ready))
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U

From 91d9897c277a1f0ab4e678bdd91f21eef7ac380d Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 17:17:41 -0700
Subject: [PATCH 27/47] tensor: Write FillBuffer for tile buffering

---
 .../radiance/core/TensorCoreDecoupled.scala   | 50 +++++++++++++++++--
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index f7c6c63..e70e59f 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -5,6 +5,7 @@ package radiance.core
 
 import chisel3._
 import chisel3.util._
+import chisel3.experimental.requireIsChiselType
 import org.chipsalliance.cde.config.Parameters
 import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.tilelink._
@@ -312,10 +313,17 @@ class TensorCoreDecoupled(
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
 
-  val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid
+  val fillBufB = Module(new FillBuffer(
+    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
+  ))
+  fillBufB.io.enq.valid := respQueueB.valid
+  fillBufB.io.enq.bits := respQueueB.bits.data
+  respQueueB.ready := fillBufB.io.enq.ready
+
+  val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid
   val operandA = fullAQueue.io.deq.bits.data
   val operandATag = fullAQueue.io.deq.bits.tag
-  val operandB = respQueueB.bits.data
+  val operandB = fillBufB.io.deq.bits
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val setCompute = fullAQueue.io.deq.bits.tag.set
@@ -338,7 +346,7 @@ class TensorCoreDecoupled(
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask
-  respQueueB.ready := dpuFire && shouldDequeueB
+  fillBufB.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
   dontTouch(shouldDequeueB)
@@ -375,8 +383,11 @@ class TensorCoreDecoupled(
           operandADimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
   // operandB is 2x4 in K-major
+  // val operandBDimensional =
+  //   operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+  //   .grouped(4/*k-dim*/).toSeq
   val operandBDimensional =
-    operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq
   require(tilingParams.mc * ncSubstep == numLanes,
           "substep tile size doesn't match writeback throughput")
@@ -490,6 +501,37 @@ class TensorCoreDecoupled(
   }
 }
 
+// Collects `entries` consecutive `enq` beats into registers and exposes them
+// together on `deq` as a Vec (element 0 is the first beat enqueued).  `enq` is
+// back-pressured while the buffer is full, i.e. until `deq` fires.
+class FillBuffer[T <: Data](gen: T, entries: Int) extends Module {
+  require(entries > 0, "FillBuffer must have a positive number of entries")
+  requireIsChiselType(gen)
+
+  val io = IO(new Bundle {
+    val enq = Flipped(Decoupled(gen))
+    val deq = Decoupled(Vec(entries, gen))
+  })
+
+  val data = Reg(Vec(entries, gen))
+  val ptr = Counter(entries + 1) // beats held so far; `entries` means full
+  val full = (ptr.value === entries.U)
+  io.enq.ready := !full
+  when (io.enq.fire) {
+    // decoded write: ptr can be 1 bit wider than a `data` index (Chisel warns)
+    data.zipWithIndex.foreach { case (slot, i) =>
+      when (ptr.value === i.U) { slot := io.enq.bits }
+    }
+    ptr.inc()
+  }
+  io.deq.valid := full
+  io.deq.bits := data
+  when (io.deq.fire) {
+    assert(ptr.value === entries.U, "FillBuffer fired before buffer was full")
+    ptr.reset()
+  }
+}
+
 // synthesizable unit tests
 
 // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy

From 7fab6f89ad3e99de20e4aa5972be745c720b1e70 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 17:33:55 -0700
Subject: [PATCH 28/47] tensor: Properly route FillBuffer to DPU

---
 .../radiance/core/TensorCoreDecoupled.scala   | 40 ++++++++++++-------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index e70e59f..206250e 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -313,17 +313,24 @@ class TensorCoreDecoupled(
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
 
-  val fillBufB = Module(new FillBuffer(
+  // serialize every two B responses into one full 4x4 B tile
+  // FIXME: do the same for A
+  val fullB = Module(new FillBuffer(
     chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
   ))
-  fillBufB.io.enq.valid := respQueueB.valid
-  fillBufB.io.enq.bits := respQueueB.bits.data
-  respQueueB.ready := fillBufB.io.enq.ready
+  fullB.io.enq.valid := respQueueB.valid
+  fullB.io.enq.bits := respQueueB.bits.data
+  respQueueB.ready := fullB.io.enq.ready
+  val fullBTag = Module(new Queue(
+    new TensorMemTag, entries = 1, pipe = true
+  ))
+  fullBTag.io.enq.valid := respQueueB.valid
+  fullBTag.io.enq.bits := respQueueB.bits.tag
 
-  val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid
+  val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid
   val operandA = fullAQueue.io.deq.bits.data
   val operandATag = fullAQueue.io.deq.bits.tag
-  val operandB = fillBufB.io.deq.bits
+  val operandB = fullB.io.deq.bits
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val setCompute = fullAQueue.io.deq.bits.tag.set
@@ -333,10 +340,6 @@ class TensorCoreDecoupled(
     substepCompute := substepCompute + 1.U
   }
 
-  // hold full A until two-cycle compute is done
-  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
-  val nextStepExecute = dpuFire && (substepCompute === 1.U)
-
   // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
   // on the substep
   respQueueA.ready := MuxCase(false.B,
@@ -345,12 +348,19 @@ class TensorCoreDecoupled(
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
-  val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask
-  fillBufB.io.deq.ready := dpuFire && shouldDequeueB
+  val shouldDequeueB =
+    ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) &&
+    (substepCompute === 1.U)
+  fullB.io.deq.ready := dpuFire && shouldDequeueB
+  fullBTag.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
   dontTouch(shouldDequeueB)
 
+  // hold full A until two-cycle compute is done
+  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  val nextStepExecute = dpuFire && (substepCompute === 1.U)
+
   // Assert that the DPU is computing with operands of the same set/step. Note
   // that the B resp will only have step values multiple of 4 due to reuse.
   //
@@ -359,9 +369,9 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) &&
+      assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
              ((fullAQueue.io.deq.bits.tag.step & stepMask) ===
-              (respQueueB.bits.tag.step & stepMask)),
+              (fullBTag.io.deq.bits.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
@@ -387,7 +397,7 @@ class TensorCoreDecoupled(
   //   operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
   //   .grouped(4/*k-dim*/).toSeq
   val operandBDimensional =
-    operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq
   require(tilingParams.mc * ncSubstep == numLanes,
           "substep tile size doesn't match writeback throughput")

From c4b5a11fdefbbfbe73b765bb1feece25d2a1d3f1 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 19:54:20 -0700
Subject: [PATCH 29/47] tensor: Replace staging logic for A with FillBuffer

---
 .../radiance/core/TensorCoreDecoupled.scala   | 77 ++++++++-----------
 1 file changed, 34 insertions(+), 43 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 206250e..deb4dc1 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -272,46 +272,41 @@ class TensorCoreDecoupled(
           io.writeback.bits.data.widthOption.get,
           "response data width does not match the writeback data width")
 
+  // FIXME: unnecessary
   val substepDeqA = RegInit(0.U(1.W))
   when (respQueueA.fire) {
     substepDeqA := substepDeqA + 1.U
   }
   dontTouch(substepDeqA)
 
-  // Do pipelining for the A operand so that we obtain the full 4x4 A tile
-  // ready for compute.  The pipeline is two-stage:
-  //   - stage one (halfAQueue) for assembling the full A tile from half-tiles
-  //     coming from the resp queue, and
-  //   - stage two (fullAQueue) for holding the full A tile until it gets
-  //     matched with two 4x2 B tiles, and compute is complete.
-  //
-  // Note that the half-tile assembly is unnecessary for B since the B tile is
-  // only 4x2.
-  // Also send the set/step tag along the pipe for alignment check.
+  // Stage the operands in a pipeline so that we obtain the full 4x4 tiles
+  // ready for compute.  Also send the set/step tag along the pipe for
+  // alignment check.
 
-  // note combinationally coupled ready with `pipe`
-  val halfAQueue = Module(new Queue(
-    chiselTypeOf(respQueueA.bits), entries = 1, pipe = true
+  val fullA = Module(new FillBuffer(
+    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
   ))
-  halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U)
-  halfAQueue.io.enq.bits := respQueueA.bits
+  fullA.io.enq.valid := respQueueA.valid
+  fullA.io.enq.bits := respQueueA.bits.data
+  respQueueA.ready := fullA.io.enq.ready
+  // `pipe` combinationally couples enq-deq ready
+  val fullATag = Module(new Queue(
+    new TensorMemTag, entries = 1, pipe = true
+  ))
+  fullATag.io.enq.valid := respQueueA.valid
+  fullATag.io.enq.bits := respQueueA.bits.tag
 
-  // substep == 0 data goes to the LSB
-  val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data)
-  require(fullAEnqData.widthOption.get == dataWidth * 2,
-          "assumes 2-cycle read for a full compute tile of A")
-  // only use the lower halfA's tag.  substep will be incorrect.
-  val fullAEnqTag = halfAQueue.io.deq.bits.tag
-  val fullAQueue = Module(new Queue(
+  // stage the full A tile once more so that FillBuffer can be filled up in the
+  // background while the tile is being used for compute.  This does come with
+  // capacity overhead.
+  val fullABuf = Module(new Queue(
     new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
   ))
-  // hold first half A data for the first substep
-  halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) &&
-                             fullAQueue.io.enq.ready
-  fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) &&
-                             halfAQueue.io.deq.valid
-  fullAQueue.io.enq.bits.data := fullAEnqData
-  fullAQueue.io.enq.bits.tag := fullAEnqTag
+  fullABuf.io.enq.valid := fullA.io.deq.valid
+  fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt
+  fullABuf.io.enq.bits.tag := fullATag.io.deq.bits
+  fullA.io.deq.ready := fullABuf.io.enq.ready
+  fullATag.io.deq.ready := fullABuf.io.enq.ready
 
   // serialize every two B responses into one full 4x4 B tile
   // FIXME: do the same for A
@@ -327,29 +322,24 @@ class TensorCoreDecoupled(
   fullBTag.io.enq.valid := respQueueB.valid
   fullBTag.io.enq.bits := respQueueB.bits.tag
 
-  val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid
-  val operandA = fullAQueue.io.deq.bits.data
-  val operandATag = fullAQueue.io.deq.bits.tag
+  val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid
+  val operandA = fullABuf.io.deq.bits.data
+  val operandATag = fullABuf.io.deq.bits.tag
   val operandB = fullB.io.deq.bits
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
-  val setCompute = fullAQueue.io.deq.bits.tag.set
-  val stepCompute = fullAQueue.io.deq.bits.tag.step
+  val setCompute = fullABuf.io.deq.bits.tag.set
+  val stepCompute = fullABuf.io.deq.bits.tag.step
   val substepCompute = RegInit(0.U(1.W))
   when (dpuFire) {
     substepCompute := substepCompute + 1.U
   }
 
-  // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
-  // on the substep
-  respQueueA.ready := MuxCase(false.B,
-                              Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready,
-                                  (substepDeqA === 1.U) -> fullAQueue.io.enq.ready))
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB =
-    ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) &&
+    ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
     (substepCompute === 1.U)
   fullB.io.deq.ready := dpuFire && shouldDequeueB
   fullBTag.io.deq.ready := dpuFire && shouldDequeueB
@@ -358,7 +348,8 @@ class TensorCoreDecoupled(
   dontTouch(shouldDequeueB)
 
   // hold full A until two-cycle compute is done
-  fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
+  // FIXME: this should be nextStepCompute
   val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
   // Assert that the DPU is computing with operands of the same set/step. Note
@@ -369,8 +360,8 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
-             ((fullAQueue.io.deq.bits.tag.step & stepMask) ===
+      assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
+             ((fullABuf.io.deq.bits.tag.step & stepMask) ===
               (fullBTag.io.deq.bits.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")

From 93c9bcc32f5b516f3bd51990ff60e22e0348f409 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 20:12:15 -0700
Subject: [PATCH 30/47] tensor: Stage B as well for full throughput

---
 .../radiance/core/TensorCoreDecoupled.scala   | 41 ++++++++++++-------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index deb4dc1..90cb785 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -300,10 +300,13 @@ class TensorCoreDecoupled(
   // background while the tile is being used for compute.  This does come with
   // capacity overhead.
   val fullABuf = Module(new Queue(
-    new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
+    new Bundle {
+      val data = chiselTypeOf(fullA.io.deq.bits)
+      val tag = new TensorMemTag
+    }, entries = 1, pipe = true
   ))
   fullABuf.io.enq.valid := fullA.io.deq.valid
-  fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt
+  fullABuf.io.enq.bits.data := fullA.io.deq.bits
   fullABuf.io.enq.bits.tag := fullATag.io.deq.bits
   fullA.io.deq.ready := fullABuf.io.enq.ready
   fullATag.io.deq.ready := fullABuf.io.enq.ready
@@ -322,10 +325,22 @@ class TensorCoreDecoupled(
   fullBTag.io.enq.valid := respQueueB.valid
   fullBTag.io.enq.bits := respQueueB.bits.tag
 
-  val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid
+  val fullBBuf = Module(new Queue(
+    new Bundle {
+      val data = chiselTypeOf(fullB.io.deq.bits)
+      val tag = new TensorMemTag
+    }, entries = 1, pipe = true
+  ))
+  fullBBuf.io.enq.valid := fullB.io.deq.valid
+  fullBBuf.io.enq.bits.data := fullB.io.deq.bits
+  fullBBuf.io.enq.bits.tag := fullBTag.io.deq.bits
+  fullB.io.deq.ready := fullBBuf.io.enq.ready
+  fullBTag.io.deq.ready := fullBBuf.io.enq.ready
+
+  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
   val operandA = fullABuf.io.deq.bits.data
   val operandATag = fullABuf.io.deq.bits.tag
-  val operandB = fullB.io.deq.bits
+  val operandB = fullBBuf.io.deq.bits.data
   val dpuReady = Wire(Bool())
   val dpuFire = operandsValid && dpuReady
   val setCompute = fullABuf.io.deq.bits.tag.set
@@ -335,20 +350,19 @@ class TensorCoreDecoupled(
     substepCompute := substepCompute + 1.U
   }
 
+  // hold full A until two-cycle compute is done
+  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB =
     ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
     (substepCompute === 1.U)
-  fullB.io.deq.ready := dpuFire && shouldDequeueB
-  fullBTag.io.deq.ready := dpuFire && shouldDequeueB
+  fullBBuf.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
   dontTouch(shouldDequeueB)
 
-  // hold full A until two-cycle compute is done
-  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
   // FIXME: this should be nextStepCompute
   val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
@@ -360,9 +374,9 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) &&
+      assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) &&
              ((fullABuf.io.deq.bits.tag.step & stepMask) ===
-              (fullBTag.io.deq.bits.step & stepMask)),
+              (fullBBuf.io.deq.bits.tag.step & stepMask)),
         "A and B operands are pointing to different set/steps. " ++
         "This might indicate memory response coming back out-of-order.")
     }
@@ -378,15 +392,12 @@ class TensorCoreDecoupled(
   ))
   // operandA is 4x4 in K-major
   val operandADimensional =
-    operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+    operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq
   require(operandADimensional.length == tilingParams.mc &&
           operandADimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
-  // operandB is 2x4 in K-major
-  // val operandBDimensional =
-  //   operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-  //   .grouped(4/*k-dim*/).toSeq
+  // select 2x4 subtile out of operandB that is 4x4 in K-major
   val operandBDimensional =
     operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
     .grouped(4/*k-dim*/).toSeq

From c0292dd0aa97a8cec3d034b20e2a167f78e54af8 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 21:51:34 -0700
Subject: [PATCH 31/47] tensor: Enlarge operand buffer for A for better SMEM
 reuse

---
 .../radiance/core/TensorCoreDecoupled.scala   | 159 +++++++++++-------
 1 file changed, 100 insertions(+), 59 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 90cb785..fa3f6e9 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -146,18 +146,6 @@ class TensorCoreDecoupled(
   // Memory traffic generation
   // -------------------------
   //
-  class TensorMemTag extends Bundle {
-    val set = UInt(setBits.W)
-    val step = UInt(stepBits.W)
-    val substep = UInt(1.W)
-  }
-  // use concatenation of set/step as the memory request source.  This will get
-  // translated to the actual TL sourcewidth in sourceGen.
-  val tag = Wire(new TensorMemTag)
-  tag.set := setAccess
-  tag.step := stepAccess
-  tag.substep := substepAccess
-
   val numTilesM = tilingParams.m / tilingParams.mc
   val numTilesN = tilingParams.n / tilingParams.nc
   // @cleanup: generalize in terms of M/N/K-majorness?
@@ -198,12 +186,41 @@ class TensorCoreDecoupled(
   val (addressA, addressB) =
     addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
 
+  // 'index' is the index of a memory request among the sequence of requests
+  // needed to read a full M-column of A or N-row of B.  Its range is [0,m/2)
+  // or [0,n/2), where 2 is the stride can be read in a single request size.
+  require(tilingParams.m == tilingParams.n,
+          "currently only supports square SMEM tile")
+  val numIndices = tilingParams.m / 2
+  val indexBits = log2Ceil(numIndices)
+  val lastIndex = (1 << indexBits) - 1
+
+  class TensorMemTag extends Bundle {
+    val set = UInt(setBits.W)
+    val index = UInt(indexBits.W)
+  }
+
+  val tagInit = Wire(new TensorMemTag)
+  tagInit.set := 0.U
+  tagInit.index := 0.U
+  val tagA = RegInit(tagInit)
+  val tagB = RegInit(tagInit)
+
+  when (io.reqA.fire) {
+    when (tagA.index === lastIndex.U) {
+      tagA.set := tagA.set + 1.U
+    }
+    tagA.index := tagA.index + 1.U
+  }
+  when (io.reqB.fire) {
+    when (tagB.index === lastIndex.U) {
+      tagB.set := tagB.set + 1.U
+    }
+    tagB.index := tagB.index + 1.U
+  }
+
   val genReqA = (state === TensorState.run)
-  val numTilesMBits = log2Ceil(numTilesM)
-  // generate B request at every 4 steps.  B achieves reuse through outer
-  // product so it doesn't require access at every step
-  val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U
-  val genReqB = (state === TensorState.run) && shouldFireB
+  val genReqB = (state === TensorState.run)
 
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
@@ -212,11 +229,11 @@ class TensorCoreDecoupled(
     case ((req, (resp, respTagged)), i) => {
       val sourceGen = Module(new SourceGenerator(
         log2Ceil(numSourceIds),
-        metadata = Some(tag)
+        metadata = Some(new TensorMemTag)
       ))
 
       sourceGen.io.gen := req.fire
-      sourceGen.io.meta := tag
+      sourceGen.io.meta := (if (i == 0) tagA else tagB)
       req.valid := (if (i == 0) genReqA else genReqB)
       req.bits.address := (if (i == 0) addressA else addressB)
       req.bits.source := sourceGen.io.id.bits
@@ -243,7 +260,7 @@ class TensorCoreDecoupled(
   val firedBNow = io.reqB.fire
   val firedA = firedAReg || firedANow
   val firedB = firedBReg || firedBNow
-  val nextSubstepAccess = firedA && (!shouldFireB || firedB)
+  val nextSubstepAccess = firedA && firedB
   val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
   // clear out firedABReg every substep
   when (nextSubstepAccess) {
@@ -252,17 +269,12 @@ class TensorCoreDecoupled(
     substepAccess := substepAccess + 1.U
   }
   require(substepAccess.widthOption.get == 1, "there should be only two substeps")
-  dontTouch(shouldFireB)
 
   // Execute stage
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
   // set and step being currently executed in the acc/ex backend
-  val setExecute = RegInit(0.U(setBits.W))
-  val stepExecute = RegInit(0.U(stepBits.W))
-  dontTouch(setExecute)
-  dontTouch(stepExecute)
 
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
@@ -283,8 +295,10 @@ class TensorCoreDecoupled(
   // ready for compute.  Also send the set/step tag along the pipe for
   // alignment check.
 
+  // @cleanup: dedup A and B below
+
   val fullA = Module(new FillBuffer(
-    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
+    chiselTypeOf(respQueueB.bits.data), numIndices
   ))
   fullA.io.enq.valid := respQueueA.valid
   fullA.io.enq.bits := respQueueA.bits.data
@@ -337,23 +351,48 @@ class TensorCoreDecoupled(
   fullB.io.deq.ready := fullBBuf.io.enq.ready
   fullBTag.io.deq.ready := fullBBuf.io.enq.ready
 
-  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
-  val operandA = fullABuf.io.deq.bits.data
-  val operandATag = fullABuf.io.deq.bits.tag
-  val operandB = fullBBuf.io.deq.bits.data
   val dpuReady = Wire(Bool())
+  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
   val dpuFire = operandsValid && dpuReady
-  val setCompute = fullABuf.io.deq.bits.tag.set
-  val stepCompute = fullABuf.io.deq.bits.tag.step
+
+  val setCompute = RegInit(0.U(setBits.W))
+  val stepCompute = RegInit(0.U(stepBits.W))
   val substepCompute = RegInit(0.U(1.W))
+  val nextStepCompute = dpuFire && (substepCompute === 1.U)
+  dontTouch(setCompute)
+  dontTouch(stepCompute)
+  dontTouch(substepCompute)
   when (dpuFire) {
     substepCompute := substepCompute + 1.U
   }
 
-  // hold full A until two-cycle compute is done
-  fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U)
-  // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
-  // we fully iterated a column (M-dimension).
+  // Operand selection
+  //
+  // select the correct 4x4 tile from A operand buffer
+  val numTilesMBits = log2Ceil(numTilesM)
+  def selectOperandA(buf: Vec[UInt]): UInt = {
+    require(buf.length == numIndices)
+    val stepM = stepCompute & ((1 << numTilesMBits) - 1).U
+    Cat(buf((stepM << 1) + 1.U), buf(stepM << 1))
+  }
+  val operandA = selectOperandA(fullABuf.io.deq.bits.data)
+  val operandATag = fullABuf.io.deq.bits.tag
+  // select the correct 2x4 tile from B operand buffer
+  val operandB = fullBBuf.io.deq.bits.data(substepCompute)
+  val operandBTag = fullBBuf.io.deq.bits.tag
+  dontTouch(operandATag)
+  dontTouch(operandBTag)
+
+  // Operand buffer dequeue logic
+  //
+  // hold A data until the entire set is done
+  val shouldDequeueAMask = ((1 << stepBits) - 1).U
+  val shouldDequeueA =
+    ((stepCompute & shouldDequeueAMask) === shouldDequeueAMask) &&
+    (substepCompute === 1.U)
+  fullABuf.io.deq.ready := dpuFire && shouldDequeueA
+  // hold B tile in fullBBuf for multiple steps for reuse, only dequeue when
+  // we fully iterated a column (M-dimension)
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
   val shouldDequeueB =
     ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
@@ -361,11 +400,9 @@ class TensorCoreDecoupled(
   fullBBuf.io.deq.ready := dpuFire && shouldDequeueB
   dontTouch(respQueueA)
   dontTouch(respQueueB)
+  dontTouch(shouldDequeueA)
   dontTouch(shouldDequeueB)
 
-  // FIXME: this should be nextStepCompute
-  val nextStepExecute = dpuFire && (substepCompute === 1.U)
-
   // Assert that the DPU is computing with operands of the same set/step. Note
   // that the B resp will only have step values multiple of 4 due to reuse.
   //
@@ -374,11 +411,9 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) &&
-             ((fullABuf.io.deq.bits.tag.step & stepMask) ===
-              (fullBBuf.io.deq.bits.tag.step & stepMask)),
-        "A and B operands are pointing to different set/steps. " ++
-        "This might indicate memory response coming back out-of-order.")
+      assert(fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set,
+             "A and B operands are pointing to different sets. " ++
+             "This might indicate memory response coming back out-of-order.")
     }
   }
   assertAligned
@@ -386,23 +421,24 @@ class TensorCoreDecoupled(
   // Dot-product unit
   //
   // 4x2 four-element DPUs summing up to 32 MACs in total
+  //
   val ncSubstep = tilingParams.nc / 2
+  require(tilingParams.mc * ncSubstep == numLanes,
+          "substep tile size doesn't match writeback throughput")
   val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)(
     Module(new TensorDotProductUnit(half = false))
   ))
-  // operandA is 4x4 in K-major
-  val operandADimensional =
-    operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4/*k-dim*/).toSeq
+
+  // reshape operands for easier routing to DPU
+  def reshapeByFourWords(x: UInt): Seq[Seq[UInt]] = {
+    x.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+     .grouped(4/*k-dim*/).toSeq
+  }
+  val operandADimensional = reshapeByFourWords(operandA)
   require(operandADimensional.length == tilingParams.mc &&
           operandADimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
-  // select 2x4 subtile out of operandB that is 4x4 in K-major
-  val operandBDimensional =
-    operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
-    .grouped(4/*k-dim*/).toSeq
-  require(tilingParams.mc * ncSubstep == numLanes,
-          "substep tile size doesn't match writeback throughput")
+  val operandBDimensional = reshapeByFourWords(operandB)
   require(operandBDimensional.length == ncSubstep &&
           operandBDimensional(0).length == tilingParams.kc,
           "operand width doesn't agree with tiling parameter")
@@ -444,12 +480,17 @@ class TensorCoreDecoupled(
   // ----------------
   // These queues hold metadata needed for writeback in sync with the DPU.
 
+  class TensorComputeTag extends Bundle {
+    val set = UInt(setBits.W)
+    val step = UInt(stepBits.W)
+    val substep = UInt(1.W)
+  }
+
   val queueDepth = 5 // needs to be at least the DPU latency
-  val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth))
+  val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth))
   tagQueue.io.enq.valid := dpuFire
-  // A and B should have the same tags
-  tagQueue.io.enq.bits := operandATag
-  // @cleanup: awkward
+  tagQueue.io.enq.bits.set := setCompute
+  tagQueue.io.enq.bits.step := stepCompute
   tagQueue.io.enq.bits.substep := substepCompute
   tagQueue.io.deq.ready := io.writeback.fire
   assert(tagQueue.io.enq.ready === true.B,
@@ -490,7 +531,7 @@ class TensorCoreDecoupled(
     }
   }
   sequenceSetStep(setAccess, stepAccess, nextStepAccess)
-  sequenceSetStep(setExecute, stepExecute, nextStepExecute)
+  sequenceSetStep(setCompute, stepCompute, nextStepCompute)
 
   switch(state) {
     is(TensorState.idle) {

From 0aadc6074ad32fccb13118f8e0915d8b76f2a267 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 22:42:41 -0700
Subject: [PATCH 32/47] tensor: Decouple A and B access states

Get rid of set/stepAccess states and let A and B access progress
independently.
---
 .../radiance/core/TensorCoreDecoupled.scala   | 200 ++++++++----------
 1 file changed, 88 insertions(+), 112 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index fa3f6e9..ed241b5 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -82,15 +82,6 @@ class TensorCoreDecoupled(
   // This drives the overall pipeline of memory requests, dot-product unit
   // operations and regfile writeback.
 
-  object TensorState extends ChiselEnum {
-    val idle = Value(0.U)
-    val run = Value(1.U)
-    // All set/step sequencing is complete and the tensor core is holding the
-    // result data until downstream writeback is ready.
-    // FIXME: is this necessary if writeback is decoupled with queues?
-    val finish = Value(2.U)
-  }
-  val state = RegInit(TensorState.idle)
   val busy = RegInit(false.B)
   // Holds the warp id the core is currently working on.  Note that we only
   // support one outstanding warp request
@@ -107,22 +98,10 @@ class TensorCoreDecoupled(
   def setDone(set: UInt) = (set === lastSet.U)
   def stepDone(step: UInt) = (step === lastStep.U)
 
-  // set and step being currently accessed in the acc/ex frontend
-  val setAccess = RegInit(0.U(setBits.W))
-  val stepAccess = RegInit(0.U(stepBits.W))
-  // we need full 4x4 A tile to fire DPU, but since the memory width is 8
-  // words, we need 2 cycles to read A.  `substep` tells which cycle we're at.
-  val substepAccess = RegInit(0.U(1.W))
-  dontTouch(setAccess)
-  dontTouch(stepAccess)
-  dontTouch(substepAccess)
-
-  when(io.initiate.fire) {
+  when (io.initiate.fire) {
     val wid = io.initiate.bits.wid
     busy := true.B
     warpReg := wid
-    setAccess := 0.U
-    stepAccess := 0.U
     when(io.writeback.fire) {
       assert(
         io.writeback.bits.wid =/= wid,
@@ -143,55 +122,51 @@ class TensorCoreDecoupled(
   // serialize every HGMMA request
   io.initiate.ready := !busy
 
-  // Memory traffic generation
-  // -------------------------
+  // ===========================================================================
+  // Access stage
+  // ===========================================================================
   //
-  val numTilesM = tilingParams.m / tilingParams.mc
-  val numTilesN = tilingParams.n / tilingParams.nc
-  // @cleanup: generalize in terms of M/N/K-majorness?
-  def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt)
-      : (UInt/*A*/, UInt/*B*/) = {
-    // note that step iterates along N first, then M
-    val tileM = step % numTilesM.U
-    val tileN = step / numTilesM.U
+  // Frontend of the decoupled access/execute pipeline.
 
-    // note that both A and B are K-major to facilitate bank conflict-free SMEM
-    // accesses
-    //
-    // (row,col) coordinate of the compute tile
-    val tileRowA = tileM // M
-    val tileColA = set   // K
-    val tileRowB = tileN // N
-    val tileColB = set   // K
-    // (row,col) coordinate of the starting element of the compute tile
-    val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) +
-                    (substep << log2Ceil(tilingParams.mc / 2))
-    val elemColA =  tileColA << log2Ceil(tilingParams.kc)
-    val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) +
-                    (substep << log2Ceil(tilingParams.nc / 2))
-    val elemColB =  tileColB << log2Ceil(tilingParams.kc)
-    val rowStrideA = wordSize * tilingParams.k
-    val rowStrideABits = log2Ceil(rowStrideA)
-    val rowStrideB = wordSize * tilingParams.k
-    val rowStrideBBits = log2Ceil(rowStrideB)
-    val wordStrideBits = log2Ceil(wordSize)
-
-    val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits)
-    val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits)
-
-    (baseA + tileOffsetA, baseB + tileOffsetB)
+  // States
+  //
+  object AccessorState extends ChiselEnum {
+    val idle = Value(0.U)
+    val access = Value(1.U)
+    // All set/step sequencing is complete and the tensor core is holding the
+    // result data until downstream writeback is ready.
+    // FIXME: is this necessary if writeback is decoupled with queues?
+    val finish = Value(2.U)
   }
+  val state = RegInit(AccessorState.idle)
+  val allReqsDone = WireInit(false.B)
+  dontTouch(allReqsDone)
 
-  // FIXME: bogus base address
-  val (addressA, addressB) =
-    addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess)
+  switch(state) {
+    is(AccessorState.idle) {
+      when(io.initiate.fire) {
+        state := AccessorState.access
+      }
+    }
+    is(AccessorState.access) {
+      when (allReqsDone) {
+        state := AccessorState.finish
+      }
+    }
+    is(AccessorState.finish) {
+      // FIXME: decouple writeback
+      when(io.writeback.fire) {
+        state := AccessorState.idle
+      }
+    }
+  }
 
   // 'index' is the index of a memory request among the sequence of requests
   // needed to read a full M-column of A or N-row of B.  Its range is [0,m/2)
   // or [0,n/2), where 2 is the stride can be read in a single request size.
   require(tilingParams.m == tilingParams.n,
           "currently only supports square SMEM tile")
-  val numIndices = tilingParams.m / 2
+  val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/
   val indexBits = log2Ceil(numIndices)
   val lastIndex = (1 << indexBits) - 1
 
@@ -219,9 +194,51 @@ class TensorCoreDecoupled(
     tagB.index := tagB.index + 1.U
   }
 
-  val genReqA = (state === TensorState.run)
-  val genReqB = (state === TensorState.run)
+  // Address generation
+  //
+  def addressGen(base: UInt, set: UInt, index: UInt): UInt = {
+    // note that both A and B are K-major to facilitate bank conflict-free SMEM
+    // accesses, so that below code applies to both.
+    //
+    // (row,col) coordinate of the compute tile
+    val tileRow = index
+    val tileCol = set
+    // (row,col) coordinate of the starting element of the compute tile
+    val elemRow = index << 1
+    val elemCol =  tileCol << log2Ceil(tilingParams.kc)
+    val rowStride = tilingParams.k * wordSize
+    val rowStrideBits = log2Ceil(rowStride)
+    val wordStrideBits = log2Ceil(wordSize)
+    val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits)
 
+    base + tileOffset
+  }
+
+  // FIXME: bogus base address
+  val addressA = addressGen(0.U, tagA.set, tagA.index)
+  val addressB = addressGen(0.U, tagB.set, tagB.index)
+
+  val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U)
+  val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U)
+  val doneReqA = RegInit(false.B)
+  val doneReqB = RegInit(false.B)
+  when (lastReqA && io.reqA.fire) { doneReqA := true.B }
+  when (lastReqB && io.reqB.fire) { doneReqB := true.B }
+  val genReqA = (state === AccessorState.access) && !doneReqA
+  val genReqB = (state === AccessorState.access) && !doneReqA
+  when (state === AccessorState.finish) {
+    doneReqA := false.B
+    doneReqB := false.B
+    tagA.set := 0.U
+    tagA.index := 0.U
+    tagB.set := 0.U
+    tagB.index := 0.U
+  }
+
+  allReqsDone := doneReqA && doneReqB
+
+  // Request generation
+  //
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
@@ -249,34 +266,13 @@ class TensorCoreDecoupled(
     }
   }
 
-  // only advance to the next step if we fired mem requests for both A and B.
-  // also consider that B doesn't have to be fired every time due to reuse.
-  // @perf: too strict? should be able to have A and B progress separately
-  val firedAReg = RegInit(false.B)
-  val firedBReg = RegInit(false.B)
-  when (io.reqA.fire) { firedAReg := true.B }
-  when (io.reqB.fire) { firedBReg := true.B }
-  val firedANow = io.reqA.fire
-  val firedBNow = io.reqB.fire
-  val firedA = firedAReg || firedANow
-  val firedB = firedBReg || firedBNow
-  val nextSubstepAccess = firedA && firedB
-  val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U)
-  // clear out firedABReg every substep
-  when (nextSubstepAccess) {
-    firedAReg := false.B
-    firedBReg := false.B
-    substepAccess := substepAccess + 1.U
-  }
-  require(substepAccess.widthOption.get == 1, "there should be only two substeps")
-
+  // ===========================================================================
   // Execute stage
-  // -------------
+  // ===========================================================================
+  //
   // Backend of the decoupled access/execute pipeline.
   //
-  // set and step being currently executed in the acc/ex backend
-
-  val respQueueDepth = 4 // FIXME: parameterize
+  val respQueueDepth = 8 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
 
@@ -369,6 +365,7 @@ class TensorCoreDecoupled(
   // Operand selection
   //
   // select the correct 4x4 tile from A operand buffer
+  val numTilesM = tilingParams.m / tilingParams.mc
   val numTilesMBits = log2Ceil(numTilesM)
   def selectOperandA(buf: Vec[UInt]): UInt = {
     require(buf.length == numIndices)
@@ -383,7 +380,7 @@ class TensorCoreDecoupled(
   dontTouch(operandATag)
   dontTouch(operandBTag)
 
-  // Operand buffer dequeue logic
+  // Operand buffer logic
   //
   // hold A data until the entire set is done
   val shouldDequeueAMask = ((1 << stepBits) - 1).U
@@ -476,8 +473,8 @@ class TensorCoreDecoupled(
   }
   io.writeback.bits.data := flattenedDPUOut
 
-  // Writeback queues
-  // ----------------
+  // Writeback logic
+  //
   // These queues hold metadata needed for writeback in sync with the DPU.
 
   class TensorComputeTag extends Bundle {
@@ -530,28 +527,7 @@ class TensorCoreDecoupled(
       }
     }
   }
-  sequenceSetStep(setAccess, stepAccess, nextStepAccess)
   sequenceSetStep(setCompute, stepCompute, nextStepCompute)
-
-  switch(state) {
-    is(TensorState.idle) {
-      when(io.initiate.fire) {
-        state := TensorState.run
-      }
-    }
-    is(TensorState.run) {
-      when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) {
-        when (state === TensorState.run) {
-          state := TensorState.finish
-        }
-      }
-    }
-    is(TensorState.finish) {
-      when(io.writeback.fire) {
-        state := TensorState.idle
-      }
-    }
-  }
 }
 
 // A buffer that collects multiple entries of input data and exposes the

From e946403d7863abaec61b4baa92da467617d3fe66 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 22:54:48 -0700
Subject: [PATCH 33/47] tensor: Fix typo, reduce resp queue depth

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index ed241b5..b899ce9 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -225,7 +225,7 @@ class TensorCoreDecoupled(
   when (lastReqA && io.reqA.fire) { doneReqA := true.B }
   when (lastReqB && io.reqB.fire) { doneReqB := true.B }
   val genReqA = (state === AccessorState.access) && !doneReqA
-  val genReqB = (state === AccessorState.access) && !doneReqA
+  val genReqB = (state === AccessorState.access) && !doneReqB
   when (state === AccessorState.finish) {
     doneReqA := false.B
     doneReqB := false.B
@@ -272,7 +272,7 @@ class TensorCoreDecoupled(
   //
   // Backend of the decoupled access/execute pipeline.
   //
-  val respQueueDepth = 8 // FIXME: parameterize
+  val respQueueDepth = 2 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
 

From b3c328b1be7bf924fdddc285fcf5181d8f55c6cf Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 18 Oct 2024 23:11:19 -0700
Subject: [PATCH 34/47] tensor: Assert minimum response queue depth with doc

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index b899ce9..cd3bfa4 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -272,7 +272,13 @@ class TensorCoreDecoupled(
   //
   // Backend of the decoupled access/execute pipeline.
   //
-  val respQueueDepth = 2 // FIXME: parameterize
+  val respQueueDepth = 4 // FIXME: parameterize
+  require(respQueueDepth >= 4,
+    "respQueueDepth must be at least 4.  This is because the B operand buffer " ++
+    "is shallower than A's, so the B response queue has to be deep enough to " ++
+    "hold younger requests until A operand buffer becomes valid and the first DPU " ++
+    "fire can happen.  FIXME: make operand buffer report per-subtile valid so " ++
+    "the first compute can happen earlier.")
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
 
@@ -547,6 +553,7 @@ class FillBuffer[T <: Data](
 
   val data = Reg(Vec(entries, gen))
   val ptr = Counter(entries + 1)
+  dontTouch(ptr.value)
   val full = (ptr.value === entries.U)
   io.enq.ready := !full
   when (io.enq.fire) {

From a98cb32343810994737e60446c7b0c5d975a6f37 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 21 Oct 2024 21:56:36 -0700
Subject: [PATCH 35/47] tensor: Inject stalls to A ram for fuzzing

---
 .../radiance/core/TensorCoreDecoupled.scala   | 26 +++++++++++++++++--
 .../scala/radiance/memory/Coalescing.scala    |  3 ++-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index cd3bfa4..c53ab81 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -216,7 +216,7 @@ class TensorCoreDecoupled(
 
   // FIXME: bogus base address
   val addressA = addressGen(0.U, tagA.set, tagA.index)
-  val addressB = addressGen(0.U, tagB.set, tagB.index)
+  val addressB = addressGen(0x400.U, tagB.set, tagB.index)
 
   val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U)
   val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U)
@@ -672,14 +672,36 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
     beatBytes = 32 // @cleanup: hardcoded
   ))
 
+  val stutter = new TLIdentityNode
   xbar.node :=* tensor.node
-  ramA.node := xbar.node
+  ramA.node := stutter := xbar.node
   ramB.node := xbar.node
 
+  val fuzz = true
+
   lazy val module = new Impl
   class Impl extends LazyModuleImp(this) with UnitTestModule {
     tensor.module.io.start := io.start
     io.finished := tensor.module.io.finished
+
+    val (tlIn, _) = stutter.in(0)
+    val (tlOut, _) = stutter.out(0)
+    require(stutter.in.length == 1)
+    require(stutter.out.length == 1)
+
+    // inject stalls for fuzzing
+    val incr = Wire(Bool())
+    val (count, _) = Counter(incr, 0x1000)
+    def cond(x: UInt) = (x & ((1 << 3) - 1).U) =/= 0.U
+    val stall = if (fuzz) cond(count) else false.B
+
+    tlOut.a <> tlIn.a
+    tlIn.d <> tlOut.d
+    incr := tlIn.a.fire || stall
+    when (stall) {
+      tlIn.a.ready := false.B
+      tlOut.a.valid := false.B
+    }
   }
 }
 
diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala
index cac5e95..a21daee 100644
--- a/src/main/scala/radiance/memory/Coalescing.scala
+++ b/src/main/scala/radiance/memory/Coalescing.scala
@@ -372,7 +372,8 @@ class SourceGenerator[T <: Data](
       outstanding := outstanding + 1.U
     }
   }.elsewhen(io.reclaim.valid) {
-    assert(outstanding > 0.U)
+    assert(outstanding > 0.U,
+           "Over-reclaim. Did some responses get dropped?")
     outstanding := outstanding - 1.U
   }
   dontTouch(outstanding)

From 408888ae8f0f05364412ddba8246d9adf7502f87 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 21 Oct 2024 22:38:29 -0700
Subject: [PATCH 36/47] tensor: addPath()s for hopper generated chisel

FIXME: SourceGenerator has a name-clash.
---
 src/main/scala/radiance/tile/VortexCore.scala | 30 ++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala
index d6561e3..26c6989 100644
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -128,7 +128,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
         "NUM_THREADS" -> tile.numLsuLanes
       )
     )
-    with HasBlackBoxResource {
+    with HasBlackBoxResource with HasBlackBoxPath {
   // addResource("/vsrc/vortex/hw/unit_tests/generic_queue/testbench.v")
   // addResource("/vsrc/vortex/hw/unit_tests/VX_divide_tb.v")
   // addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0_rtl.v")
@@ -398,6 +398,34 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
   addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv")
 //  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh")
+  def addHopperTensorCore = {
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRawFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRecFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/DotProductPipe.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/metadataTable_4x5.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/MulFullRawFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/occupancyTable_4x1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorMemTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue4_TensorMemRespWithTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue5_TensorComputeTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_4x261.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_5x7.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is26_oe8_os24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is47_oe8_os24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundRawFNToRecFN_e8_s24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SimpleTimer.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SourceGenerator.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_2.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorCoreDecoupled.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorDotProductUnit.sv")
+  }
+  addHopperTensorCore
   addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv")
   addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv")

From 0fe2b3b07e5a5210cdb1cb5f92f68596b92ff6fb Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 21 Oct 2024 22:39:28 -0700
Subject: [PATCH 37/47] Bump vortex

---
 src/main/resources/vsrc/vortex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index 4dcbc31..0f06afc 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a
+Subproject commit 0f06afc3ef7350e82c008f5f25395abf89879213

From e705e8557fda3a3af2765cf8477853e5ca078c92 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 14:32:53 -0700
Subject: [PATCH 38/47] Fake tensor core at RadianceTile for Verilog
 unique-ification

---
 src/main/scala/radiance/tile/RadianceTile.scala | 10 ++++++++++
 src/main/scala/radiance/tile/VortexCore.scala   |  3 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala
index 36aef41..18ed1d1 100644
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -379,6 +379,12 @@ class RadianceTile private (
     tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode
   }
 
+  // Instantiate a fake TensorCoreDecoupled module to force unique-ification of
+  // module names in the Chisel-generated Verilog.  This should be disabled for
+  // synthesis runs.
+  val tensor = LazyModule(new radiance.core.TensorCoreDecoupledTL)
+  tlMasterXbar.node :=* tensor.node
+
   /* below are copied from rocket */
 
   val tile_master_blocker =
@@ -839,6 +845,10 @@ class RadianceTileModuleImp(outer: RadianceTile)
   // TODO: generalize for useVxCache
   if (!outer.radianceParams.useVxCache) {}
 
+  // connect io.start and io.finish of the fake TensorCoreDecoupled module to
+  // prevent optimize-out
+  outer.tensor.module.io.start := true.B
+
   // // RoCC
   // if (outer.roccs.size > 0) {
   //   val (respArb, cmdRouter) = {
diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala
index 9ad4be0..a24dc02 100644
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -242,8 +242,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
   // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv")
   // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv")
 
-  addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv")
-
   addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv")
   // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv")
   // addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv")
@@ -408,6 +406,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
   // tensor core
   addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv")
+  addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv")
 //  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh")
   addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv")

From c613341a778e1a31ffbb39cef31b79f95825ff70 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 15:02:55 -0700
Subject: [PATCH 39/47] Disable addPath for old verilog; Deassert valid for
 tensor core

There's an uncaught TL source bug when the core is busy, which doesn't
really need to be fixed with this.
---
 src/main/scala/radiance/tile/RadianceTile.scala | 5 ++---
 src/main/scala/radiance/tile/VortexCore.scala   | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala
index 18ed1d1..0dbc3bd 100644
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -845,9 +845,8 @@ class RadianceTileModuleImp(outer: RadianceTile)
   // TODO: generalize for useVxCache
   if (!outer.radianceParams.useVxCache) {}
 
-  // connect io.start and io.finish of the fake TensorCoreDecoupled module to
-  // prevent optimize-out
-  outer.tensor.module.io.start := true.B
+  // keep io.start of the fake TensorCoreDecoupled module deasserted
+  outer.tensor.module.io.start := false.B
 
   // // RoCC
   // if (outer.roccs.size > 0) {
diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala
index ea0a16c..fccfb88 100644
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -435,7 +435,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
     addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorCoreDecoupled.sv")
     addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorDotProductUnit.sv")
   }
-  addHopperTensorCore
+  // addHopperTensorCore
   addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv")
   addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv")
   addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv")

From 8818fc92034aa4a5bdcdc6380a7e1b00903b46ad Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 16:26:08 -0700
Subject: [PATCH 40/47] tensor: Fix tagWidth for tensor mem io

---
 src/main/scala/radiance/tile/RadianceTile.scala | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala
index 0dbc3bd..2e235cf 100644
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -739,7 +739,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
       }
     }
 
-    def connectTc {
+    def connectTensor = {
       val tcb0 = new {
         val addr = core.io.tc_a_bits_address(31, 0)
         val tag = core.io.tc_a_bits_tag(3, 0)
@@ -758,16 +758,18 @@ class RadianceTileModuleImp(outer: RadianceTile)
         val adapter = Module(
           new VortexTLAdapter(
             outer.smemSourceWidth,
-            new VortexBundleA(tagWidth = 1, dataWidth = 32 * 8),
-            new VortexBundleD(tagWidth = 1, dataWidth = 32 * 8),
+            new VortexBundleA(tagWidth = 4, dataWidth = 32 * 8),
+            new VortexBundleD(tagWidth = 4, dataWidth = 32 * 8),
             client
           )
         )
+        require(adapter.io.inReq.bits.source.widthOption.get == bundle.tag.widthOption.get)
+        require(adapter.io.inReq.bits.address.widthOption.get == bundle.addr.widthOption.get)
         adapter.io.inReq.bits <> DontCare
         adapter.io.inReq.valid := bundle.aValid
         adapter.io.inReq.bits.address := bundle.addr
         adapter.io.inReq.bits.source := bundle.tag
-        adapter.io.inReq.bits.size := 5.U
+        adapter.io.inReq.bits.size := 5.U // 256 bits
         adapter.io.inReq.bits.opcode := TLMessages.Get
         adapter.io.inReq.bits.mask := x"ffffffff".U
         adapter.io.inResp.ready := bundle.dReady
@@ -780,6 +782,8 @@ class RadianceTileModuleImp(outer: RadianceTile)
       core.io.tc_d_valid := Cat(adapters.last.io.inResp.valid, adapters.head.io.inResp.valid)
       core.io.tc_d_bits_data := Cat(adapters.last.io.inResp.bits.data, adapters.head.io.inResp.bits.data)
       core.io.tc_d_bits_tag := Cat(adapters.last.io.inResp.bits.source, adapters.head.io.inResp.bits.source)
+      require(core.io.tc_d_bits_data.widthOption.get == adapters.head.io.inResp.bits.data.widthOption.get * 2)
+      require(core.io.tc_d_bits_tag.widthOption.get == adapters.head.io.inResp.bits.source.widthOption.get * 2)
     }
 
     def connectBarrier = {
@@ -796,7 +800,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
       outer.barrierMasterNode.out(0)._1.resp.ready := true.B
     }
 
-    def connectAccelerator: Unit = {
+    def connectAccelerator = {
       outer.accMasterNode.out.head._1.cmd.bits := core.io.acc_write_out
       outer.accMasterNode.out.head._1.cmd.valid := core.io.acc_write_en
       core.io.acc_read_in := outer.accMasterNode.out.head._1.status
@@ -837,7 +841,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
     connectImem
     connectDmem
     connectSmem
-    connectTc
+    connectTensor
     connectBarrier
     connectAccelerator
   }

From 54ce0f7c34083a1a59a88118f74539d16bc68142 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 17:08:38 -0700
Subject: [PATCH 41/47] tensor: Increase numSourceId to 16 to match
 RadianceTile

---
 src/main/scala/radiance/core/TensorCoreDecoupled.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index c53ab81..040fa4a 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -573,20 +573,20 @@ class FillBuffer[T <: Data](
 // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy
 // graph.
 class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
-  val numSrcIds = 4
+  val numSourceIds = 16
 
   // node with two edges; one for A and one for B matrix
   val node = TLClientNode(Seq(
     TLMasterPortParameters.v2(
       Seq(TLMasterParameters.v2(
         name = "TensorCoreDecoupledMatrixANode",
-        sourceId = IdRange(0, numSrcIds)
+        sourceId = IdRange(0, numSourceIds)
       ))
     ),
     TLMasterPortParameters.v2(
       Seq(TLMasterParameters.v2(
         name = "TensorCoreDecoupledMatrixBNode",
-        sourceId = IdRange(0, numSrcIds)
+        sourceId = IdRange(0, numSourceIds)
       ))
     )
   ))
@@ -599,7 +599,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
   require(outer.node.out.length == 2/*A and B*/)
 
   val tensor = Module(new TensorCoreDecoupled(
-                      8, 8, outer.numSrcIds , TensorTilingParams()))
+                      8, 8, outer.numSourceIds , TensorTilingParams()))
   val wordSize = 4 // @cleanup: hardcoded
 
   val zip = Seq((outer.node.out(0), tensor.io.reqA),

From b566748bcb2b823c380f2101ab65714db15f8dde Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 17:09:21 -0700
Subject: [PATCH 42/47] tensor: Address gen for block-wise contiguous layout

Necessary to meet 32B-alignment requirement for SMEM.
---
 .../radiance/core/TensorCoreDecoupled.scala   | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 040fa4a..ff7f94c 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -200,22 +200,30 @@ class TensorCoreDecoupled(
     // note that both A and B are K-major to facilitate bank conflict-free SMEM
     // accesses, so that below code applies to both.
     //
-    // (row,col) coordinate of the compute tile
-    val tileRow = index
-    val tileCol = set
-    // (row,col) coordinate of the starting element of the compute tile
-    val elemRow = index << 1
-    val elemCol =  tileCol << log2Ceil(tilingParams.kc)
-    val rowStride = tilingParams.k * wordSize
-    val rowStrideBits = log2Ceil(rowStride)
-    val wordStrideBits = log2Ceil(wordSize)
-    val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits)
+    // a "block" is the 4*8 byte-sized contiguous memory that can be read in
+    // one SMEM request.  The A and B matrices are assumed to be stored in
+    // block-wise "index"-major order (M-major for A, N-major for B)
+    val blockRow = set
+    val blockCol = index
+    val blockIndex = (blockRow << indexBits) + blockCol
+    val blockSize = numLanes * wordSize
+    val blockSizeBits = log2Ceil(blockSize)
+    val byteOffset = blockIndex << blockSizeBits
+    base + byteOffset
 
-    base + tileOffset
+    // address generation for byte-wise K-major A and B layout
+    // val elemRow = blockRow << 1
+    // val elemCol =  blockCol << log2Ceil(tilingParams.kc)
+    // val rowStride = tilingParams.k * wordSize
+    // val rowStrideBits = log2Ceil(rowStride)
+    // val wordStrideBits = log2Ceil(wordSize)
+    // val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits)
+    // base + tileOffset
   }
 
   // FIXME: bogus base address
   val addressA = addressGen(0.U, tagA.set, tagA.index)
+  // SMEM 256KB, 8 banks: 0x8000B(32KB) per bank
   val addressB = addressGen(0x400.U, tagB.set, tagB.index)
 
   val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U)

From 85eb5e334ff54624587b1869dac3dbcb0f0d1b94 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 17:47:54 -0700
Subject: [PATCH 43/47] Bump vortex

---
 src/main/resources/vsrc/vortex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index 32ccdee..3abaaff 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit 32ccdeef0154ca9bd747d1a5c2d6d0203e80caf2
+Subproject commit 3abaaff16ffe4deaf7a44043eab6da92e8afe91b

From 0a682fb6eff633a638d89c63ce06b786c552b517 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 17:55:14 -0700
Subject: [PATCH 44/47] tensor: dontTouch TensorDPU io

Prevents bits.c from being optimized out and set to Z in
TensorCoreDecoupled.
---
 src/main/scala/radiance/core/TensorDPU.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala
index 515b1bf..db98b36 100644
--- a/src/main/scala/radiance/core/TensorDPU.scala
+++ b/src/main/scala/radiance/core/TensorDPU.scala
@@ -33,6 +33,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
       val data = Bits((outFLen).W)
     })
   })
+  dontTouch(io)
 
   // [IEEE] -> recode() -> unbox() -> [Hardfloat] -> box() -> ieee() -> [IEEE]
   // make sure recoding/uncoding happens only at the edge, not at every

From 072904a82be7b6754a347d5a055a45b1ea477ac7 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 22:06:24 -0700
Subject: [PATCH 45/47] Bump vortex

---
 src/main/resources/vsrc/vortex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index 3abaaff..78df981 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit 3abaaff16ffe4deaf7a44043eab6da92e8afe91b
+Subproject commit 78df981366778e394e4db62bfdc14c916ddc9f62

From 95ecc5180fae4a462fc4280b82ff83d6e4f9c65e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 22:44:33 -0700
Subject: [PATCH 46/47] tensor: Decouple warp in execute from access

This allows the access stage to accept new initiate back-to-back without
waiting for the previous writeback to finish.
---
 .../radiance/core/TensorCoreDecoupled.scala   | 73 ++++++++++++-------
 1 file changed, 47 insertions(+), 26 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index ff7f94c..ae763c6 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -85,7 +85,7 @@ class TensorCoreDecoupled(
   val busy = RegInit(false.B)
   // Holds the warp id the core is currently working on.  Note that we only
   // support one outstanding warp request
-  val warpReg = RegInit(0.U(numWarpBits.W))
+  val warpAccess = RegInit(0.U(numWarpBits.W))
 
   // sets: k iteration
   val numSets = (tilingParams.k / tilingParams.kc)
@@ -101,7 +101,7 @@ class TensorCoreDecoupled(
   when (io.initiate.fire) {
     val wid = io.initiate.bits.wid
     busy := true.B
-    warpReg := wid
+    warpAccess := wid
     when(io.writeback.fire) {
       assert(
         io.writeback.bits.wid =/= wid,
@@ -170,28 +170,35 @@ class TensorCoreDecoupled(
   val indexBits = log2Ceil(numIndices)
   val lastIndex = (1 << indexBits) - 1
 
+  class State extends Bundle {
+    val set = UInt(setBits.W)
+    val index = UInt(indexBits.W)
+  }
   class TensorMemTag extends Bundle {
+    val warp = UInt(numWarpBits.W)
     val set = UInt(setBits.W)
     val index = UInt(indexBits.W)
   }
 
-  val tagInit = Wire(new TensorMemTag)
-  tagInit.set := 0.U
-  tagInit.index := 0.U
-  val tagA = RegInit(tagInit)
-  val tagB = RegInit(tagInit)
+  val stateInit = Wire(new State)
+  stateInit.set := 0.U
+  stateInit.index := 0.U
+  val stateA = RegInit(stateInit)
+  val stateB = RegInit(stateInit)
+  dontTouch(stateA)
+  dontTouch(stateB)
 
   when (io.reqA.fire) {
-    when (tagA.index === lastIndex.U) {
-      tagA.set := tagA.set + 1.U
+    when (stateA.index === lastIndex.U) {
+      stateA.set := stateA.set + 1.U
     }
-    tagA.index := tagA.index + 1.U
+    stateA.index := stateA.index + 1.U
   }
   when (io.reqB.fire) {
-    when (tagB.index === lastIndex.U) {
-      tagB.set := tagB.set + 1.U
+    when (stateB.index === lastIndex.U) {
+      stateB.set := stateB.set + 1.U
     }
-    tagB.index := tagB.index + 1.U
+    stateB.index := stateB.index + 1.U
   }
 
   // Address generation
@@ -222,12 +229,12 @@ class TensorCoreDecoupled(
   }
 
   // FIXME: bogus base address
-  val addressA = addressGen(0.U, tagA.set, tagA.index)
+  val addressA = addressGen(0.U, stateA.set, stateA.index)
   // SMEM 256KB, 8 banks: 0x8000B(32KB) per bank
-  val addressB = addressGen(0x400.U, tagB.set, tagB.index)
+  val addressB = addressGen(0x8000.U, stateB.set, stateB.index)
 
-  val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U)
-  val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U)
+  val lastReqA = (stateA.set === lastSet.U) && (stateA.index === lastIndex.U)
+  val lastReqB = (stateB.set === lastSet.U) && (stateB.index === lastIndex.U)
   val doneReqA = RegInit(false.B)
   val doneReqB = RegInit(false.B)
   when (lastReqA && io.reqA.fire) { doneReqA := true.B }
@@ -237,16 +244,25 @@ class TensorCoreDecoupled(
   when (state === AccessorState.finish) {
     doneReqA := false.B
     doneReqB := false.B
-    tagA.set := 0.U
-    tagA.index := 0.U
-    tagB.set := 0.U
-    tagB.index := 0.U
+    stateA.set := 0.U
+    stateA.index := 0.U
+    stateB.set := 0.U
+    stateB.index := 0.U
   }
 
   allReqsDone := doneReqA && doneReqB
 
   // Request generation
   //
+  val tagA = Wire(new TensorMemTag)
+  tagA.warp := warpAccess
+  tagA.set := stateA.set
+  tagA.index := stateA.index
+  val tagB = Wire(new TensorMemTag)
+  tagB.warp := warpAccess
+  tagB.set := stateB.set
+  tagB.index := stateB.index
+
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
@@ -422,9 +438,12 @@ class TensorCoreDecoupled(
   def assertAligned = {
     val stepMask = (1 << numTilesMBits).U
     when (dpuFire) {
-      assert(fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set,
-             "A and B operands are pointing to different sets. " ++
+      assert(operandATag.warp === operandBTag.warp &&
+             operandATag.set  === operandBTag.set,
+             "A and B operands are pointing to different warps and sets. " ++
              "This might indicate memory response coming back out-of-order.")
+      assert(operandATag.set === setCompute,
+             "Operand arrived from memory is pointing at a different set than the FSM.")
     }
   }
   assertAligned
@@ -492,6 +511,7 @@ class TensorCoreDecoupled(
   // These queues hold metadata needed for writeback in sync with the DPU.
 
   class TensorComputeTag extends Bundle {
+    val warp = UInt(numWarpBits.W)
     val set = UInt(setBits.W)
     val step = UInt(stepBits.W)
     val substep = UInt(1.W)
@@ -500,6 +520,7 @@ class TensorCoreDecoupled(
   val queueDepth = 5 // needs to be at least the DPU latency
   val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth))
   tagQueue.io.enq.valid := dpuFire
+  tagQueue.io.enq.bits.warp := operandATag.warp
   tagQueue.io.enq.bits.set := setCompute
   tagQueue.io.enq.bits.step := stepCompute
   tagQueue.io.enq.bits.substep := substepCompute
@@ -518,12 +539,12 @@ class TensorCoreDecoupled(
     (step << 1/*2 substeps*/) + substep
   }
 
+  val warpWriteback = tagQueue.io.deq.bits.warp
   val setWriteback = tagQueue.io.deq.bits.set
   val stepWriteback = tagQueue.io.deq.bits.step
   val substepWriteback = tagQueue.io.deq.bits.substep
   io.writeback.valid := dpuValid
-  // TODO: decouple wid from frontend
-  io.writeback.bits.wid := warpReg
+  io.writeback.bits.wid := warpWriteback
   io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback)
   io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) &&
                             (substepWriteback === 1.U)
@@ -685,7 +706,7 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
   ramA.node := stutter := xbar.node
   ramB.node := xbar.node
 
-  val fuzz = true
+  val fuzz = false
 
   lazy val module = new Impl
   class Impl extends LazyModuleImp(this) with UnitTestModule {

From 2a8c488d282ebc118bf1476c597dd6e8640d100a Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 23:10:11 -0700
Subject: [PATCH 47/47] tensor: Reassert initiate.ready as soon as access ready

---
 .../radiance/core/TensorCoreDecoupled.scala   | 116 +++++++-----------
 1 file changed, 44 insertions(+), 72 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index ae763c6..c42dc29 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -69,6 +69,11 @@ class TensorCoreDecoupled(
     val source = UInt(sourceWidth.W)
     val data = UInt(dataWidth.W)
   }
+  class TensorMemTag extends Bundle {
+    val warp = UInt(numWarpBits.W)
+    val set = UInt(setBits.W)
+    val index = UInt(indexBits.W)
+  }
   // mem response after translation from TL source to set/step tag
   class TensorMemRespWithTag(
     dataWidth: Int
@@ -77,15 +82,11 @@ class TensorCoreDecoupled(
     val data = UInt(dataWidth.W)
   }
 
-  // FSM
-  // ---
-  // This drives the overall pipeline of memory requests, dot-product unit
-  // operations and regfile writeback.
-
-  val busy = RegInit(false.B)
-  // Holds the warp id the core is currently working on.  Note that we only
-  // support one outstanding warp request
-  val warpAccess = RegInit(0.U(numWarpBits.W))
+  // ===========================================================================
+  // Access stage
+  // ===========================================================================
+  //
+  // Frontend of the decoupled access/execute pipeline.
 
   // sets: k iteration
   val numSets = (tilingParams.k / tilingParams.kc)
@@ -97,39 +98,15 @@ class TensorCoreDecoupled(
   val lastStep = ((1 << stepBits) - 1)
   def setDone(set: UInt) = (set === lastSet.U)
   def stepDone(step: UInt) = (step === lastStep.U)
+  // 'index' is the index of a memory request among the sequence of requests
+  // needed to read a full M-column of A or N-row of B.  Its range is [0,m/2)
+  // or [0,n/2), where 2 is the stride that can be read in a single request.
+  require(tilingParams.m == tilingParams.n,
+          "currently only supports square SMEM tile")
+  val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/
+  val indexBits = log2Ceil(numIndices)
+  val lastIndex = (1 << indexBits) - 1
 
-  when (io.initiate.fire) {
-    val wid = io.initiate.bits.wid
-    busy := true.B
-    warpAccess := wid
-    when(io.writeback.fire) {
-      assert(
-        io.writeback.bits.wid =/= wid,
-        "unsupported concurrent initiate and writeback to the same warp"
-      )
-    }
-  }
-
-  // TODO: @perf: Instead of waiting until the last writeback, release busy as
-  // soon as the access frontend is complete so that there's a better chance to
-  // saturate the backend with back-to-back HGMMAs.  This would require sending
-  // the 'wid' register to backend instead of having it shared with the
-  // frontend.
-  when(io.writeback.fire && io.writeback.bits.last) {
-    busy := false.B
-  }
-
-  // serialize every HGMMA request
-  io.initiate.ready := !busy
-
-  // ===========================================================================
-  // Access stage
-  // ===========================================================================
-  //
-  // Frontend of the decoupled access/execute pipeline.
-
-  // States
-  //
   object AccessorState extends ChiselEnum {
     val idle = Value(0.U)
     val access = Value(1.U)
@@ -142,6 +119,30 @@ class TensorCoreDecoupled(
   val allReqsDone = WireInit(false.B)
   dontTouch(allReqsDone)
 
+  val warpAccess = RegInit(0.U(numWarpBits.W))
+
+  class BlockState extends Bundle {
+    val set = UInt(setBits.W)
+    val index = UInt(indexBits.W)
+  }
+  val stateInit = Wire(new BlockState)
+  stateInit.set := 0.U
+  stateInit.index := 0.U
+  val stateA = RegInit(stateInit)
+  val stateB = RegInit(stateInit)
+  dontTouch(stateA)
+  dontTouch(stateA.index)
+  dontTouch(stateB)
+  dontTouch(stateB.index)
+
+  io.initiate.ready := (state === AccessorState.idle)
+  when (io.initiate.fire) {
+    warpAccess := io.initiate.bits.wid
+    assert(stateA.set === 0.U && stateA.index === 0.U &&
+           stateB.set === 0.U && stateB.index === 0.U,
+           "stateA and stateB not initialized to zero")
+  }
+
   switch(state) {
     is(AccessorState.idle) {
       when(io.initiate.fire) {
@@ -154,40 +155,11 @@ class TensorCoreDecoupled(
       }
     }
     is(AccessorState.finish) {
-      // FIXME: decouple writeback
-      when(io.writeback.fire) {
-        state := AccessorState.idle
-      }
+      // FIXME: is finish state needed?
+      state := AccessorState.idle
     }
   }
 
-  // 'index' is the index of a memory request among the sequence of requests
-  // needed to read a full M-column of A or N-row of B.  Its range is [0,m/2)
-  // or [0,n/2), where 2 is the stride can be read in a single request size.
-  require(tilingParams.m == tilingParams.n,
-          "currently only supports square SMEM tile")
-  val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/
-  val indexBits = log2Ceil(numIndices)
-  val lastIndex = (1 << indexBits) - 1
-
-  class State extends Bundle {
-    val set = UInt(setBits.W)
-    val index = UInt(indexBits.W)
-  }
-  class TensorMemTag extends Bundle {
-    val warp = UInt(numWarpBits.W)
-    val set = UInt(setBits.W)
-    val index = UInt(indexBits.W)
-  }
-
-  val stateInit = Wire(new State)
-  stateInit.set := 0.U
-  stateInit.index := 0.U
-  val stateA = RegInit(stateInit)
-  val stateB = RegInit(stateInit)
-  dontTouch(stateA)
-  dontTouch(stateB)
-
   when (io.reqA.fire) {
     when (stateA.index === lastIndex.U) {
       stateA.set := stateA.set + 1.U