From 630d76461c3ac32f229c501c3a92a797119fee3e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 16 Oct 2023 01:12:33 -0700
Subject: [PATCH] Do proper TL sourceId allocation for Vortex dmem requests

This fixes sourceId collision that occurs when naively re-using tag bit
of a Vortex dmem request as TL source, which happens because Vortex core
does not allocate a new LSU entry for writes.

`VortexSourceGen` module acts as a Vortax tag <-> new TL source ID
converter, where it allocates a new ID for every new Vortex request, and
restores its original tag bits from the metadata embedded in the
SourceGenerator module.

TODO:
- Decouple sourceWidth of downstream TL nodes from Vortex's tag bit
  width; they are set to be the same for convenience as of now
- Apply this to imem requests as well
---
 src/main/scala/tile/VortexTile.scala | 116 ++++++++++++++++++++-------
 1 file changed, 88 insertions(+), 28 deletions(-)

diff --git a/src/main/scala/tile/VortexTile.scala b/src/main/scala/tile/VortexTile.scala
index 6c900fd..3b81e77 100644
--- a/src/main/scala/tile/VortexTile.scala
+++ b/src/main/scala/tile/VortexTile.scala
@@ -4,7 +4,7 @@
 package tile
 
 import chisel3._
-import chisel3.util.RRArbiter
+import chisel3.util._
 import org.chipsalliance.cde.config._
 import freechips.rocketchip.devices.tilelink._
 import freechips.rocketchip.diplomacy._
@@ -90,6 +90,8 @@ class VortexTile private (
     beatBytes = lazyCoreParamsView.coreDataBytes,
     minLatency = 1)))*/
 
+  val numLanes = 4 // FIXME: hardcoded
+
   val imemNodes = Seq.tabulate(1) { i =>
     TLClientNode(
       Seq(
@@ -109,7 +111,7 @@ class VortexTile private (
     )
   }
 
-  val dmemNodes = Seq.tabulate(4) { i =>
+  val dmemNodes = Seq.tabulate(numLanes) { i =>
     TLClientNode(
       Seq(
         TLMasterPortParameters.v1(
@@ -289,38 +291,57 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
   } else {
     (core.io.imem.get zip outer.imemNodes).foreach { case (coreMem, tileNode) =>
       coreMem.d <> tileNode.out.head._1.d
-      coreMem.a <> tileNode.out.head._1.a
+      tileNode.out.head._1.a <> coreMem.a
     }
 
-    // pick source id and:
+    // Since the individual per-lane TL requests might come back out-of-sync between
+    // the lanes, but Vortex core expects the lane requests to be synced,
+    // we need to selectively fire responses that have the same source, and
+    // delay others.  Below is the logic that implements this.
+
+    // choose one source out of the arriving per-lane TL D channels
+    val arb = Module(
+      new RRArbiter(core.io.dmem.get.head.d.bits.source.cloneType, outer.numLanes)
+    )
+    val dmemTLBundles = outer.dmemNodes.map(_.out.head._1)
+    arb.io.out.ready := true.B
+    (arb.io.in zip dmemTLBundles).foreach { case (arbIn, tlBundle) =>
+      arbIn.valid := tlBundle.d.valid
+      arbIn.bits := tlBundle.d.bits.source
+    }
+    val matchingSources = Wire(UInt(outer.numLanes.W))
+    matchingSources := dmemTLBundles
+      .map(b => (b.d.bits.source === arb.io.out.bits) && arb.io.out.valid)
+      .asUInt
+
+    // connection: VortexBundle <--> sourceGen <--> dmemNodes
+    val sourceGens = Seq.tabulate(outer.numLanes) { _ =>
+      Module(new VortexSourceGen(
+        2, // FIXME: hardcoded
+        dmemTLBundles.head.a.bits,
+        dmemTLBundles.head.d.bits,
+      ))
+    }
+    (core.io.dmem.get zip sourceGens) foreach { case (coreMem, sourceGen) =>
+      sourceGen.io.inReq <> coreMem.a
+      coreMem.d <> sourceGen.io.inResp
+    }
+    (sourceGens zip dmemTLBundles) foreach { case (sourceGen, tlBundle) =>
+      tlBundle.a <> sourceGen.io.outReq
+    }
+    // using the chosen source id,
     // - lie to core that response is not valid if source doesn't match picked
     // - lie to downstream that core is not ready if source doesn't match picked
-
-    val arb = Module(
-      new RRArbiter(core.io.dmem.get.head.d.bits.source.cloneType, 4)
-    )
-    val matchingSources = Wire(UInt(4.W))
-    val dmemDs = outer.dmemNodes.map(_.out.head._1.d)
-
-    (arb.io.in zip dmemDs).zipWithIndex.foreach { case ((arbIn, tileNode), i) =>
-      arbIn.valid := tileNode.valid
-      arbIn.bits := tileNode.bits.source
-    }
-    matchingSources := dmemDs
-      .map(d => (d.bits.source === arb.io.out.bits) && arb.io.out.valid)
-      .asUInt
-    arb.io.out.ready := true.B
-
-    (core.io.dmem.get zip dmemDs).zipWithIndex.foreach {
-      case ((coreMem, tileNode), i) =>
-        coreMem.d.bits := tileNode.bits
-        coreMem.d.valid := tileNode.valid && matchingSources(i)
-        tileNode.ready := coreMem.d.ready && matchingSources(i)
+    (sourceGens zip dmemTLBundles).zipWithIndex.foreach {
+      case ((sourceGen, tlBundle), i) =>
+        sourceGen.io.outResp.bits := tlBundle.d.bits
+        sourceGen.io.outResp.valid := tlBundle.d.valid && matchingSources(i)
+        tlBundle.d.ready := sourceGen.io.outResp.ready && matchingSources(i)
     }
 
-    (core.io.dmem.get zip outer.dmemNodes).foreach { case (coreMem, tileNode) =>
-      coreMem.a <> tileNode.out.head._1.a
-    }
+    // (core.io.dmem.get zip outer.dmemNodes).foreach { case (coreMem, tileNode) =>
+    //   tileNode.out.head._1.a <> coreMem.a
+    // }
   }
 
   // core.io.fpu := DontCare
@@ -335,6 +356,45 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
   // dcacheArb.io.requestor <> dcachePorts.toSeq
 }
 
+// TODO: Currently in/out are assumed to be the same TL bundle with the same
+// sourceWidth; this needs to be more flexible.
+//
+// Some @copypaste from CoalescerSourceGen.
+class VortexSourceGen(
+  newSourceWidth: Int,
+  reqT: TLBundleA,
+  respT: TLBundleD
+) extends Module {
+  val io = IO(new Bundle {
+    // in/out means upstream/downstream
+    val inReq = Flipped(Decoupled(reqT.cloneType))
+    val outReq = Decoupled(reqT.cloneType)
+    val inResp = Decoupled(respT.cloneType)
+    val outResp = Flipped(Decoupled(respT.cloneType))
+  })
+  val sourceGen = Module(new SourceGenerator(
+    newSourceWidth,
+    Some(chiselTypeOf(reqT.source)),
+    ignoreInUse = false
+  ))
+  sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created
+  sourceGen.io.reclaim.valid := io.outResp.fire
+  sourceGen.io.reclaim.bits := io.outResp.bits.source
+  sourceGen.io.meta := io.inReq.bits.source
+
+  // passthrough logic
+  io.outReq <> io.inReq
+  // "man-in-the-middle"
+  io.inReq.ready := io.outReq.ready && sourceGen.io.id.valid
+  io.outReq.valid := io.inReq.valid && sourceGen.io.id.valid
+  // FIXME: Fill is a hack; just change downstream to the right sourceWidth
+  // io.outReq.bits.source := Fill(newSourceWidth, sourceGen.io.id.bits)
+  io.outReq.bits.source := sourceGen.io.id.bits
+  io.inResp <> io.outResp
+  // translate upstream response back to its old sourceId
+  io.inResp.bits.source := sourceGen.io.peek
+}
+
 // FIXME: unsure this is necessary
 trait HasFpuOpt { this: RocketTileModuleImp =>
   val fpuOpt =