From 12b3b6768776e1b4fc4442929cd9f510c15b5ef6 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 29 Mar 2023 14:02:41 -0700
Subject: [PATCH] Store multiple oldSrcId reqs per lane in a table row

The number of the per-lane reqs is controlled by `numPerLaneReqs`
rather than being set to 2 ** sourceWidth to allow some flexibility.
---
 src/main/scala/tilelink/Coalescing.scala | 113 ++++++++++++++---------
 src/test/scala/CoalescingUnitTest.scala  |  87 ++++++++++-------
 2 files changed, 120 insertions(+), 80 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index cc19ebc..a6f489e 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -62,22 +62,27 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
       new ShiftQueue(reqQueueEntryT, 4 /* FIXME hardcoded */ )
     )
   }
+
+  // The maximum number of requests from a single lane that can go into a
+  // coalesced request.  Upper bound is 2**sourceWidth.
+  val numPerLaneReqs = 2
+
   val respQueueEntryT = new RespQueueEntry(sourceWidth, wordSize * 8)
   val respQueues = Seq.tabulate(numLanes) { _ =>
-    // Module(
-    //   new ShiftQueue(respQueueEntryT, 8 /* FIXME depth hardcoded */ )
-    // )
     Module(
       new MultiPortQueue(
         respQueueEntryT,
         // enq_lanes = 1 + M, where 1 is the response for the original per-lane
-        // requests that didn't get coalesced, and M is the number of coalescer
-        // nodes.
-        2,
+        // requests that didn't get coalesced, and M is the maximum number of
+        // single-lane requests that can go into a coalesced request.
+        // (`numPerLaneReqs`).
+        1 + numPerLaneReqs,
         // deq_lanes = 1 because we're serializing all responses to 1 port that
         // goes back to the core.
         1,
-        2,
+        // lanes. Has to be at least max(enq_lanes, deq_lanes)
+        1 + numPerLaneReqs,
+        // Depth of each lane queue.
         // XXX queue depth is set to an arbitrarily high value that doesn't
         // make queue block up in the middle of the simulation.  Ideally there
         // should be a more logical way to set this, or we should handle
@@ -210,25 +215,32 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
   // detail outside to the coalescer
   val offsetBits = 4 // FIXME hardcoded
   val sizeBits = 2 // FIXME hardcoded
-  val newEntry = Wire(new InflightCoalReqTableEntry(numLanes, sourceWidth, offsetBits, sizeBits))
+  val newEntry = Wire(
+    new InflightCoalReqTableEntry(numLanes, numPerLaneReqs, sourceWidth, offsetBits, sizeBits)
+  )
   newEntry.source := coalSourceId
   newEntry.lanes.foreach { l =>
-    l.valid := false.B
     l.reqs.foreach { r =>
       // TODO: this part needs the actual coalescing logic to work
-      r.valid := true.B
+      r.valid := false.B
       r.offset := 1.U
       r.size := 2.U
     }
   }
-  newEntry.lanes(0).valid := true.B
-  newEntry.lanes(2).valid := true.B
+  newEntry.lanes(0).reqs(0).valid := true.B
+  newEntry.lanes(2).reqs(0).valid := true.B
   dontTouch(newEntry)
 
   // Uncoalescer module sncoalesces responses back to each lane
   val coalDataWidth = tlCoal.params.dataBits
   val uncoalescer = Module(
-    new UncoalescingUnit(numLanes, sourceWidth, coalDataWidth, outer.numInflightCoalRequests)
+    new UncoalescingUnit(
+      numLanes,
+      numPerLaneReqs,
+      sourceWidth,
+      coalDataWidth,
+      outer.numInflightCoalRequests
+    )
   )
 
   uncoalescer.io.coalReqValid := coalReqValid
@@ -238,14 +250,16 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
   uncoalescer.io.coalRespData := tlCoal.d.bits.data
 
   // Queue up uncoalesced responses into each lane's response queue
-  (respQueues zip uncoalescer.io.uncoalResps).foreach { case (q, resp) =>
-    assert(
-      q.io.enq(respQueueCoalPortOffset).ready,
-      s"respQueue: enq port for 0-th coalesced response is blocked"
-    )
-    q.io.enq(respQueueCoalPortOffset).valid := resp.valid
-    q.io.enq(respQueueCoalPortOffset).bits := resp.bits
-  // dontTouch(q.io.enq(respQueueCoalPortOffset))
+  (respQueues zip uncoalescer.io.uncoalResps).foreach { case (q, lanes) =>
+    lanes.zipWithIndex.foreach { case (resp, i) =>
+      assert(
+        q.io.enq(respQueueCoalPortOffset + i).ready,
+        s"respQueue: enq port for 0-th coalesced response is blocked"
+      )
+      q.io.enq(respQueueCoalPortOffset + i).valid := resp.valid
+      q.io.enq(respQueueCoalPortOffset + i).bits := resp.bits
+      // dontTouch(q.io.enq(respQueueCoalPortOffset))
+    }
   }
 
   // Debug
@@ -260,12 +274,13 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
 
 class UncoalescingUnit(
     val numLanes: Int,
+    val numPerLaneReqs: Int,
     val sourceWidth: Int,
     val coalDataWidth: Int,
     val numInflightCoalRequests: Int
 ) extends Module {
   val inflightTable = Module(
-    new InflightCoalReqTable(numLanes, sourceWidth, numInflightCoalRequests)
+    new InflightCoalReqTable(numLanes, numPerLaneReqs, sourceWidth, numInflightCoalRequests)
   )
   val wordSize = 4 // FIXME duplicate
 
@@ -275,7 +290,9 @@ class UncoalescingUnit(
     val coalRespValid = Input(Bool())
     val coalRespSrcId = Input(UInt(sourceWidth.W))
     val coalRespData = Input(UInt(coalDataWidth.W))
-    val uncoalResps = Output(Vec(numLanes, ValidIO(new RespQueueEntry(sourceWidth, wordSize * 8))))
+    val uncoalResps = Output(
+      Vec(numLanes, Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, wordSize * 8))))
+    )
   })
 
   // Populate inflight table
@@ -311,25 +328,26 @@ class UncoalescingUnit(
 
   // Un-coalesce responses back to individual lanes
   val found = inflightTable.io.lookup.bits
-  found.lanes.zipWithIndex.foreach { case (l, i) =>
-    // FIXME: only looking at 0th srcId entry
+  (found.lanes zip io.uncoalResps).foreach { case (lane, ioLane) =>
+    lane.reqs.zipWithIndex.foreach { case (req, i) =>
+      val ioReq = ioLane(i)
 
-    val uncoalResp = io.uncoalResps(i)
-    uncoalResp.valid := false.B
-    uncoalResp.bits := DontCare
+      // FIXME: only looking at 0th srcId entry
 
-    when(inflightTable.io.lookup.valid) {
-      uncoalResp.valid := l.valid
-      uncoalResp.bits.source := 0.U
+      ioReq.valid := false.B
+      ioReq.bits := DontCare
 
-      // FIXME: disregard size enum for now
-      val byteSize = 4
-      uncoalResp.bits.data :=
-        getCoalescedDataChunk(io.coalRespData, coalDataWidth, l.reqs(0).offset, byteSize)
-    }
+      when(inflightTable.io.lookup.valid) {
+        ioReq.valid := req.valid
+        ioReq.bits.source := 0.U
 
-    when(l.valid) {
-      when(l.reqs(0).valid) {
+        // FIXME: disregard size enum for now
+        val byteSize = 4
+        ioReq.bits.data :=
+          getCoalescedDataChunk(io.coalRespData, coalDataWidth, req.offset, byteSize)
+      }
+
+      when(req.valid) {
         printf(s"lane ${i} req 0 is valid!\n")
       }
     }
@@ -341,12 +359,16 @@ class UncoalescingUnit(
 // from, what their original TileLink sourceId were, etc.  We use this info to
 // split the coalesced response back to individual per-lane responses with the
 // right metadata.
-class InflightCoalReqTable(val numLanes: Int, val sourceWidth: Int, val entries: Int)
-    extends Module {
+class InflightCoalReqTable(
+    val numLanes: Int,
+    val numPerLaneReqs: Int,
+    val sourceWidth: Int,
+    val entries: Int
+) extends Module {
   val offsetBits = 4 // FIXME hardcoded
   val sizeBits = 2 // FIXME hardcoded
   val entryT =
-    new InflightCoalReqTableEntry(numLanes, sourceWidth, offsetBits, sizeBits)
+    new InflightCoalReqTableEntry(numLanes, numPerLaneReqs, sourceWidth, offsetBits, sizeBits)
 
   val io = IO(new Bundle {
     val enq = Flipped(Decoupled(entryT))
@@ -362,6 +384,7 @@ class InflightCoalReqTable(val numLanes: Int, val sourceWidth: Int, val entries:
       val valid = Bool()
       val bits = new InflightCoalReqTableEntry(
         numLanes,
+        numPerLaneReqs,
         sourceWidth,
         offsetBits,
         sizeBits
@@ -373,7 +396,6 @@ class InflightCoalReqTable(val numLanes: Int, val sourceWidth: Int, val entries:
     (0 until entries).foreach { i =>
       table(i).valid := false.B
       table(i).bits.lanes.foreach { l =>
-        l.valid := false.B
         l.reqs.foreach { r =>
           r.offset := 0.U
           r.size := 0.U
@@ -422,6 +444,8 @@ class InflightCoalReqTable(val numLanes: Int, val sourceWidth: Int, val entries:
 
 class InflightCoalReqTableEntry(
     val numLanes: Int,
+    // Maximum number of requests from a single lane that can get coalesced into a single request
+    val numPerLaneReqs: Int,
     val sourceWidth: Int,
     val offsetBits: Int,
     val sizeBits: Int
@@ -432,9 +456,8 @@ class InflightCoalReqTableEntry(
     val size = UInt(sizeBits.W)
   }
   class PerLane extends Bundle {
-    val valid = Bool()
-    // srcId is positionally encoded
-    val reqs = Vec(1 << sourceWidth, new CoreReq)
+    // FIXME: if numPerLaneReqs != 2 ** sourceWidth, we need to store srcId as well
+    val reqs = Vec(numPerLaneReqs, new CoreReq)
   }
   // sourceId of the coalesced response that just came back.  This will be the
   // key that queries the table.
diff --git a/src/test/scala/CoalescingUnitTest.scala b/src/test/scala/CoalescingUnitTest.scala
index f4fe253..b65110a 100644
--- a/src/test/scala/CoalescingUnitTest.scala
+++ b/src/test/scala/CoalescingUnitTest.scala
@@ -32,58 +32,75 @@ class MultiPortQueueUnitTest extends AnyFlatSpec with ChiselScalatestTester {
 class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   behavior of "uncoalescer"
   val numLanes = 4
+  val numPerLaneReqs = 2
   val sourceWidth = 2
   // 16B coalescing size
   val coalDataWidth = 128
   val numInflightCoalRequests = 4
 
   it should "work" in {
-    test(new UncoalescingUnit(numLanes, sourceWidth, coalDataWidth, numInflightCoalRequests))
-      // vcs helps with simulation time, but sometimes errors with
-      // "mutation occurred during iteration" java error
-      // .withAnnotations(Seq(VcsBackendAnnotation))
-      { c =>
-        val sourceId = 0.U
-        c.io.coalReqValid.poke(true.B)
-        c.io.newEntry.source.poke(sourceId)
-        c.io.newEntry.lanes.foreach { l => l.valid.poke(false.B) }
-        c.io.newEntry.lanes(0).valid.poke(true.B)
-        c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-        c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-        c.io.newEntry.lanes(0).reqs(0).size.poke(2.U)
-        c.io.newEntry.lanes(2).valid.poke(true.B)
-        c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-        c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U)
-        c.io.newEntry.lanes(2).reqs(0).size.poke(1.U)
+    test(
+      new UncoalescingUnit(
+        numLanes,
+        numPerLaneReqs,
+        sourceWidth,
+        coalDataWidth,
+        numInflightCoalRequests
+      )
+    )
+    // vcs helps with simulation time, but sometimes errors with
+    // "mutation occurred during iteration" java error
+    // .withAnnotations(Seq(VcsBackendAnnotation))
+    { c =>
+      val sourceId = 0.U
+      c.io.coalReqValid.poke(true.B)
+      c.io.newEntry.source.poke(sourceId)
+      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
+      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
+      c.io.newEntry.lanes(0).reqs(0).size.poke(2.U)
+      c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B)
+      c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U)
+      c.io.newEntry.lanes(0).reqs(1).size.poke(2.U)
+      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
+      c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U)
+      c.io.newEntry.lanes(2).reqs(0).size.poke(1.U)
+      c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B)
+      c.io.newEntry.lanes(2).reqs(1).offset.poke(0.U)
+      c.io.newEntry.lanes(2).reqs(1).size.poke(2.U)
 
-        c.clock.step()
+      c.clock.step()
 
-        c.io.coalReqValid.poke(false.B)
+      c.io.coalReqValid.poke(false.B)
 
-        c.clock.step()
+      c.clock.step()
 
-        c.io.coalRespValid.poke(true.B)
-        c.io.coalRespSrcId.poke(sourceId)
-        val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
-        c.io.coalRespData.poke(lit.U)
+      c.io.coalRespValid.poke(true.B)
+      c.io.coalRespSrcId.poke(sourceId)
+      val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
+      c.io.coalRespData.poke(lit.U)
 
-        // table lookup is combinational at the same cycle
-        c.io.uncoalResps(0).valid.expect(true.B)
-        c.io.uncoalResps(1).valid.expect(false.B)
-        c.io.uncoalResps(2).valid.expect(true.B)
-        c.io.uncoalResps(3).valid.expect(false.B)
+      // table lookup is combinational at the same cycle
+      c.io.uncoalResps(0)(0).valid.expect(true.B)
+      c.io.uncoalResps(1)(0).valid.expect(false.B)
+      c.io.uncoalResps(2)(0).valid.expect(true.B)
+      c.io.uncoalResps(3)(0).valid.expect(false.B)
 
-        c.io.uncoalResps(0).bits.data.expect(0x89abcdefL.U)
-        c.io.uncoalResps(0).bits.source.expect(0.U)
-        c.io.uncoalResps(2).bits.data.expect(0x5ca1ab1eL.U)
-        c.io.uncoalResps(2).bits.source.expect(0.U)
-      }
+      c.io.uncoalResps(0)(0).bits.data.expect(0x89abcdefL.U)
+      c.io.uncoalResps(0)(0).bits.source.expect(0.U)
+      c.io.uncoalResps(0)(1).bits.data.expect(0x89abcdefL.U)
+      c.io.uncoalResps(0)(1).bits.source.expect(0.U)
+      c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(2)(0).bits.source.expect(0.U)
+      c.io.uncoalResps(2)(1).bits.data.expect(0x01234567L.U)
+      c.io.uncoalResps(2)(1).bits.source.expect(0.U)
+    }
   }
 }
 
 class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   behavior of "inflight coalesced request table"
   val numLanes = 4
+  val numPerLaneReqs = 2
   val sourceWidth = 2
   val entries = 4
 
@@ -91,7 +108,7 @@ class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   val sizeBits = 2
 
   val inflightCoalReqTableEntry =
-    new InflightCoalReqTableEntry(numLanes, sourceWidth, offsetBits, sizeBits)
+    new InflightCoalReqTableEntry(numLanes, numPerLaneReqs, sourceWidth, offsetBits, sizeBits)
 
   // it should "stop enqueueing when full" in {
   //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries)) { c =>