From 5fed3ef823d0a8cd04005fd0d098ba313760ed8a Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 15:42:23 -0700
Subject: [PATCH 01/10] Generalize Req/RespQueueEntry into Request/Response
 bundles

---
 src/main/scala/tilelink/Coalescing.scala | 76 ++++++++++--------------
 1 file changed, 32 insertions(+), 44 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index d63e9cf..7835210 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -135,7 +135,10 @@ class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends La
   lazy val module = new CoalescingUnitImp(this, config)
 }
 
-class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int) extends Bundle {
+// Protocol-agnostic bundles that represent a request and a response to the
+// coalescer.
+
+class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int) extends Bundle {
   require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
   val op = UInt(1.W) // 0=READ 1=WRITE
   val address = UInt(addressWidth.W)
@@ -163,7 +166,7 @@ class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWid
   }
 }
 
-class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
+class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
   val op = UInt(1.W) // 0=READ 1=WRITE
   val size = UInt(sizeWidth.W)
   val source = UInt(sourceWidth.W)
@@ -192,6 +195,15 @@ class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends B
   }
 }
 
+class NonCoalescedResponse(config: CoalescerConfig)
+extends Response(sourceWidth = log2Ceil(config.numOldSrcIds),
+  sizeWidth = config.wordSizeWidth,
+  dataWidth = config.wordSizeInBytes * 8)
+class CoalescedResponse(config: CoalescerConfig)
+extends Response(sourceWidth = log2Ceil(config.numNewSrcIds),
+  sizeWidth = log2Ceil(config.maxCoalLogSize),
+  dataWidth = (8 * (1 << config.maxCoalLogSize)))
+
 // If `ignoreInUse`, just keep giving out new IDs without checking if it is in
 // use.
 class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
@@ -340,7 +352,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 }
 
 // Software model: coalescer.py
-class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
+class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[Request],
                     config: CoalescerConfig) extends Module {
   val io = IO(new Bundle {
     val window = Input(windowT.io.cloneType)
@@ -376,7 +388,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
 
   val size = coalLogSize
   val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
-  def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = {
+  def canMatch(req0: Request, req0v: Bool, req1: Request, req1v: Bool): Bool = {
     (req0.op === req1.op) &&
     (req0v && req1v) &&
     ((req0.address & this.addrMask) === (req1.address & this.addrMask))
@@ -471,7 +483,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
 // coalesced request out of all possible combinations.
 //
 // Software model: coalescer.py
-class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry,
+class MultiCoalescer(windowT: CoalShiftQueue[Request], coalReqT: Request,
                      config: CoalescerConfig) extends Module {
   val io = IO(new Bundle {
     // coalescing window, connected to the contents of the request queues
@@ -612,11 +624,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
 
   val sourceWidth = outer.cpuNode.in.head._1.params.sourceBits
   // note we are using word size. assuming all coalescer inputs are word sized
-  val reqQueueEntryT = new ReqQueueEntry(sourceWidth, config.wordSizeWidth,
+  val reqQueueEntryT = new Request(sourceWidth, config.wordSizeWidth,
     config.addressWidth, (config.wordSizeInBytes * 8))
   val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
 
-  val coalReqT = new ReqQueueEntry(log2Ceil(config.numNewSrcIds), log2Ceil(config.maxCoalLogSize),
+  val coalReqT = new Request(log2Ceil(config.numNewSrcIds), log2Ceil(config.maxCoalLogSize),
     config.addressWidth, (1 << config.maxCoalLogSize) * 8)
   val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
   coalescer.io.window := reqQueues.io
@@ -703,7 +715,8 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   // coalesced request.  Upper bound is min(DEPTH, 2**sourceWidth).
   val numPerLaneReqs = config.queueDepth
 
-  val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize),
+  // FIXME: no need to contain maxCoalLogSize data
+  val respQueueEntryT = new Response(sourceWidth, log2Ceil(config.maxCoalLogSize),
     (1 << config.maxCoalLogSize) * 8)
   val respQueues = Seq.tabulate(config.numLanes) { _ =>
     Module(
@@ -821,8 +834,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   uncoalescer.io.newEntry := newEntry
   // Cleanup: custom <>?
   uncoalescer.io.coalResp.valid := tlCoal.d.valid
-  uncoalescer.io.coalResp.bits.source := tlCoal.d.bits.source
-  uncoalescer.io.coalResp.bits.data := tlCoal.d.bits.data
+  uncoalescer.io.coalResp.bits.fromTLD(tlCoal.d.bits)
   tlCoal.d.ready := uncoalescer.io.coalResp.ready
 
   // Connect uncoalescer results back into each lane's response queue
@@ -853,24 +865,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   dontTouch(tlCoal.d)
 }
 
-// Protocol-agnostic bundle that represents a coalesced response.
-//
-// Having this makes it easier to:
-//   * do unit tests -- no need to deal with TileLink in the chiseltest code
-//   * adapt coalescer to custom protocols like a custom L1 cache interface.
-//
-// FIXME: overlaps with RespQueueEntry. Trait-ify
-class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle {
-  val source = UInt(log2Ceil(config.numNewSrcIds).W)
-  val data = UInt((8 * (1 << config.maxCoalLogSize)).W)
-
-  def fromTLD(bundle:TLBundleD): Unit = {
-    this.source := bundle.source
-    this.data   := bundle.data
-  }
-
-}
-
 class Uncoalescer(config: CoalescerConfig) extends Module {
   // notes to hansung:
   //  val numLanes: Int, <-> config.NUM_LANES
@@ -884,15 +878,14 @@ class Uncoalescer(config: CoalescerConfig) extends Module {
     val coalReqValid = Input(Bool())
     // FIXME: receive ReqQueueEntry and construct newEntry inside uncoalescer
     val newEntry = Input(inflightTable.entryT.cloneType)
-    val coalResp = Flipped(Decoupled(new CoalescedResponseBundle(config)))
+    val coalResp = Flipped(Decoupled(new CoalescedResponse(config)))
     val uncoalResps = Output(
       Vec(
         config.numLanes,
         Vec(
           config.queueDepth,
           ValidIO(
-            new RespQueueEntry(log2Ceil(config.numOldSrcIds), config.wordSizeWidth,
-              config.wordSizeInBytes * 8)
+            new NonCoalescedResponse(config)
           )
         )
       )
@@ -1853,25 +1846,20 @@ class CoalescerXbar(config: CoalescerConfig) (implicit p: Parameters) extends La
     val node = TLIdentityNode()
     node :=* outputXbar.node
 
-    val nonCoalEntryT = new ReqQueueEntry(
+    val nonCoalEntryT = new Request(
                                 log2Ceil(config.numOldSrcIds),
                                 config.wordSizeWidth,
                                 config.addressWidth,
                                 config.wordSizeInBytes * 8
                               )
-    val coalEntryT    = new ReqQueueEntry(
+    val coalEntryT    = new Request(
                                 log2Ceil(config.numOldSrcIds),
                                 log2Ceil(config.maxCoalLogSize),
                                 config.addressWidth,
                                 (1 << config.maxCoalLogSize) * 8
                               )
-    val respNonCoalEntryT = new RespQueueEntry(
-                                log2Ceil(config.numOldSrcIds),
-                                config.wordSizeWidth,
-                                config.wordSizeInBytes * 8
-                              )
-
-    val respCoalBundleT   = new CoalescedResponseBundle(config)
+    val respNonCoalEntryT = new NonCoalescedResponse(config)
+    val respCoalBundleT   = new CoalescedResponse(config)
     
 
     lazy val module = new CoalescerXbarImpl(
@@ -1883,10 +1871,10 @@ class CoalescerXbar(config: CoalescerConfig) (implicit p: Parameters) extends La
 
 class CoalescerXbarImpl(outer: CoalescerXbar, 
                       config: CoalescerConfig,
-                      nonCoalEntryT: ReqQueueEntry, 
-                      coalEntryT: ReqQueueEntry,
-                      respNonCoalEntryT: RespQueueEntry, 
-                      respCoalBundleT: CoalescedResponseBundle
+                      nonCoalEntryT: Request, 
+                      coalEntryT: Request,
+                      respNonCoalEntryT: Response, 
+                      respCoalBundleT: CoalescedResponse
       ) extends LazyModuleImp(outer){
 
 

From 406f90b6332dde2351f95386692410134a072064 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 15:50:58 -0700
Subject: [PATCH 02/10] De-duplicate equivalent Request bundles using
 NonCoal/Coal variants

---
 src/main/scala/tilelink/Coalescing.scala      | 40 +++++++++----------
 .../scala/coalescing/CoalescingUnitTest.scala |  6 +--
 2 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 7835210..0345c8b 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -165,6 +165,16 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
     bits
   }
 }
+class NonCoalescedRequest(config: CoalescerConfig)
+extends Request(sourceWidth = log2Ceil(config.numOldSrcIds),
+  sizeWidth = config.wordSizeWidth,
+  addressWidth = config.addressWidth,
+  dataWidth = config.wordSizeInBytes * 8)
+class CoalescedRequest(config: CoalescerConfig)
+extends Request(sourceWidth = log2Ceil(config.numNewSrcIds),
+  sizeWidth = log2Ceil(config.maxCoalLogSize),
+  addressWidth = config.addressWidth,
+  dataWidth = (8 * (1 << config.maxCoalLogSize)))
 
 class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
   val op = UInt(1.W) // 0=READ 1=WRITE
@@ -194,7 +204,6 @@ class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle
     this.error := bundle.denied
   }
 }
-
 class NonCoalescedResponse(config: CoalescerConfig)
 extends Response(sourceWidth = log2Ceil(config.numOldSrcIds),
   sizeWidth = config.wordSizeWidth,
@@ -352,7 +361,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 }
 
 // Software model: coalescer.py
-class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[Request],
+class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedRequest],
                     config: CoalescerConfig) extends Module {
   val io = IO(new Bundle {
     val window = Input(windowT.io.cloneType)
@@ -483,7 +492,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[Request],
 // coalesced request out of all possible combinations.
 //
 // Software model: coalescer.py
-class MultiCoalescer(windowT: CoalShiftQueue[Request], coalReqT: Request,
+class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Request,
                      config: CoalescerConfig) extends Module {
   val io = IO(new Bundle {
     // coalescing window, connected to the contents of the request queues
@@ -622,14 +631,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
     s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
     s"mismatch with config.addressWidth (${config.addressWidth})")
 
-  val sourceWidth = outer.cpuNode.in.head._1.params.sourceBits
+  val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
   // note we are using word size. assuming all coalescer inputs are word sized
-  val reqQueueEntryT = new Request(sourceWidth, config.wordSizeWidth,
-    config.addressWidth, (config.wordSizeInBytes * 8))
+  val reqQueueEntryT = new NonCoalescedRequest(config)
   val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
 
-  val coalReqT = new Request(log2Ceil(config.numNewSrcIds), log2Ceil(config.maxCoalLogSize),
-    config.addressWidth, (1 << config.maxCoalLogSize) * 8)
+  val coalReqT = new CoalescedRequest(config)
   val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
   coalescer.io.window := reqQueues.io
   reqQueues.io.coalescable := coalescer.io.coalescable
@@ -716,7 +723,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   val numPerLaneReqs = config.queueDepth
 
   // FIXME: no need to contain maxCoalLogSize data
-  val respQueueEntryT = new Response(sourceWidth, log2Ceil(config.maxCoalLogSize),
+  val respQueueEntryT = new Response(oldSourceWidth, log2Ceil(config.maxCoalLogSize),
     (1 << config.maxCoalLogSize) * 8)
   val respQueues = Seq.tabulate(config.numLanes) { _ =>
     Module(
@@ -1846,21 +1853,10 @@ class CoalescerXbar(config: CoalescerConfig) (implicit p: Parameters) extends La
     val node = TLIdentityNode()
     node :=* outputXbar.node
 
-    val nonCoalEntryT = new Request(
-                                log2Ceil(config.numOldSrcIds),
-                                config.wordSizeWidth,
-                                config.addressWidth,
-                                config.wordSizeInBytes * 8
-                              )
-    val coalEntryT    = new Request(
-                                log2Ceil(config.numOldSrcIds),
-                                log2Ceil(config.maxCoalLogSize),
-                                config.addressWidth,
-                                (1 << config.maxCoalLogSize) * 8
-                              )
+    val nonCoalEntryT = new NonCoalescedRequest(config)
+    val coalEntryT    = new CoalescedRequest(config)
     val respNonCoalEntryT = new NonCoalescedResponse(config)
     val respCoalBundleT   = new CoalescedResponse(config)
-    
 
     lazy val module = new CoalescerXbarImpl(
       this, config, nonCoalEntryT, coalEntryT, respNonCoalEntryT, respCoalBundleT)
diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala
index 1a3ceee..29ea8dc 100644
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -127,12 +127,12 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI
   )
 
   val reqQueueEnqReady =  peekIn(0).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType))
-  val reqQueueEnqBits =   peekIn(1).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(x.cloneType))
+  val reqQueueEnqBits =   peekIn(1).asInstanceOf[Seq[Request]].map(x => IO(x.cloneType))
   val reqQueueEnqValid =  peekIn(2).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType))
-  val reqQueueDeqBits =   peekIn(3).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(Output(x.cloneType)))
+  val reqQueueDeqBits =   peekIn(3).asInstanceOf[Seq[Request]].map(x => IO(Output(x.cloneType)))
   val reqQueueDeqValid =  peekIn(4).asInstanceOf[Seq[Bool]].map(x => IO(Output(x.cloneType)))
   val coalReqReady =      IO(Output(peekIn(5).asInstanceOf[Bool].cloneType))
-  val coalReqBits =       IO(Output(peekIn(6).asInstanceOf[ReqQueueEntry].cloneType))
+  val coalReqBits =       IO(Output(peekIn(6).asInstanceOf[Request].cloneType))
   val coalReqValid =      IO(Output(peekIn(7).asInstanceOf[Bool].cloneType))
   val coalInvalidate =    IO(Output(peekIn(8).asInstanceOf[Valid[Vec[UInt]]].cloneType))
 

From 7fa6be4a8bc12ada8ee03fe7a4fe025c8709f860 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 15:56:30 -0700
Subject: [PATCH 03/10] Use case class for noncoal/coal bundles

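Convert the NonCoalesced/Coalesced Request and Response bundle subclasses
from plain classes to case classes. This is a stylistic change; the effect
of case-class semantics on these bundles has not been looked into.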
---
 src/main/scala/tilelink/Coalescing.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 0345c8b..373337a 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -165,12 +165,12 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
     bits
   }
 }
-class NonCoalescedRequest(config: CoalescerConfig)
+case class NonCoalescedRequest(config: CoalescerConfig)
 extends Request(sourceWidth = log2Ceil(config.numOldSrcIds),
   sizeWidth = config.wordSizeWidth,
   addressWidth = config.addressWidth,
   dataWidth = config.wordSizeInBytes * 8)
-class CoalescedRequest(config: CoalescerConfig)
+case class CoalescedRequest(config: CoalescerConfig)
 extends Request(sourceWidth = log2Ceil(config.numNewSrcIds),
   sizeWidth = log2Ceil(config.maxCoalLogSize),
   addressWidth = config.addressWidth,
@@ -204,11 +204,11 @@ class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle
     this.error := bundle.denied
   }
 }
-class NonCoalescedResponse(config: CoalescerConfig)
+case class NonCoalescedResponse(config: CoalescerConfig)
 extends Response(sourceWidth = log2Ceil(config.numOldSrcIds),
   sizeWidth = config.wordSizeWidth,
   dataWidth = config.wordSizeInBytes * 8)
-class CoalescedResponse(config: CoalescerConfig)
+case class CoalescedResponse(config: CoalescerConfig)
 extends Response(sourceWidth = log2Ceil(config.numNewSrcIds),
   sizeWidth = log2Ceil(config.maxCoalLogSize),
   dataWidth = (8 * (1 << config.maxCoalLogSize)))

From 0c8909cb43d3d48a9a2f59736e943831e914982b Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 16:11:39 -0700
Subject: [PATCH 04/10] scalafmt

---
 src/main/scala/tilelink/Coalescing.scala | 480 ++++++++++++++---------
 1 file changed, 302 insertions(+), 178 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 373337a..0e72dae 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -152,7 +152,7 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
       fromSource = this.source,
       toAddress = this.address,
       lgSize = this.size,
-      data = this.data,
+      data = this.data
     )
     val (glegal, gbits) = edgeOut.Get(
       fromSource = this.source,
@@ -166,17 +166,22 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
   }
 }
 case class NonCoalescedRequest(config: CoalescerConfig)
-extends Request(sourceWidth = log2Ceil(config.numOldSrcIds),
-  sizeWidth = config.wordSizeWidth,
-  addressWidth = config.addressWidth,
-  dataWidth = config.wordSizeInBytes * 8)
+    extends Request(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      addressWidth = config.addressWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
 case class CoalescedRequest(config: CoalescerConfig)
-extends Request(sourceWidth = log2Ceil(config.numNewSrcIds),
-  sizeWidth = log2Ceil(config.maxCoalLogSize),
-  addressWidth = config.addressWidth,
-  dataWidth = (8 * (1 << config.maxCoalLogSize)))
+    extends Request(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      addressWidth = config.addressWidth,
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )
 
-class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
+class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int)
+    extends Bundle {
   val op = UInt(1.W) // 0=READ 1=WRITE
   val size = UInt(sizeWidth.W)
   val source = UInt(sourceWidth.W)
@@ -205,17 +210,22 @@ class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle
   }
 }
 case class NonCoalescedResponse(config: CoalescerConfig)
-extends Response(sourceWidth = log2Ceil(config.numOldSrcIds),
-  sizeWidth = config.wordSizeWidth,
-  dataWidth = config.wordSizeInBytes * 8)
+    extends Response(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
 case class CoalescedResponse(config: CoalescerConfig)
-extends Response(sourceWidth = log2Ceil(config.numNewSrcIds),
-  sizeWidth = log2Ceil(config.maxCoalLogSize),
-  dataWidth = (8 * (1 << config.maxCoalLogSize)))
+    extends Response(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )
 
 // If `ignoreInUse`, just keep giving out new IDs without checking if it is in
 // use.
-class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
+class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
+    extends Module {
   val io = IO(new Bundle {
     val gen = Input(Bool())
     val reclaim = Input(Valid(UInt(sourceWidth.W)))
@@ -234,15 +244,16 @@ class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) e
 
   io.id.valid := (if (ignoreInUse) true.B else !occupancyTable(head).valid)
   io.id.bits := head
-  when (io.gen && io.id.valid /* fire */) {
+  when(io.gen && io.id.valid /* fire */ ) {
     occupancyTable(io.id.bits).valid := true.B // mark in use
   }
-  when (io.reclaim.valid) {
+  when(io.reclaim.valid) {
     occupancyTable(io.reclaim.bits).valid := false.B // mark freed
   }
 }
 
-class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module {
+class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
+    extends Module {
   val io = IO(new Bundle {
     val queue = new Bundle {
       val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
@@ -259,7 +270,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 //  eltPrototype.valid := false.B
 
   val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
-  val writePtr = RegInit(VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W))))
+  val writePtr = RegInit(
+    VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
+  )
   val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))
 
   private def resetElts = {
@@ -270,7 +283,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
       }
     }
   }
-  when (reset.asBool) {
+  when(reset.asBool) {
     resetElts
   }
 
@@ -286,14 +299,17 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
   // current cycle.
   //
   // shift hint is when the heads have no more coalescable left this or next cycle
-  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
-    c && !(io.invalidate.valid && inv)
-  }.reduce(_ || _)
+  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0)))
+    .map { case (c, inv) =>
+      c && !(io.invalidate.valid && inv)
+    }
+    .reduce(_ || _)
   val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
   // valid && !fire means we enable enqueueing to a full queue, provided the
   // arbiter is taking away all remaining valid queue heads in the next cycle so
   // that we make space for the entire next warp.
-  val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
+  val syncedDeqValidNextCycle =
+    io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
 
   for (i <- 0 until config.numLanes) {
     val enq = io.queue.enq(i)
@@ -313,20 +329,22 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
     // can take new entries if not empty, or if full but shifting
     enq.ready := (!ctrl.full) || ctrl.shift
 
-    when (ctrl.shift) {
+    when(ctrl.shift) {
       // shift, invalidate tail, invalidate coalesced requests
       elts(i).zipWithIndex.foreach { case (elt, j) =>
         if (j == entries - 1) { // tail
           elt.valid := false.B
         } else {
           elt.bits := elts(i)(j + 1).bits
-          elt.valid := elts(i)(j + 1).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
+          elt.valid := elts(i)(
+            j + 1
+          ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
         }
       }
       // reset dequeue mask when new entries are shifted in
       deqDone(i) := false.B
       // enqueue
-      when (enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
+      when(enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
         elts(i)(writePtr(i) - 1.U).bits := enq.bits
         elts(i)(writePtr(i) - 1.U).valid := enq.valid
       }.otherwise {
@@ -334,13 +352,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
       }
     }.otherwise {
       // invalidate coalesced requests
-      when (io.invalidate.valid) {
+      when(io.invalidate.valid) {
         (elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) =>
           elt.valid := elt.valid && !inv
         }
       }
       // enqueue
-      when (enq.ready && syncedEnqValid) {
+      when(enq.ready && syncedEnqValid) {
         elts(i)(writePtr(i)).bits := enq.bits
         elts(i)(writePtr(i)).valid := enq.valid
         writePtr(i) := writePtr(i) + 1.U
@@ -352,8 +370,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
   // When doing spatial-only coalescing, queues should never drift from each
   // other, i.e. the queue heads should always contain mem requests from the
   // same instruction.
-  val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
-    writePtr.map(_ === writePtr.head).reduce(_ && _)
+  val queueInSync =
+    controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
+      writePtr.map(_ === writePtr.head).reduce(_ && _)
   assert(queueInSync, "shift queue lanes are not in sync")
 
   io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt)
@@ -361,8 +380,11 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 }
 
 // Software model: coalescer.py
-class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedRequest],
-                    config: CoalescerConfig) extends Module {
+class MonoCoalescer(
+    coalLogSize: Int,
+    windowT: CoalShiftQueue[NonCoalescedRequest],
+    config: CoalescerConfig
+) extends Module {
   val io = IO(new Bundle {
     val window = Input(windowT.io.cloneType)
     val results = Output(new Bundle {
@@ -371,8 +393,10 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
       val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
       // number of entries matched with this leader lane's head.
       // maximum is numLanes * queueDepth
-      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
-      val coverageHits = Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
+      val matchCount =
+        Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
+      val coverageHits =
+        Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
       val canCoalesce = Output(Vec(config.numLanes, Bool()))
     })
   })
@@ -386,9 +410,13 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
   val leadersValid = io.window.mask.map(_.asBools.head)
 
   def printQueueHeads = {
-    leaders.zipWithIndex.foreach{ case (head, i) =>
-      printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
-        leadersValid(i), head.source, head.address)
+    leaders.zipWithIndex.foreach { case (head, i) =>
+      printf(
+        s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
+        leadersValid(i),
+        head.source,
+        head.address
+      )
     }
   }
   // when (leadersValid.reduce(_ || _)) {
@@ -406,34 +434,42 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
   // Gives a 2-D table of Bools representing match at every queue entry,
   // for each lane (so 3-D in total).
   // dimensions: (leader lane, follower lane, follower entry)
-  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
-    (io.window.elts zip io.window.mask).map { case (followers, followerValids) =>
-      // compare leader's head against follower's every queue entry
-      (followers zip followerValids.asBools).map { case (follower, followerValid) =>
-        canMatch(follower, followerValid, leader, leaderValid)
-        // FIXME: disabling halving optimization because it does not give the
-        // correct per-lane coalescable indication to the shift queue
-          // // match leader to only followers at lanes >= leader idx
-          // // this halves the number of comparators
-          // if (followerIndex < leaderIndex) false.B
-          // else canMatch(follower, followerValid, leader, leaderValid)
+  val matchTablePerLane = (leaders zip leadersValid).map {
+    case (leader, leaderValid) =>
+      (io.window.elts zip io.window.mask).map {
+        case (followers, followerValids) =>
+          // compare leader's head against follower's every queue entry
+          (followers zip followerValids.asBools).map {
+            case (follower, followerValid) =>
+              canMatch(follower, followerValid, leader, leaderValid)
+            // FIXME: disabling halving optimization because it does not give the
+            // correct per-lane coalescable indication to the shift queue
+            // // match leader to only followers at lanes >= leader idx
+            // // this halves the number of comparators
+            // if (followerIndex < leaderIndex) false.B
+            // else canMatch(follower, followerValid, leader, leaderValid)
+          }
       }
-    }
   }
 
   val matchCounts = matchTablePerLane.map(table =>
-      table.map(PopCount(_)) // sum up each column
-           .reduce(_ +& _))
+    table
+      .map(PopCount(_)) // sum up each column
+      .reduce(_ +& _)
+  )
   val canCoalesce = matchCounts.map(_ > 1.U)
 
   // Elect the leader that has the most match counts.
   // TODO: potentially expensive: magnitude comparator
   def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
-    matchCounts.zipWithIndex.map {
-      case (c, i) => (c, i.U)
-    }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
+    matchCounts.zipWithIndex
+      .map { case (c, i) =>
+        (c, i.U)
+      }
+      .reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
         (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
-    }._2
+      }
+      ._2
   }
   // Elect leader by choosing the smallest-index lane that has a valid
   // match, i.e. using priority encoder.
@@ -444,7 +480,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
 
   val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
   // matchTable for the chosen lane, but converted to a Vec[UInt]
-  val chosenMatches = VecInit(matchTablePerLane.map{ table =>
+  val chosenMatches = VecInit(matchTablePerLane.map { table =>
     VecInit(table.map(VecInit(_).asUInt))
   })(chosenLeaderIdx)
   val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)
@@ -452,18 +488,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
   // coverage calculation
   def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth)
   // 2-D table flattened to 1-D
-  val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
+  val offsets =
+    io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
   val valids = chosenMatches.flatMap(_.asBools)
   // indicates for each word in the coalesced chunk whether it is accessed by
   // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
   // words in the coalesced data coming back will be accessed by some request
   // and we've reached 100% bandwidth utilization.
   val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { target =>
-    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+    (offsets zip valids)
+      .map { case (offset, valid) => valid && (offset === target.U) }
+      .reduce(_ || _)
   }
 
   // debug prints
-  when (leadersValid.reduce(_ || _)) {
+  when(leadersValid.reduce(_ || _)) {
     matchCounts.zipWithIndex.foreach { case (count, i) =>
       printf(s"lane[${i}] matchCount = %d\n", count);
     }
@@ -492,20 +531,26 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
 // coalesced request out of all possible combinations.
 //
 // Software model: coalescer.py
-class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Request,
-                     config: CoalescerConfig) extends Module {
+class MultiCoalescer(
+    windowT: CoalShiftQueue[NonCoalescedRequest],
+    coalReqT: Request,
+    config: CoalescerConfig
+) extends Module {
   val io = IO(new Bundle {
     // coalescing window, connected to the contents of the request queues
     val window = Input(windowT.io.cloneType)
     // generated coalesced request
     val coalReq = DecoupledIO(coalReqT.cloneType)
     // invalidate signals going into each request queue's head
-    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
+    val invalidate =
+      Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
     // whether a lane is coalescable
     val coalescable = Output(Vec(config.numLanes, Bool()))
   })
 
-  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  val coalescers = config.coalLogSizes.map(size =>
+    Module(new MonoCoalescer(size, windowT, config))
+  )
   coalescers.foreach(_.io.window := io.window)
 
   def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
@@ -530,9 +575,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
   val chosenValid = Wire(Bool())
   // minimum 25% coverage
-  val minCoverage = 1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
+  val minCoverage =
+    1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
 
-  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
+  when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
     chosenSizeIdx := argMax(normalizedHits)
     chosenValid := true.B
     printf("coalescing success by coverage policy\n")
@@ -562,9 +608,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)
 
   // check for word alignment in addresses
-  assert(io.window.elts.flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)).zip(
-    io.window.mask.flatMap(_.asBools)).map { case (aligned, valid) => (!valid) || aligned }.reduce(_ || _),
-    "one or more addresses used for coalescing is not word-aligned")
+  assert(
+    io.window.elts
+      .flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U))
+      .zip(io.window.mask.flatMap(_.asBools))
+      .map { case (aligned, valid) => (!valid) || aligned }
+      .reduce(_ || _),
+    "one or more addresses used for coalescing is not word-aligned"
+  )
 
   // note: this is word-level coalescing. if finer granularity is needed, need to modify code
   val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt
@@ -579,18 +630,29 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
     val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
       // note: ANDing against addrMask is to conform to active byte lanes requirements
       // if aligning to LSB suffices, we should add the bitwise AND back
-      m && ((req.address(config.maxCoalLogSize - 1, config.wordSizeWidth)/* & addrMask*/) === i.U)
+      m && ((req.address(
+        config.maxCoalLogSize - 1,
+        config.wordSizeWidth
+      ) /* & addrMask*/ ) === i.U)
     }
     // TODO: SW uses priority encoder, not sure about behavior of MuxCase
-    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.data
-    })
-    mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.mask
-    })
+    data(i) := MuxCase(
+      DontCare,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.data
+      }
+    )
+    mask(i) := MuxCase(
+      0.U,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.mask
+      }
+    )
   }
 
-  val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds)))
+  val sourceGen = Module(
+    new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds))
+  )
   sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created
   sourceGen.io.reclaim.valid := false.B // not used
   sourceGen.io.reclaim.bits := DontCare // not used
@@ -608,7 +670,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   io.invalidate.bits := chosenBundle.matchOH
   io.invalidate.valid := io.coalReq.fire // invalidate only when fire
 
-  io.coalescable := coalescers.map(_.io.results.canCoalesce.asUInt).reduce(_ | _).asBools
+  io.coalescable := coalescers
+    .map(_.io.results.canCoalesce.asUInt)
+    .reduce(_ | _)
+    .asBools
 
   dontTouch(io.invalidate) // debug
 
@@ -620,21 +685,30 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   if (!config.enable) disable
 }
 
-class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) {
-  require(outer.cpuNode.in.length == config.numLanes,
+class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
+    extends LazyModuleImp(outer) {
+  require(
+    outer.cpuNode.in.length == config.numLanes,
     s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
-    s"config.numLanes (${config.numLanes})")
-  require(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
+      s"config.numLanes (${config.numLanes})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
     s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
-    s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})")
-  require(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
+      s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
     s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
-    s"mismatch with config.addressWidth (${config.addressWidth})")
+      s"mismatch with config.addressWidth (${config.addressWidth})"
+  )
 
   val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
   // note we are using word size. assuming all coalescer inputs are word sized
   val reqQueueEntryT = new NonCoalescedRequest(config)
-  val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
+  val reqQueues = Module(
+    new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config)
+  )
 
   val coalReqT = new CoalescedRequest(config)
   val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
@@ -710,7 +784,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
   tlCoal.e.valid := false.B
 
-
   // ===========================================================================
   // Response flow
   // ===========================================================================
@@ -723,8 +796,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   val numPerLaneReqs = config.queueDepth
 
   // FIXME: no need to contain maxCoalLogSize data
-  val respQueueEntryT = new Response(oldSourceWidth, log2Ceil(config.maxCoalLogSize),
-    (1 << config.maxCoalLogSize) * 8)
+  val respQueueEntryT = new Response(
+    oldSourceWidth,
+    log2Ceil(config.maxCoalLogSize),
+    (1 << config.maxCoalLogSize) * 8
+  )
   val respQueues = Seq.tabulate(config.numLanes) { _ =>
     Module(
       new MultiPortQueue(
@@ -810,12 +886,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   val newEntry = Wire(uncoalescer.inflightTable.entryT)
   newEntry.source := coalescer.io.coalReq.bits.source
 
-  assert (config.maxCoalLogSize <= config.dataBusWidth,
-    "multi-beat coalesced reads/writes are currently not supported")
-  assert (
+  assert(
+    config.maxCoalLogSize <= config.dataBusWidth,
+    "multi-beat coalesced reads/writes are currently not supported"
+  )
+  assert(
     tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
     s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
-    + s" (${(1 << config.dataBusWidth) * 8})"
+      + s" (${(1 << config.dataBusWidth) * 8})"
   )
   val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
   // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
@@ -825,8 +903,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
       (laneEntry.reqs zip laneInv.asBools).zipWithIndex
         .foreach { case ((reqEntry, inv), i) =>
           val req = reqQueues.io.elts(lane)(i)
-          when ((coalescer.io.invalidate.valid && inv)) {
-            printf(s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source)
+          when((coalescer.io.invalidate.valid && inv)) {
+            printf(
+              s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
+              req.source
+            )
           }
           reqEntry.valid := (coalescer.io.invalidate.valid && inv)
           reqEntry.source := req.source
@@ -845,22 +926,23 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   tlCoal.d.ready := uncoalescer.io.coalResp.ready
 
   // Connect uncoalescer results back into each lane's response queue
-  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) =>
-    perLaneResps.zipWithIndex.foreach { case (resp, i) =>
-      // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
-      // cache.  This should ideally not happen though.
-      assert(
-        q.io.enq(respQueueUncoalPortOffset + i).ready,
-        s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
-      )
-      q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
-      q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
+  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach {
+    case ((q, perLaneResps), lane) =>
+      perLaneResps.zipWithIndex.foreach { case (resp, i) =>
+        // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
+        // cache.  This should ideally not happen though.
+        assert(
+          q.io.enq(respQueueUncoalPortOffset + i).ready,
+          s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
+        )
+        q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
+        q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
       // debug
       // when (resp.valid) {
       //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
       // }
       // dontTouch(q.io.enq(respQueueCoalPortOffset))
-    }
+      }
   }
 
   // Debug
@@ -972,7 +1054,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module {
 // split the coalesced response back to individual per-lane responses with the
 // right metadata.
 class InflightCoalReqTable(config: CoalescerConfig) extends Module {
-  val offsetBits = config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
+  val offsetBits =
+    config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
   val entryT = new InflightCoalReqTableEntry(
     config.numLanes,
     config.queueDepth,
@@ -1019,7 +1102,7 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module {
   }
 
   val full = Wire(Bool())
-  full := (0 until entries).map( table(_).valid ).reduce( _ && _ )
+  full := (0 until entries).map(table(_).valid).reduce(_ && _)
   assert(!full, "inflight table is full and blocking coalescer")
   dontTouch(full)
 
@@ -1094,8 +1177,12 @@ object TLUtils {
 // `traceHasSource` is true if the input trace file has an additional source
 // ID column.  This is useful for using the output trace file genereated by
 // MemTraceLogger as the driver.
-class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: Boolean = false)
-  (implicit p: Parameters) extends LazyModule {
+class MemTraceDriver(
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean = false
+)(implicit p: Parameters)
+    extends LazyModule {
   // Create N client nodes together
   val laneNodes = Seq.tabulate(config.numLanes) { i =>
     val clientParam = Seq(
@@ -1113,7 +1200,8 @@ class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource:
   val node = TLIdentityNode()
   laneNodes.foreach { l => node := l }
 
-  lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource)
+  lazy val module =
+    new MemTraceDriverImp(this, config, filename, traceHasSource)
 }
 
 trait HasTraceLine {
@@ -1136,9 +1224,12 @@ class TraceLine extends Bundle with HasTraceLine {
   val data = UInt(64.W)
 }
 
-class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename: String,
-  traceHasSource: Boolean)
-    extends LazyModuleImp(outer)
+class MemTraceDriverImp(
+    outer: MemTraceDriver,
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean
+) extends LazyModuleImp(outer)
     with UnitTestModule {
   // Current cycle mark to read from trace
   val traceReadCycle = RegInit(1.U(64.W))
@@ -1176,7 +1267,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
 
   // Not all fire because trace cycle has to advance even when there is no valid
   // line in the trace.
-  when (reqQueueAllReady){
+  when(reqQueueAllReady) {
     traceReadCycle := traceReadCycle + 1.U
   }
 
@@ -1216,11 +1307,16 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
     sizeInBytes := (1.U) << req.size
     mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
     wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
-    val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
+    val wordAlignedAddress =
+      req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
     val wordAlignedSize = Mux(subword, 2.U, req.size)
 
-    val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numOldSrcIds),
-      ignoreInUse = false))
+    val sourceGen = Module(
+      new RoundRobinSourceGenerator(
+        log2Ceil(config.numOldSrcIds),
+        ignoreInUse = false
+      )
+    )
     sourceGen.io.gen := reqQ.io.deq.fire
     // assert(sourceGen.io.id.valid)
 
@@ -1229,7 +1325,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
       toAddress = hashToValidPhyAddr(wordAlignedAddress),
       lgSize = wordAlignedSize, // trace line already holds log2(size)
       // data should be aligned to beatBytes
-      data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
+      data =
+        (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
     )
     val (glegal, gbits) = edge.Get(
       fromSource = sourceGen.io.id.bits,
@@ -1240,7 +1337,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
     val bits = Mux(req.is_store, pbits, gbits)
 
     tlOut.a.valid := (reqQ.io.deq.valid && sourceGen.io.id.valid)
-    when (tlOut.a.valid) {
+    when(tlOut.a.valid) {
       assert(legal, "illegal TL req gen")
     }
     tlOut.a.bits := bits
@@ -1288,9 +1385,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
 
 class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
     extends BlackBox(
-      Map("FILENAME" -> filename,
-          "NUM_LANES" -> numLanes,
-          "HAS_SOURCE" -> (if (traceHasSource) 1 else 0))
+      Map(
+        "FILENAME" -> filename,
+        "NUM_LANES" -> numLanes,
+        "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)
+      )
     )
     with HasBlackBoxResource {
   val traceLineT = new TraceLine
@@ -1304,19 +1403,20 @@ class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
 
     // These names have to match declarations in the Verilog code, eg.
     // trace_read_address.
-    val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source
-      val ready = Input(Bool())
-      val valid = Output(UInt(numLanes.W))
-      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
-      // single wide 1D array.
-      // TODO: assumes 64-bit address.
-      val cycle = Input(UInt(64.W))
-      val address = Output(UInt((addrW * numLanes).W))
-      val is_store = Output(UInt(numLanes.W))
-      val size = Output(UInt((sizeW * numLanes).W))
-      val data = Output(UInt((dataW * numLanes).W))
-      val finished = Output(Bool())
-    }
+    val trace_read =
+      new Bundle { // can't use HasTraceLine because this doesn't have source
+        val ready = Input(Bool())
+        val valid = Output(UInt(numLanes.W))
+        // Chisel can't interface with Verilog 2D port, so flatten all lanes into
+        // single wide 1D array.
+        // TODO: assumes 64-bit address.
+        val cycle = Input(UInt(64.W))
+        val address = Output(UInt((addrW * numLanes).W))
+        val is_store = Output(UInt(numLanes.W))
+        val size = Output(UInt((sizeW * numLanes).W))
+        val data = Output(UInt((dataW * numLanes).W))
+        val finished = Output(Bool())
+      }
   })
 
   addResource("/vsrc/SimMemTrace.v")
@@ -1443,11 +1543,11 @@ class MemTraceLogger(
 
         // This assert only holds true for PutFullData and not PutPartialData,
         // where HIGH bits in the mask may not be contiguous.
-        when (tlIn.a.valid) {
+        when(tlIn.a.valid) {
           assert(
             PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
             "mask HIGH popcount do not match the TL size. " +
-            "Partial masks are not allowed for PutFull"
+              "Partial masks are not allowed for PutFull"
           )
         }
         val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
@@ -1476,17 +1576,25 @@ class MemTraceLogger(
 
     // stats
     val numReqsThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
     val numRespsThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
     val reqBytesThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneReqs
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
     val respBytesThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneResps
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
     numReqs := numReqs + numReqsThisCycle
     numResps := numResps + numRespsThisCycle
     reqBytes := reqBytes + reqBytesThisCycle
@@ -1496,7 +1604,10 @@ class MemTraceLogger(
     //
     // This is a clunky workaround of the fact that Chisel doesn't allow partial
     // assignment to a bitfield range of a wide signal.
-    def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
+    def flattenTrace(
+        simIO: Bundle with HasTraceLine,
+        perLane: Vec[TraceLine]
+    ) = {
       // these will get optimized out
       val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
       val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
@@ -1592,8 +1703,14 @@ object TLPrintf {
       tlData: UInt,
       reqData: UInt
   ) = {
-    printf(s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
-      source, address, size, mask, is_store)
+    printf(
+      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
+      source,
+      address,
+      size,
+      mask,
+      is_store
+    )
     when(is_store) {
       printf(", tlData=%x, reqData=%x", tlData, reqData)
     }
@@ -1604,7 +1721,7 @@ object TLPrintf {
 // Synthesizable unit tests
 
 class DummyDriver(config: CoalescerConfig)(implicit p: Parameters)
-  extends LazyModule {
+    extends LazyModule {
   val laneNodes = Seq.tabulate(config.numLanes) { i =>
     val clientParam = Seq(
       TLMasterParameters.v1(
@@ -1640,7 +1757,10 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
     // generate dummy traffic to coalescer to prevent it from being optimized
     // out during synthesis
     val address = Wire(UInt(config.addressWidth.W))
-    address := Cat((finishCounter + (lane.U % 3.U)), 0.U(config.wordSizeWidth.W))
+    address := Cat(
+      (finishCounter + (lane.U % 3.U)),
+      0.U(config.wordSizeWidth.W)
+    )
     val (tl, edge) = node.out(0)
     val (legal, bits) = edge.Put(
       fromSource = sourceIdCounter,
@@ -1657,11 +1777,13 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
     tl.e.valid := false.B
   }
 
-  val dataSum = outer.laneNodes.map { node =>
-    val tl = node.out(0)._1
-    val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
-    data
-  }.reduce (_ +& _)
+  val dataSum = outer.laneNodes
+    .map { node =>
+      val tl = node.out(0)._1
+      val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
+      data
+    }
+    .reduce(_ +& _)
   // this doesn't make much sense, but it prevents the entire uncoalescer from
   // being optimized away
   finishCounter := finishCounter + dataSum
@@ -1680,8 +1802,10 @@ class DummyCoalescer(implicit p: Parameters) extends LazyModule {
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
     )
   )
 
@@ -1704,7 +1828,8 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
 }
 
 // tracedriver --> coalescer --> tracelogger --> tlram
-class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
+class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
+    extends LazyModule {
   val numLanes = p(SIMTCoreKey).get.nLanes
   val config = defaultConfig.copy(numLanes = numLanes)
 
@@ -1713,14 +1838,18 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
     new MemTraceLogger(numLanes, filename, loggerName = "coreside")
   )
   val coal = LazyModule(new CoalescingUnit(config))
-  val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside"))
+  val memSideLogger = LazyModule(
+    new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")
+  )
   val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
     LazyModule(
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
     )
   )
 
@@ -1751,8 +1880,9 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
   }
 }
 
-class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
-    extends UnitTest(timeout) {
+class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit
+    p: Parameters
+) extends UnitTest(timeout) {
   val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
@@ -1770,8 +1900,10 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << defaultConfig.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << defaultConfig.dataBusWidth)
+      )
     )
   )
 
@@ -1785,13 +1917,13 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
   }
 }
 
-class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) {
+class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
+    extends UnitTest(timeout) {
   val dut = Module(LazyModule(new TLRAMCoalescer).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
 }
 
-
 ////////////
 ////////////
 ////////////
@@ -1941,11 +2073,3 @@ class CoalescerXbarImpl(outer: CoalescerXbar,
 
 
   }
-
-
-
-
-
-
-
-

From 772deda9c2e9fd6c76b85d8dd2e1693e7089753e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 16:20:01 -0700
Subject: [PATCH 05/10] Fix ChiselEnum experimental warning

---
 src/main/scala/tilelink/Coalescing.scala | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 0e72dae..98eb668 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -4,7 +4,6 @@ package freechips.rocketchip.tilelink
 
 import chisel3._
 import chisel3.util._
-import chisel3.experimental.ChiselEnum
 import org.chipsalliance.cde.config.{Parameters, Field}
 import freechips.rocketchip.diplomacy._
 // import freechips.rocketchip.devices.tilelink.TLTestRAM
@@ -381,9 +380,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
 
 // Software model: coalescer.py
 class MonoCoalescer(
+    config: CoalescerConfig,
     coalLogSize: Int,
-    windowT: CoalShiftQueue[NonCoalescedRequest],
-    config: CoalescerConfig
+    windowT: CoalShiftQueue[NonCoalescedRequest]
 ) extends Module {
   val io = IO(new Bundle {
     val window = Input(windowT.io.cloneType)
@@ -532,9 +531,9 @@ class MonoCoalescer(
 //
 // Software model: coalescer.py
 class MultiCoalescer(
+    config: CoalescerConfig,
     windowT: CoalShiftQueue[NonCoalescedRequest],
     coalReqT: Request,
-    config: CoalescerConfig
 ) extends Module {
   val io = IO(new Bundle {
     // coalescing window, connected to the contents of the request queues
@@ -549,7 +548,7 @@ class MultiCoalescer(
   })
 
   val coalescers = config.coalLogSizes.map(size =>
-    Module(new MonoCoalescer(size, windowT, config))
+    Module(new MonoCoalescer(config, size, windowT))
   )
   coalescers.foreach(_.io.window := io.window)
 
@@ -704,14 +703,13 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
   )
 
   val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
-  // note we are using word size. assuming all coalescer inputs are word sized
   val reqQueueEntryT = new NonCoalescedRequest(config)
   val reqQueues = Module(
     new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config)
   )
 
   val coalReqT = new CoalescedRequest(config)
-  val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
+  val coalescer = Module(new MultiCoalescer(config, reqQueues, coalReqT))
   coalescer.io.window := reqQueues.io
   reqQueues.io.coalescable := coalescer.io.coalescable
   reqQueues.io.invalidate := coalescer.io.invalidate
@@ -955,7 +953,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
 }
 
 class Uncoalescer(config: CoalescerConfig) extends Module {
-  // notes to hansung:
+  // Mapping to reference model param names
   //  val numLanes: Int, <-> config.NUM_LANES
   //  val numPerLaneReqs: Int, <-> config.DEPTH
   //  val sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)

From 0df319288275f3b683ff2301c921c1e84a0a225c Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 17:31:51 -0700
Subject: [PATCH 06/10] Revamp uncoalescer IO

Connect the coalescer output directly to the uncoalescer at the top level, and
construct the table entries entirely inside the uncoalescer module.
WIP: the unit test is very broken as a result of this change.
---
 src/main/scala/tilelink/Coalescing.scala | 196 ++++++++++++-----------
 1 file changed, 105 insertions(+), 91 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 98eb668..e7e69f8 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -41,6 +41,13 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
   }
 }
 
+// Mapping to reference model param names
+//  numLanes: Int, <-> config.NUM_LANES
+//  numPerLaneReqs: Int, <-> config.DEPTH
+//  sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
+//  sizeWidth: Int, <-> config.sizeEnum.width
+//  coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
+//  numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
 case class CoalescerConfig(
   enable: Boolean,        // globally enable or disable coalescing
   numLanes: Int,          // number of lanes (or threads) in a warp
@@ -137,7 +144,8 @@ class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends La
 // Protocol-agnostic bundles that represent a request and a response to the
 // coalescer.
 
-class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int) extends Bundle {
+class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int)
+    extends Bundle {
   require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
   val op = UInt(1.W) // 0=READ 1=WRITE
   val address = UInt(addressWidth.W)
@@ -181,6 +189,7 @@ case class CoalescedRequest(config: CoalescerConfig)
 
 class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int)
     extends Bundle {
+  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
   val op = UInt(1.W) // 0=READ 1=WRITE
   val size = UInt(sizeWidth.W)
   val source = UInt(sourceWidth.W)
@@ -382,10 +391,10 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
 class MonoCoalescer(
     config: CoalescerConfig,
     coalLogSize: Int,
-    windowT: CoalShiftQueue[NonCoalescedRequest]
+    queueT: CoalShiftQueue[NonCoalescedRequest]
 ) extends Module {
   val io = IO(new Bundle {
-    val window = Input(windowT.io.cloneType)
+    val window = Input(queueT.io.cloneType)
     val results = Output(new Bundle {
       val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
       val baseAddr = Output(UInt(config.addressWidth.W))
@@ -478,7 +487,8 @@ class MonoCoalescer(
   val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)
 
   val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
-  // matchTable for the chosen lane, but converted to a Vec[UInt]
+  // matchTable for the chosen lane, but each column converted to bitflags,
+  // i.e. Vec[UInt]
   val chosenMatches = VecInit(matchTablePerLane.map { table =>
     VecInit(table.map(VecInit(_).asUInt))
   })(chosenLeaderIdx)
@@ -532,23 +542,25 @@ class MonoCoalescer(
 // Software model: coalescer.py
 class MultiCoalescer(
     config: CoalescerConfig,
-    windowT: CoalShiftQueue[NonCoalescedRequest],
+    queueT: CoalShiftQueue[NonCoalescedRequest],
     coalReqT: Request,
 ) extends Module {
+  val invalidateT = Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))
   val io = IO(new Bundle {
     // coalescing window, connected to the contents of the request queues
-    val window = Input(windowT.io.cloneType)
+    val window = Input(queueT.io.cloneType)
     // generated coalesced request
     val coalReq = DecoupledIO(coalReqT.cloneType)
-    // invalidate signals going into each request queue's head
-    val invalidate =
-      Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
-    // whether a lane is coalescable
+    // invalidate signals going into each request queue's head.  Lanes with
+    // high invalidate bits are what became coalesced into the new request.
+    val invalidate = Output(invalidateT)
+    // whether a lane is coalescable.  This is used to output non-coalescable
+    // lanes to the arbiter so they can be flushed to downstream.
     val coalescable = Output(Vec(config.numLanes, Bool()))
   })
 
   val coalescers = config.coalLogSizes.map(size =>
-    Module(new MonoCoalescer(config, size, windowT))
+    Module(new MonoCoalescer(config, size, queueT))
   )
   coalescers.foreach(_.io.window := io.window)
 
@@ -701,11 +713,15 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
     s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
       s"mismatch with config.addressWidth (${config.addressWidth})"
   )
+  require(
+    config.maxCoalLogSize <= config.dataBusWidth,
+    "multi-beat coalesced reads/writes are currently not supported"
+  )
 
   val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
-  val reqQueueEntryT = new NonCoalescedRequest(config)
+  val nonCoalReqT = new NonCoalescedRequest(config)
   val reqQueues = Module(
-    new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config)
+    new CoalShiftQueue(nonCoalReqT, config.queueDepth, config)
   )
 
   val coalReqT = new CoalescedRequest(config)
@@ -725,7 +741,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
   (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
     case (((tlIn, _), (tlOut, edgeOut)), lane) =>
       // Request queue
-      val req = Wire(reqQueueEntryT)
+      val req = Wire(nonCoalReqT)
 
       req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
       req.source := tlIn.a.bits.source
@@ -782,6 +798,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
   // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
   tlCoal.e.valid := false.B
 
+  require(
+    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
+    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
+      + s" (${(1 << config.dataBusWidth) * 8})"
+  )
+
   // ===========================================================================
   // Response flow
   // ===========================================================================
@@ -870,57 +892,21 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
       dontTouch(tlOut.d)
   }
 
-  // Construct new entry for the inflight table
-  // FIXME: don't instantiate inflight table entry type here.  It leaks the table's impl
-  // detail to the coalescer
-
-  // richard: I think a good idea is to pass Valid[ReqQueueEntry] generated by
-  // the coalescer directly into the uncoalescer, so that we can offload the
-  // logic to generate the Inflight Entry into the uncoalescer, where it should be.
-  // this also reduces top level clutter.
-
-  val uncoalescer = Module(new Uncoalescer(config))
-
-  val newEntry = Wire(uncoalescer.inflightTable.entryT)
-  newEntry.source := coalescer.io.coalReq.bits.source
-
-  assert(
-    config.maxCoalLogSize <= config.dataBusWidth,
-    "multi-beat coalesced reads/writes are currently not supported"
-  )
-  assert(
-    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
-    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
-      + s" (${(1 << config.dataBusWidth) * 8})"
+  val uncoalescer = Module(
+    new Uncoalescer(config, reqQueues, coalReqT, coalescer.invalidateT)
   )
+  // connect coalesced request that is newly generated and being recorded in
+  // the uncoalescer
+  uncoalescer.io.coalReq <> coalescer.io.coalReq
+  uncoalescer.io.invalidate := coalescer.io.invalidate
   val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
-  // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
-  // coalescer to every (numLanes * queueDepth) entry in the inflight table.
-  (newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex
-    .foreach { case ((laneEntry, laneInv), lane) =>
-      (laneEntry.reqs zip laneInv.asBools).zipWithIndex
-        .foreach { case ((reqEntry, inv), i) =>
-          val req = reqQueues.io.elts(lane)(i)
-          when((coalescer.io.invalidate.valid && inv)) {
-            printf(
-              s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
-              req.source
-            )
-          }
-          reqEntry.valid := (coalescer.io.invalidate.valid && inv)
-          reqEntry.source := req.source
-          reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth)
-          reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
-          // TODO: load/store op
-        }
-    }
-  dontTouch(newEntry)
-
-  uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid
-  uncoalescer.io.newEntry := newEntry
+  uncoalescer.io.window := reqQueues.io
+  // connect coalesced response going into the uncoalescer, ready to be
+  // uncoalesced
   // Cleanup: custom <>?
   uncoalescer.io.coalResp.valid := tlCoal.d.valid
   uncoalescer.io.coalResp.bits.fromTLD(tlCoal.d.bits)
+  // uncoalescer backpressure
   tlCoal.d.ready := uncoalescer.io.coalResp.ready
 
   // Connect uncoalescer results back into each lane's response queue
@@ -935,11 +921,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
         )
         q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
         q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
-      // debug
-      // when (resp.valid) {
-      //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
-      // }
-      // dontTouch(q.io.enq(respQueueCoalPortOffset))
+        // debug
+        // when (resp.valid) {
+        //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
+        // }
+        // dontTouch(q.io.enq(respQueueCoalPortOffset))
       }
   }
 
@@ -952,48 +938,76 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
   dontTouch(tlCoal.d)
 }
 
-class Uncoalescer(config: CoalescerConfig) extends Module {
-  // Mapping to reference model param names
-  //  val numLanes: Int, <-> config.NUM_LANES
-  //  val numPerLaneReqs: Int, <-> config.DEPTH
-  //  val sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
-  //  val sizeWidth: Int, <-> config.sizeEnum.width
-  //  val coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
-  //  val numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
+class Uncoalescer(
+    config: CoalescerConfig,
+    queueT: CoalShiftQueue[NonCoalescedRequest],
+    coalReqT: Request,
+    coalInvalidateT: Valid[Vec[UInt]],
+) extends Module {
   val inflightTable = Module(new InflightCoalReqTable(config))
   val io = IO(new Bundle {
-    val coalReqValid = Input(Bool())
-    // FIXME: receive ReqQueueEntry and construct newEntry inside uncoalescer
-    val newEntry = Input(inflightTable.entryT.cloneType)
+    // generated coalesced request, connected to the output of the coalescer.
+    val coalReq = Flipped(DecoupledIO(coalReqT.cloneType))
+    // invalidate signal coming out of coalescer.
+    val invalidate = Input(coalInvalidateT.cloneType)
+    // coalescing window, connected to the contents of the request queues.
+    // Uncoalescer looks at the queue entries that got coalesced into `coalReq`
+    // in order to record which lanes this coalReq originally came from.
+    val window = Input(queueT.io.cloneType)
     val coalResp = Flipped(Decoupled(new CoalescedResponse(config)))
     val uncoalResps = Output(
       Vec(
         config.numLanes,
-        Vec(
-          config.queueDepth,
-          ValidIO(
-            new NonCoalescedResponse(config)
-          )
-        )
+        Vec(config.queueDepth, ValidIO(new NonCoalescedResponse(config)))
       )
     )
   })
 
-  // Populate inflight table
-  inflightTable.io.enq.valid := io.coalReqValid
-  inflightTable.io.enq.bits := io.newEntry
+  // Uncoalescer has to be always ready to accept and record new coalesced
+  // requests, so that it doesn't stall the coalescer.
+  io.coalReq.ready := true.B
+
+  // Construct a new entry for the inflight table using generated coalesced request
+  def generateInflightTableEntry: InflightCoalReqTableEntry = {
+    val newEntry = Wire(inflightTable.entryT)
+    newEntry.source := io.coalReq.bits.source
+    // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
+    // coalescer to every (numLanes * queueDepth) entry in the inflight table.
+    (newEntry.lanes zip io.invalidate.bits).zipWithIndex
+      .foreach { case ((laneEntry, laneInv), lane) =>
+        (laneEntry.reqs zip laneInv.asBools).zipWithIndex
+          .foreach { case ((reqEntry, inv), i) =>
+            val req = io.window.elts(lane)(i)
+            when((io.invalidate.valid && inv)) {
+              printf(
+                s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
+                req.source
+              )
+            }
+            reqEntry.valid := (io.invalidate.valid && inv)
+            reqEntry.source := req.source
+            reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth)
+            reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
+            // TODO: load/store op
+          }
+      }
+    assert(
+      !((io.coalReq.valid === true.B) && (io.coalResp.valid === true.B) &&
+        (newEntry.source === io.coalResp.bits.source)),
+      "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
+    )
+    dontTouch(newEntry)
+
+    newEntry
+  }
+  inflightTable.io.enq.valid := io.coalReq.valid
+  inflightTable.io.enq.bits := generateInflightTableEntry
 
   // Look up the table with incoming coalesced responses
   inflightTable.io.lookup.ready := io.coalResp.valid
   inflightTable.io.lookupSourceId := io.coalResp.bits.source
   io.coalResp.ready := true.B // FIXME, see sw model implementation
 
-  assert(
-    !((io.coalReqValid === true.B) && (io.coalResp.valid === true.B) &&
-      (io.newEntry.source === io.coalResp.bits.source)),
-    "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
-  )
-
   // Un-coalescing logic
   //
   def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = {

From df68bfec844b11dfb59a0ef61c2bfdb381468e30 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 18:20:19 -0700
Subject: [PATCH 07/10] Remove module dependency for uncoalescer instantiation

This makes the uncoalescer easier to unit-test.  The code now builds.
---
 src/main/scala/tilelink/Coalescing.scala      |  18 +-
 .../scala/coalescing/CoalescingUnitTest.scala | 218 ++++++++++--------
 2 files changed, 129 insertions(+), 107 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index e7e69f8..f309b9f 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -893,14 +893,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
   }
 
   val uncoalescer = Module(
-    new Uncoalescer(config, reqQueues, coalReqT, coalescer.invalidateT)
+    new Uncoalescer(config, nonCoalReqT, coalReqT)
   )
   // connect coalesced request that is newly generated and being recorded in
   // the uncoalescer
   uncoalescer.io.coalReq <> coalescer.io.coalReq
   uncoalescer.io.invalidate := coalescer.io.invalidate
   val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
-  uncoalescer.io.window := reqQueues.io
+  uncoalescer.io.windowElts := reqQueues.io.elts
   // connect coalesced response going into the uncoalescer, ready to be
   // uncoalesced
   // Cleanup: custom <>?
@@ -940,20 +940,22 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
 
 class Uncoalescer(
     config: CoalescerConfig,
-    queueT: CoalShiftQueue[NonCoalescedRequest],
-    coalReqT: Request,
-    coalInvalidateT: Valid[Vec[UInt]],
+    nonCoalReqT: NonCoalescedRequest,
+    coalReqT: CoalescedRequest,
 ) extends Module {
   val inflightTable = Module(new InflightCoalReqTable(config))
   val io = IO(new Bundle {
     // generated coalesced request, connected to the output of the coalescer.
     val coalReq = Flipped(DecoupledIO(coalReqT.cloneType))
     // invalidate signal coming out of coalescer.
-    val invalidate = Input(coalInvalidateT.cloneType)
+    val invalidate = Input(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
     // coalescing window, connected to the contents of the request queues.
     // Uncoalescer looks at the queue entries that got coalesced into `coalReq`
     // in order to record which lanes this coalReq originally came from.
-    val window = Input(queueT.io.cloneType)
+    // We only care about window.elts because the coalescer would have made
+    // sure it only looked at the valid entries.
+    // TODO: duplicate type construction
+    val windowElts = Input(Vec(config.numLanes, Vec(config.queueDepth, nonCoalReqT)))
     val coalResp = Flipped(Decoupled(new CoalescedResponse(config)))
     val uncoalResps = Output(
       Vec(
@@ -977,7 +979,7 @@ class Uncoalescer(
       .foreach { case ((laneEntry, laneInv), lane) =>
         (laneEntry.reqs zip laneInv.asBools).zipWithIndex
           .foreach { case ((reqEntry, inv), i) =>
-            val req = io.window.elts(lane)(i)
+            val req = io.windowElts(lane)(i)
             when((io.invalidate.valid && inv)) {
               printf(
                 s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala
index 29ea8dc..36f8b13 100644
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -735,125 +735,145 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   val coalDataWidth = 128
   val numInflightCoalRequests = 4
 
+  val config = uncoalescerTestConfig
+
+  val nonCoalReqT = new NonCoalescedRequest(config)
+  val coalReqT = new CoalescedRequest(config)
   it should "work in general case" in {
-    test(new Uncoalescer(uncoalescerTestConfig))
+    test(new Uncoalescer(config, nonCoalReqT, coalReqT))
     // vcs helps with simulation time, but sometimes errors with
     // "mutation occurred during iteration" java error
     // .withAnnotations(Seq(VcsBackendAnnotation))
     { c =>
       val sourceId = 0.U
-      val four = c.io.newEntry.sizeEnumT.FOUR
-      c.io.coalReqValid.poke(true.B)
-      c.io.newEntry.source.poke(sourceId)
-      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(0).source.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(1).source.poke(2.U)
-      c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) // same offset to different lanes
-      c.io.newEntry.lanes(0).reqs(1).sizeEnum.poke(four)
-      c.io.newEntry.lanes(1).reqs(0).valid.poke(false.B)
-      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(1).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(1).offset.poke(3.U)
-      c.io.newEntry.lanes(2).reqs(1).sizeEnum.poke(four)
-      c.io.newEntry.lanes(3).reqs(0).valid.poke(false.B)
+      // val four = c.io.newEntry.sizeEnumT.FOUR
+      c.io.coalReq.valid.poke(true.B)
+      c.io.windowElts(0)(0).op.poke(0.U)
+      c.io.windowElts(0)(0).source.poke(1.U)
+      c.io.windowElts(0)(0).address.poke(0x4.U)
+      c.io.windowElts(0)(0).size.poke(2.U)
+      c.io.windowElts(0)(1).op.poke(0.U)
+      c.io.windowElts(0)(1).source.poke(2.U)
+      c.io.windowElts(0)(1).address.poke(0x4.U)
+      c.io.windowElts(0)(1).size.poke(2.U)
+      c.io.windowElts(2)(0).op.poke(0.U)
+      c.io.windowElts(2)(0).source.poke(1.U)
+      c.io.windowElts(2)(0).address.poke(0x4.U)
+      c.io.windowElts(2)(0).size.poke(2.U)
+      c.io.windowElts(2)(1).op.poke(0.U)
+      c.io.windowElts(2)(1).source.poke(2.U)
+      c.io.windowElts(2)(1).address.poke(0x4.U)
+      c.io.windowElts(2)(1).size.poke(2.U)
+      // c.io.newEntry.source.poke(sourceId)
+      // c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
+      // c.io.newEntry.lanes(0).reqs(0).source.poke(1.U)
+      // c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
+      // c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
+      // c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B)
+      // c.io.newEntry.lanes(0).reqs(1).source.poke(2.U)
+      // c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) // same offset to different lanes
+      // c.io.newEntry.lanes(0).reqs(1).sizeEnum.poke(four)
+      // c.io.newEntry.lanes(1).reqs(0).valid.poke(false.B)
+      // c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
+      // c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
+      // c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U)
+      // c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
+      // c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B)
+      // c.io.newEntry.lanes(2).reqs(1).source.poke(2.U)
+      // c.io.newEntry.lanes(2).reqs(1).offset.poke(3.U)
+      // c.io.newEntry.lanes(2).reqs(1).sizeEnum.poke(four)
+      // c.io.newEntry.lanes(3).reqs(0).valid.poke(false.B)
 
-      c.clock.step()
+      // c.clock.step()
 
-      c.io.coalReqValid.poke(false.B)
+      // c.io.coalReqValid.poke(false.B)
 
-      c.clock.step()
+      // c.clock.step()
 
-      c.io.coalResp.valid.poke(true.B)
-      c.io.coalResp.bits.source.poke(sourceId)
-      val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
-      // val lit = BigInt(0x0123456789abcdefL)
-      c.io.coalResp.bits.data.poke(lit.U)
+      // c.io.coalResp.valid.poke(true.B)
+      // c.io.coalResp.bits.source.poke(sourceId)
+      // val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
+      // // val lit = BigInt(0x0123456789abcdefL)
+      // c.io.coalResp.bits.data.poke(lit.U)
 
-      // table lookup is combinational at the same cycle
-      c.io.uncoalResps(0)(0).valid.expect(true.B)
-      c.io.uncoalResps(1)(0).valid.expect(false.B)
-      c.io.uncoalResps(2)(0).valid.expect(true.B)
-      c.io.uncoalResps(3)(0).valid.expect(false.B)
+      // // table lookup is combinational at the same cycle
+      // c.io.uncoalResps(0)(0).valid.expect(true.B)
+      // c.io.uncoalResps(1)(0).valid.expect(false.B)
+      // c.io.uncoalResps(2)(0).valid.expect(true.B)
+      // c.io.uncoalResps(3)(0).valid.expect(false.B)
 
-      // offset is counting from LSB
-      c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
-      c.io.uncoalResps(0)(0).bits.source.expect(1.U)
-      c.io.uncoalResps(0)(1).bits.data.expect(0x5ca1ab1eL.U)
-      c.io.uncoalResps(0)(1).bits.source.expect(2.U)
-      c.io.uncoalResps(2)(0).bits.data.expect(0x89abcdefL.U)
-      c.io.uncoalResps(2)(0).bits.source.expect(2.U)
-      c.io.uncoalResps(2)(1).bits.data.expect(0x01234567L.U)
-      c.io.uncoalResps(2)(1).bits.source.expect(2.U)
+      // // offset is counting from LSB
+      // c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
+      // c.io.uncoalResps(0)(0).bits.source.expect(1.U)
+      // c.io.uncoalResps(0)(1).bits.data.expect(0x5ca1ab1eL.U)
+      // c.io.uncoalResps(0)(1).bits.source.expect(2.U)
+      // c.io.uncoalResps(2)(0).bits.data.expect(0x89abcdefL.U)
+      // c.io.uncoalResps(2)(0).bits.source.expect(2.U)
+      // c.io.uncoalResps(2)(1).bits.data.expect(0x01234567L.U)
+      // c.io.uncoalResps(2)(1).bits.source.expect(2.U)
     }
   }
 
-  it should "uncoalesce when coalesced to the same word offset" in {
-    test(new Uncoalescer(uncoalescerTestConfig))
-    // .withAnnotations(Seq(VcsBackendAnnotation))
-    { c =>
-      val sourceId = 0.U
-      val four = c.io.newEntry.sizeEnumT.FOUR
-      c.io.coalReqValid.poke(true.B)
-      c.io.newEntry.source.poke(sourceId)
-      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(0).source.poke(0.U)
-      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(0).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(1).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(1).reqs(0).source.poke(1.U)
-      c.io.newEntry.lanes(1).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(1).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(1).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(2).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(3).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(3).reqs(0).source.poke(3.U)
-      c.io.newEntry.lanes(3).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(3).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(3).reqs(1).valid.poke(false.B)
+  // it should "uncoalesce when coalesced to the same word offset" in {
+  //   test(new Uncoalescer(uncoalescerTestConfig))
+  //   // .withAnnotations(Seq(VcsBackendAnnotation))
+  //   { c =>
+  //     val sourceId = 0.U
+  //     val four = c.io.newEntry.sizeEnumT.FOUR
+  //     c.io.coalReqValid.poke(true.B)
+  //     c.io.newEntry.source.poke(sourceId)
+  //     c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
+  //     c.io.newEntry.lanes(0).reqs(0).source.poke(0.U)
+  //     c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
+  //     c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
+  //     c.io.newEntry.lanes(0).reqs(1).valid.poke(false.B)
+  //     c.io.newEntry.lanes(1).reqs(0).valid.poke(true.B)
+  //     c.io.newEntry.lanes(1).reqs(0).source.poke(1.U)
+  //     c.io.newEntry.lanes(1).reqs(0).offset.poke(1.U)
+  //     c.io.newEntry.lanes(1).reqs(0).sizeEnum.poke(four)
+  //     c.io.newEntry.lanes(1).reqs(1).valid.poke(false.B)
+  //     c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
+  //     c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
+  //     c.io.newEntry.lanes(2).reqs(0).offset.poke(1.U)
+  //     c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
+  //     c.io.newEntry.lanes(2).reqs(1).valid.poke(false.B)
+  //     c.io.newEntry.lanes(3).reqs(0).valid.poke(true.B)
+  //     c.io.newEntry.lanes(3).reqs(0).source.poke(3.U)
+  //     c.io.newEntry.lanes(3).reqs(0).offset.poke(1.U)
+  //     c.io.newEntry.lanes(3).reqs(0).sizeEnum.poke(four)
+  //     c.io.newEntry.lanes(3).reqs(1).valid.poke(false.B)
 
-      c.clock.step()
+  //     c.clock.step()
 
-      c.io.coalReqValid.poke(false.B)
+  //     c.io.coalReqValid.poke(false.B)
 
-      c.clock.step()
+  //     c.clock.step()
 
-      c.io.coalResp.valid.poke(true.B)
-      c.io.coalResp.bits.source.poke(sourceId)
-      val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
-      c.io.coalResp.bits.data.poke(lit.U)
+  //     c.io.coalResp.valid.poke(true.B)
+  //     c.io.coalResp.bits.source.poke(sourceId)
+  //     val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
+  //     c.io.coalResp.bits.data.poke(lit.U)
 
-      // table lookup is combinational at the same cycle
-      // offset is counting from LSB
-      c.io.uncoalResps(0)(0).valid.expect(true.B)
-      c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
-      c.io.uncoalResps(0)(0).bits.source.expect(0.U)
-      c.io.uncoalResps(0)(1).valid.expect(false.B)
-      c.io.uncoalResps(1)(0).valid.expect(true.B)
-      c.io.uncoalResps(1)(0).bits.data.expect(0x5ca1ab1eL.U)
-      c.io.uncoalResps(1)(0).bits.source.expect(1.U)
-      c.io.uncoalResps(1)(1).valid.expect(false.B)
-      c.io.uncoalResps(2)(0).valid.expect(true.B)
-      c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U)
-      c.io.uncoalResps(2)(0).bits.source.expect(2.U)
-      c.io.uncoalResps(2)(1).valid.expect(false.B)
-      c.io.uncoalResps(3)(0).valid.expect(true.B)
-      c.io.uncoalResps(3)(0).bits.data.expect(0x5ca1ab1eL.U)
-      c.io.uncoalResps(3)(0).bits.source.expect(3.U)
-      c.io.uncoalResps(3)(1).valid.expect(false.B)
-    }
-  }
+  //     // table lookup is combinational at the same cycle
+  //     // offset is counting from LSB
+  //     c.io.uncoalResps(0)(0).valid.expect(true.B)
+  //     c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
+  //     c.io.uncoalResps(0)(0).bits.source.expect(0.U)
+  //     c.io.uncoalResps(0)(1).valid.expect(false.B)
+  //     c.io.uncoalResps(1)(0).valid.expect(true.B)
+  //     c.io.uncoalResps(1)(0).bits.data.expect(0x5ca1ab1eL.U)
+  //     c.io.uncoalResps(1)(0).bits.source.expect(1.U)
+  //     c.io.uncoalResps(1)(1).valid.expect(false.B)
+  //     c.io.uncoalResps(2)(0).valid.expect(true.B)
+  //     c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U)
+  //     c.io.uncoalResps(2)(0).bits.source.expect(2.U)
+  //     c.io.uncoalResps(2)(1).valid.expect(false.B)
+  //     c.io.uncoalResps(3)(0).valid.expect(true.B)
+  //     c.io.uncoalResps(3)(0).bits.data.expect(0x5ca1ab1eL.U)
+  //     c.io.uncoalResps(3)(0).bits.source.expect(3.U)
+  //     c.io.uncoalResps(3)(1).valid.expect(false.B)
+  //   }
+  // }
 }
 
 class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {

From b95b59cce02466f752a639eace9de555bb0e0116 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 18:30:15 -0700
Subject: [PATCH 08/10] Fix uncoalescer unittest

---
 .../scala/coalescing/CoalescingUnitTest.scala | 129 ++++++++----------
 1 file changed, 55 insertions(+), 74 deletions(-)

diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala
index 36f8b13..19205d4 100644
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -706,34 +706,25 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
   }*/
 }
 
-object uncoalescerTestConfig extends CoalescerConfig(
-  enable = true,
-  numLanes = 4,
-  queueDepth = 2,
-  waitTimeout = 8,
-  addressWidth = 24,
-  dataBusWidth = 5,
-  // watermark = 2,
-  wordSizeInBytes = 4,
-  numOldSrcIds = 16,
-  numNewSrcIds = 4,
-  respQueueDepth = 4,
-  coalLogSizes = Seq(4),
-  sizeEnum = DefaultInFlightTableSizeEnum,
-  numCoalReqs = 1,
-  numArbiterOutputPorts = 4,
-  bankStrideInBytes = 64,
-)
-
 class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   behavior of "uncoalescer"
-  val numLanes = 4
-  val numPerLaneReqs = 2
-  val sourceWidth = 2
-  val sizeWidth = 2
-  // 16B coalescing size
-  val coalDataWidth = 128
-  val numInflightCoalRequests = 4
+  object uncoalescerTestConfig extends CoalescerConfig(
+    enable = true,
+    numLanes = 4,
+    queueDepth = 2,
+    waitTimeout = 8,
+    addressWidth = 24,
+    dataBusWidth = 4, // 128 bit data bus
+    wordSizeInBytes = 4,
+    numOldSrcIds = 16,
+    numNewSrcIds = 4,
+    respQueueDepth = 4,
+    coalLogSizes = Seq(4),
+    sizeEnum = DefaultInFlightTableSizeEnum,
+    numCoalReqs = 1,
+    numArbiterOutputPorts = 4,
+    bankStrideInBytes = 64,
+  )
 
   val config = uncoalescerTestConfig
 
@@ -745,72 +736,62 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
     // "mutation occurred during iteration" java error
     // .withAnnotations(Seq(VcsBackendAnnotation))
     { c =>
-      val sourceId = 0.U
-      // val four = c.io.newEntry.sizeEnumT.FOUR
-      c.io.coalReq.valid.poke(true.B)
+      // 4 lanes, queue depth 2
       c.io.windowElts(0)(0).op.poke(0.U)
       c.io.windowElts(0)(0).source.poke(1.U)
       c.io.windowElts(0)(0).address.poke(0x4.U)
       c.io.windowElts(0)(0).size.poke(2.U)
       c.io.windowElts(0)(1).op.poke(0.U)
       c.io.windowElts(0)(1).source.poke(2.U)
-      c.io.windowElts(0)(1).address.poke(0x4.U)
+      c.io.windowElts(0)(1).address.poke(0x4.U) // two reqs from one lane
       c.io.windowElts(0)(1).size.poke(2.U)
       c.io.windowElts(2)(0).op.poke(0.U)
-      c.io.windowElts(2)(0).source.poke(1.U)
-      c.io.windowElts(2)(0).address.poke(0x4.U)
+      c.io.windowElts(2)(0).source.poke(2.U)
+      c.io.windowElts(2)(0).address.poke(0x8.U)
       c.io.windowElts(2)(0).size.poke(2.U)
       c.io.windowElts(2)(1).op.poke(0.U)
       c.io.windowElts(2)(1).source.poke(2.U)
-      c.io.windowElts(2)(1).address.poke(0x4.U)
+      c.io.windowElts(2)(1).address.poke(0xc.U)
       c.io.windowElts(2)(1).size.poke(2.U)
-      // c.io.newEntry.source.poke(sourceId)
-      // c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-      // c.io.newEntry.lanes(0).reqs(0).source.poke(1.U)
-      // c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-      // c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-      // c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B)
-      // c.io.newEntry.lanes(0).reqs(1).source.poke(2.U)
-      // c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) // same offset to different lanes
-      // c.io.newEntry.lanes(0).reqs(1).sizeEnum.poke(four)
-      // c.io.newEntry.lanes(1).reqs(0).valid.poke(false.B)
-      // c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-      // c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-      // c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U)
-      // c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-      // c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B)
-      // c.io.newEntry.lanes(2).reqs(1).source.poke(2.U)
-      // c.io.newEntry.lanes(2).reqs(1).offset.poke(3.U)
-      // c.io.newEntry.lanes(2).reqs(1).sizeEnum.poke(four)
-      // c.io.newEntry.lanes(3).reqs(0).valid.poke(false.B)
+      // indicate that lanes 0 and 2 are used for coalescing
+      c.io.invalidate.valid.poke(true.B)
+      c.io.invalidate.bits(0).poke(0x3.U) // 2'b11 for depth=2
+      c.io.invalidate.bits(1).poke(0x0.U)
+      c.io.invalidate.bits(2).poke(0x3.U)
+      c.io.invalidate.bits(3).poke(0x0.U)
 
-      // c.clock.step()
+      val sourceId = 0.U
+      c.io.coalReq.valid.poke(true.B)
+      c.io.coalReq.bits.source.poke(sourceId)
+      c.io.coalReq.ready.expect(true.B)
 
-      // c.io.coalReqValid.poke(false.B)
+      c.clock.step()
 
-      // c.clock.step()
+      c.io.coalReq.valid.poke(false.B)
 
-      // c.io.coalResp.valid.poke(true.B)
-      // c.io.coalResp.bits.source.poke(sourceId)
-      // val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
-      // // val lit = BigInt(0x0123456789abcdefL)
-      // c.io.coalResp.bits.data.poke(lit.U)
+      c.clock.step()
 
-      // // table lookup is combinational at the same cycle
-      // c.io.uncoalResps(0)(0).valid.expect(true.B)
-      // c.io.uncoalResps(1)(0).valid.expect(false.B)
-      // c.io.uncoalResps(2)(0).valid.expect(true.B)
-      // c.io.uncoalResps(3)(0).valid.expect(false.B)
+      c.io.coalResp.valid.poke(true.B)
+      c.io.coalResp.bits.source.poke(sourceId)
+      val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
+      // val lit = BigInt(0x0123456789abcdefL)
+      c.io.coalResp.bits.data.poke(lit.U)
 
-      // // offset is counting from LSB
-      // c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
-      // c.io.uncoalResps(0)(0).bits.source.expect(1.U)
-      // c.io.uncoalResps(0)(1).bits.data.expect(0x5ca1ab1eL.U)
-      // c.io.uncoalResps(0)(1).bits.source.expect(2.U)
-      // c.io.uncoalResps(2)(0).bits.data.expect(0x89abcdefL.U)
-      // c.io.uncoalResps(2)(0).bits.source.expect(2.U)
-      // c.io.uncoalResps(2)(1).bits.data.expect(0x01234567L.U)
-      // c.io.uncoalResps(2)(1).bits.source.expect(2.U)
+      // table lookup is combinational at the same cycle
+      c.io.uncoalResps(0)(0).valid.expect(true.B)
+      c.io.uncoalResps(1)(0).valid.expect(false.B)
+      c.io.uncoalResps(2)(0).valid.expect(true.B)
+      c.io.uncoalResps(3)(0).valid.expect(false.B)
+
+      // offset is counting from LSB
+      c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(0)(0).bits.source.expect(1.U)
+      c.io.uncoalResps(0)(1).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(0)(1).bits.source.expect(2.U)
+      c.io.uncoalResps(2)(0).bits.data.expect(0x89abcdefL.U)
+      c.io.uncoalResps(2)(0).bits.source.expect(2.U)
+      c.io.uncoalResps(2)(1).bits.data.expect(0x01234567L.U)
+      c.io.uncoalResps(2)(1).bits.source.expect(2.U)
     }
   }
 

From 226e1d2d84d245e1982606d9a9e4000cf802b8ed Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 18:30:15 -0700
Subject: [PATCH 09/10] Fix uncoalescer unittest even more

---
 .../scala/coalescing/CoalescingUnitTest.scala | 117 ++++++++++--------
 1 file changed, 62 insertions(+), 55 deletions(-)

diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala
index 19205d4..80342a1 100644
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -730,6 +730,7 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
 
   val nonCoalReqT = new NonCoalescedRequest(config)
   val coalReqT = new CoalescedRequest(config)
+
   it should "work in general case" in {
     test(new Uncoalescer(config, nonCoalReqT, coalReqT))
     // vcs helps with simulation time, but sometimes errors with
@@ -768,6 +769,7 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
       c.clock.step()
 
       c.io.coalReq.valid.poke(false.B)
+      c.io.invalidate.valid.poke(false.B)
 
       c.clock.step()
 
@@ -795,66 +797,71 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
     }
   }
 
-  // it should "uncoalesce when coalesced to the same word offset" in {
-  //   test(new Uncoalescer(uncoalescerTestConfig))
-  //   // .withAnnotations(Seq(VcsBackendAnnotation))
-  //   { c =>
-  //     val sourceId = 0.U
-  //     val four = c.io.newEntry.sizeEnumT.FOUR
-  //     c.io.coalReqValid.poke(true.B)
-  //     c.io.newEntry.source.poke(sourceId)
-  //     c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-  //     c.io.newEntry.lanes(0).reqs(0).source.poke(0.U)
-  //     c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-  //     c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-  //     c.io.newEntry.lanes(0).reqs(1).valid.poke(false.B)
-  //     c.io.newEntry.lanes(1).reqs(0).valid.poke(true.B)
-  //     c.io.newEntry.lanes(1).reqs(0).source.poke(1.U)
-  //     c.io.newEntry.lanes(1).reqs(0).offset.poke(1.U)
-  //     c.io.newEntry.lanes(1).reqs(0).sizeEnum.poke(four)
-  //     c.io.newEntry.lanes(1).reqs(1).valid.poke(false.B)
-  //     c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-  //     c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-  //     c.io.newEntry.lanes(2).reqs(0).offset.poke(1.U)
-  //     c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-  //     c.io.newEntry.lanes(2).reqs(1).valid.poke(false.B)
-  //     c.io.newEntry.lanes(3).reqs(0).valid.poke(true.B)
-  //     c.io.newEntry.lanes(3).reqs(0).source.poke(3.U)
-  //     c.io.newEntry.lanes(3).reqs(0).offset.poke(1.U)
-  //     c.io.newEntry.lanes(3).reqs(0).sizeEnum.poke(four)
-  //     c.io.newEntry.lanes(3).reqs(1).valid.poke(false.B)
+  it should "uncoalesce when coalesced to the same word offset" in {
+    test(new Uncoalescer(config, nonCoalReqT, coalReqT))
+    // .withAnnotations(Seq(VcsBackendAnnotation))
+    { c =>
+      // 4 lanes, queue depth 2
+      c.io.windowElts(0)(0).op.poke(0.U)
+      c.io.windowElts(0)(0).source.poke(0.U)
+      c.io.windowElts(0)(0).address.poke(0x4.U)
+      c.io.windowElts(0)(0).size.poke(2.U)
+      c.io.windowElts(1)(0).op.poke(0.U)
+      c.io.windowElts(1)(0).source.poke(1.U)
+      c.io.windowElts(1)(0).address.poke(0x4.U) // same word offset as lane 0
+      c.io.windowElts(1)(0).size.poke(2.U)
+      c.io.windowElts(2)(0).op.poke(0.U)
+      c.io.windowElts(2)(0).source.poke(2.U)
+      c.io.windowElts(2)(0).address.poke(0x4.U)
+      c.io.windowElts(2)(0).size.poke(2.U)
+      c.io.windowElts(3)(0).op.poke(0.U)
+      c.io.windowElts(3)(0).source.poke(3.U)
+      c.io.windowElts(3)(0).address.poke(0x4.U)
+      c.io.windowElts(3)(0).size.poke(2.U)
+      // indicate lanes used for coalescing
+      c.io.invalidate.valid.poke(true.B)
+      c.io.invalidate.bits(0).poke(0x1.U) // 2'b01 for enabling head
+      c.io.invalidate.bits(1).poke(0x1.U)
+      c.io.invalidate.bits(2).poke(0x1.U)
+      c.io.invalidate.bits(3).poke(0x1.U)
 
-  //     c.clock.step()
+      val sourceId = 0.U
+      c.io.coalReq.valid.poke(true.B)
+      c.io.coalReq.bits.source.poke(sourceId)
+      c.io.coalReq.ready.expect(true.B)
 
-  //     c.io.coalReqValid.poke(false.B)
+      c.clock.step()
 
-  //     c.clock.step()
+      c.io.coalReq.valid.poke(false.B)
+      c.io.invalidate.valid.poke(false.B)
 
-  //     c.io.coalResp.valid.poke(true.B)
-  //     c.io.coalResp.bits.source.poke(sourceId)
-  //     val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
-  //     c.io.coalResp.bits.data.poke(lit.U)
+      c.clock.step()
 
-  //     // table lookup is combinational at the same cycle
-  //     // offset is counting from LSB
-  //     c.io.uncoalResps(0)(0).valid.expect(true.B)
-  //     c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
-  //     c.io.uncoalResps(0)(0).bits.source.expect(0.U)
-  //     c.io.uncoalResps(0)(1).valid.expect(false.B)
-  //     c.io.uncoalResps(1)(0).valid.expect(true.B)
-  //     c.io.uncoalResps(1)(0).bits.data.expect(0x5ca1ab1eL.U)
-  //     c.io.uncoalResps(1)(0).bits.source.expect(1.U)
-  //     c.io.uncoalResps(1)(1).valid.expect(false.B)
-  //     c.io.uncoalResps(2)(0).valid.expect(true.B)
-  //     c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U)
-  //     c.io.uncoalResps(2)(0).bits.source.expect(2.U)
-  //     c.io.uncoalResps(2)(1).valid.expect(false.B)
-  //     c.io.uncoalResps(3)(0).valid.expect(true.B)
-  //     c.io.uncoalResps(3)(0).bits.data.expect(0x5ca1ab1eL.U)
-  //     c.io.uncoalResps(3)(0).bits.source.expect(3.U)
-  //     c.io.uncoalResps(3)(1).valid.expect(false.B)
-  //   }
-  // }
+      c.io.coalResp.valid.poke(true.B)
+      c.io.coalResp.bits.source.poke(sourceId)
+      val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
+      c.io.coalResp.bits.data.poke(lit.U)
+
+      // table lookup is combinational at the same cycle
+      // offset is counting from LSB
+      c.io.uncoalResps(0)(0).valid.expect(true.B)
+      c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(0)(0).bits.source.expect(0.U)
+      c.io.uncoalResps(0)(1).valid.expect(false.B)
+      c.io.uncoalResps(1)(0).valid.expect(true.B)
+      c.io.uncoalResps(1)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(1)(0).bits.source.expect(1.U)
+      c.io.uncoalResps(1)(1).valid.expect(false.B)
+      c.io.uncoalResps(2)(0).valid.expect(true.B)
+      c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(2)(0).bits.source.expect(2.U)
+      c.io.uncoalResps(2)(1).valid.expect(false.B)
+      c.io.uncoalResps(3)(0).valid.expect(true.B)
+      c.io.uncoalResps(3)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(3)(0).bits.source.expect(3.U)
+      c.io.uncoalResps(3)(1).valid.expect(false.B)
+    }
+  }
 }
 
 class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {

From 9b7080a852788a71938c3ddad478c7df00a939ab Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 18:50:47 -0700
Subject: [PATCH 10/10] Delete old inflight table unittest

---
 .../scala/coalescing/CoalescingUnitTest.scala | 135 ------------------
 1 file changed, 135 deletions(-)

diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala
index 80342a1..546abad 100644
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -863,138 +863,3 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
     }
   }
 }
-
-class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {
-  behavior of "inflight coalesced request table"
-  val numLanes = 4
-  val numPerLaneReqs = 2
-  val sourceWidth = 2
-  val entries = 4
-
-  val offsetBits = 4
-  val sizeBits = 2
-
-  val inflightCoalReqTableEntry =
-    new InflightCoalReqTableEntry(
-      numLanes,
-      numPerLaneReqs,
-      sourceWidth,
-      offsetBits,
-      testConfig.sizeEnum
-    )
-
-  // it should "stop enqueueing when full" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries)) { c =>
-  //     // fill up the table
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(sourceId.U)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //       c.io.lookup.ready.poke(false.B)
-  //       c.clock.step()
-  //     }
-
-  //     // now cannot enqueue any more
-  //     c.io.enq.ready.expect(false.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(0.U)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //     c.clock.step()
-  //     c.io.enq.ready.expect(false.B)
-
-  //     // try to lookup all existing entries
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.valid.poke(false.B)
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(sourceId)
-  //       c.io.lookup.valid.expect(true.B)
-  //       c.io.lookup.bits.expect(sourceId)
-  //       c.clock.step()
-  //     }
-
-  //     // now the table should be empty
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.valid.poke(false.B)
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(sourceId)
-  //       c.io.lookup.valid.expect(false.B)
-  //       c.clock.step()
-  //     }
-  //   }
-  // }
-  // it should "lookup matching entry" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries))
-  //     .withAnnotations(Seq(WriteVcdAnnotation)) { c =>
-  //       c.reset.poke(true.B)
-  //       c.clock.step(10)
-  //       c.reset.poke(false.B)
-
-  //       // enqueue one entry to not match at 0th index
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(0.U)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //       c.clock.step()
-
-  //       val targetSourceId = 1.U
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(targetSourceId)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //       c.clock.step()
-
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(targetSourceId)
-  //       c.io.lookup.valid.expect(true.B)
-  //       c.io.lookup.bits.expect(targetSourceId)
-
-  //       c.clock.step()
-
-  //       // test if matching entry dequeues after 1 cycle
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(targetSourceId)
-  //       c.io.lookup.valid.expect(false.B)
-  //     }
-  // }
-  // it should "handle lookup and enqueue at the same time" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries)) { c =>
-  //     // fill up the table
-  //     val targetSourceId = 1.U
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(0.U)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.clock.step()
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(targetSourceId)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.clock.step()
-
-  //     // do both enqueue and lookup at the same cycle
-  //     val enqSourceId = 2.U
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(enqSourceId)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.io.lookup.ready.poke(true.B)
-  //     c.io.lookupSourceId.poke(targetSourceId)
-
-  //     c.clock.step()
-  //   }
-  // }
-}