diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index d63e9cf..f309b9f 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -4,7 +4,6 @@ package freechips.rocketchip.tilelink
 
 import chisel3._
 import chisel3.util._
-import chisel3.experimental.ChiselEnum
 import org.chipsalliance.cde.config.{Parameters, Field}
 import freechips.rocketchip.diplomacy._
 // import freechips.rocketchip.devices.tilelink.TLTestRAM
@@ -42,6 +41,13 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
   }
 }
 
+// Mapping to reference model param names
+//  numLanes: Int, <-> config.NUM_LANES
+//  numPerLaneReqs: Int, <-> config.DEPTH
+//  sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
+//  sizeWidth: Int, <-> config.sizeEnum.width
+//  coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
+//  numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
 case class CoalescerConfig(
   enable: Boolean,        // globally enable or disable coalescing
   numLanes: Int,          // number of lanes (or threads) in a warp
@@ -135,7 +141,11 @@ class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends La
   lazy val module = new CoalescingUnitImp(this, config)
 }
 
-class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int) extends Bundle {
+// Protocol-agnostic bundles that represent a request and a response to the
+// coalescer.
+
+class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int)
+    extends Bundle {
   require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
   val op = UInt(1.W) // 0=READ 1=WRITE
   val address = UInt(addressWidth.W)
@@ -149,7 +159,7 @@ class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWid
       fromSource = this.source,
       toAddress = this.address,
       lgSize = this.size,
-      data = this.data,
+      data = this.data
     )
     val (glegal, gbits) = edgeOut.Get(
       fromSource = this.source,
@@ -162,8 +172,24 @@ class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWid
     bits
   }
 }
+case class NonCoalescedRequest(config: CoalescerConfig) // Request with word-sized data; widths come from config
+    extends Request(
+      sourceWidth = log2Ceil(config.numOldSrcIds), // bits needed to index numOldSrcIds source IDs
+      sizeWidth = config.wordSizeWidth,
+      addressWidth = config.addressWidth,
+      dataWidth = config.wordSizeInBytes * 8 // one word, converted from bytes to bits
+    )
+case class CoalescedRequest(config: CoalescerConfig) // Request wide enough for the largest coalesced size
+    extends Request(
+      sourceWidth = log2Ceil(config.numNewSrcIds), // bits needed to index numNewSrcIds source IDs
+      sizeWidth = log2Ceil(config.maxCoalLogSize), // NOTE(review): too narrow to hold maxCoalLogSize itself if it is a power of two -- confirm
+      addressWidth = config.addressWidth,
+      dataWidth = (8 * (1 << config.maxCoalLogSize)) // 2^maxCoalLogSize bytes, expressed in bits
+    )
 
-class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
+class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int)
+    extends Bundle {
+  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
   val op = UInt(1.W) // 0=READ 1=WRITE
   val size = UInt(sizeWidth.W)
   val source = UInt(sourceWidth.W)
@@ -191,10 +217,23 @@ class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends B
     this.error := bundle.denied
   }
 }
+case class NonCoalescedResponse(config: CoalescerConfig) // Response with word-sized data; widths come from config
+    extends Response(
+      sourceWidth = log2Ceil(config.numOldSrcIds), // bits needed to index numOldSrcIds source IDs
+      sizeWidth = config.wordSizeWidth,
+      dataWidth = config.wordSizeInBytes * 8 // one word, converted from bytes to bits
+    )
+case class CoalescedResponse(config: CoalescerConfig) // Response wide enough for the largest coalesced size
+    extends Response(
+      sourceWidth = log2Ceil(config.numNewSrcIds), // bits needed to index numNewSrcIds source IDs
+      sizeWidth = log2Ceil(config.maxCoalLogSize), // NOTE(review): too narrow to hold maxCoalLogSize itself if it is a power of two -- confirm
+      dataWidth = (8 * (1 << config.maxCoalLogSize)) // 2^maxCoalLogSize bytes, expressed in bits
+    )
 
 // If `ignoreInUse`, just keep giving out new IDs without checking if it is in
 // use.
-class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
+class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
+    extends Module {
   val io = IO(new Bundle {
     val gen = Input(Bool())
     val reclaim = Input(Valid(UInt(sourceWidth.W)))
@@ -213,15 +252,16 @@ class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) e
 
   io.id.valid := (if (ignoreInUse) true.B else !occupancyTable(head).valid)
   io.id.bits := head
-  when (io.gen && io.id.valid /* fire */) {
+  when(io.gen && io.id.valid /* fire */ ) {
     occupancyTable(io.id.bits).valid := true.B // mark in use
   }
-  when (io.reclaim.valid) {
+  when(io.reclaim.valid) {
     occupancyTable(io.reclaim.bits).valid := false.B // mark freed
   }
 }
 
-class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module {
+class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
+    extends Module {
   val io = IO(new Bundle {
     val queue = new Bundle {
       val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
@@ -238,7 +278,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 //  eltPrototype.valid := false.B
 
   val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
-  val writePtr = RegInit(VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W))))
+  val writePtr = RegInit(
+    VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
+  )
   val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))
 
   private def resetElts = {
@@ -249,7 +291,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
       }
     }
   }
-  when (reset.asBool) {
+  when(reset.asBool) {
     resetElts
   }
 
@@ -265,14 +307,17 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
   // current cycle.
   //
   // shift hint is when the heads have no more coalescable left this or next cycle
-  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
-    c && !(io.invalidate.valid && inv)
-  }.reduce(_ || _)
+  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0)))
+    .map { case (c, inv) =>
+      c && !(io.invalidate.valid && inv)
+    }
+    .reduce(_ || _)
   val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
   // valid && !fire means we enable enqueueing to a full queue, provided the
   // arbiter is taking away all remaining valid queue heads in the next cycle so
   // that we make space for the entire next warp.
-  val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
+  val syncedDeqValidNextCycle =
+    io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
 
   for (i <- 0 until config.numLanes) {
     val enq = io.queue.enq(i)
@@ -292,20 +337,22 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
     // can take new entries if not empty, or if full but shifting
     enq.ready := (!ctrl.full) || ctrl.shift
 
-    when (ctrl.shift) {
+    when(ctrl.shift) {
       // shift, invalidate tail, invalidate coalesced requests
       elts(i).zipWithIndex.foreach { case (elt, j) =>
         if (j == entries - 1) { // tail
           elt.valid := false.B
         } else {
           elt.bits := elts(i)(j + 1).bits
-          elt.valid := elts(i)(j + 1).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
+          elt.valid := elts(i)(
+            j + 1
+          ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
         }
       }
       // reset dequeue mask when new entries are shifted in
       deqDone(i) := false.B
       // enqueue
-      when (enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
+      when(enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
         elts(i)(writePtr(i) - 1.U).bits := enq.bits
         elts(i)(writePtr(i) - 1.U).valid := enq.valid
       }.otherwise {
@@ -313,13 +360,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
       }
     }.otherwise {
       // invalidate coalesced requests
-      when (io.invalidate.valid) {
+      when(io.invalidate.valid) {
         (elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) =>
           elt.valid := elt.valid && !inv
         }
       }
       // enqueue
-      when (enq.ready && syncedEnqValid) {
+      when(enq.ready && syncedEnqValid) {
         elts(i)(writePtr(i)).bits := enq.bits
         elts(i)(writePtr(i)).valid := enq.valid
         writePtr(i) := writePtr(i) + 1.U
@@ -331,8 +378,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
   // When doing spatial-only coalescing, queues should never drift from each
   // other, i.e. the queue heads should always contain mem requests from the
   // same instruction.
-  val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
-    writePtr.map(_ === writePtr.head).reduce(_ && _)
+  val queueInSync =
+    controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
+      writePtr.map(_ === writePtr.head).reduce(_ && _)
   assert(queueInSync, "shift queue lanes are not in sync")
 
   io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt)
@@ -340,18 +388,23 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 }
 
 // Software model: coalescer.py
-class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
-                    config: CoalescerConfig) extends Module {
+class MonoCoalescer(
+    config: CoalescerConfig,
+    coalLogSize: Int,
+    queueT: CoalShiftQueue[NonCoalescedRequest]
+) extends Module {
   val io = IO(new Bundle {
-    val window = Input(windowT.io.cloneType)
+    val window = Input(queueT.io.cloneType)
     val results = Output(new Bundle {
       val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
       val baseAddr = Output(UInt(config.addressWidth.W))
       val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
       // number of entries matched with this leader lane's head.
       // maximum is numLanes * queueDepth
-      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
-      val coverageHits = Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
+      val matchCount =
+        Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
+      val coverageHits =
+        Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
       val canCoalesce = Output(Vec(config.numLanes, Bool()))
     })
   })
@@ -365,9 +418,13 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
   val leadersValid = io.window.mask.map(_.asBools.head)
 
   def printQueueHeads = {
-    leaders.zipWithIndex.foreach{ case (head, i) =>
-      printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
-        leadersValid(i), head.source, head.address)
+    leaders.zipWithIndex.foreach { case (head, i) =>
+      printf(
+        s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
+        leadersValid(i),
+        head.source,
+        head.address
+      )
     }
   }
   // when (leadersValid.reduce(_ || _)) {
@@ -376,7 +433,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
 
   val size = coalLogSize
   val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
-  def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = {
+  def canMatch(req0: Request, req0v: Bool, req1: Request, req1v: Bool): Bool = {
     (req0.op === req1.op) &&
     (req0v && req1v) &&
     ((req0.address & this.addrMask) === (req1.address & this.addrMask))
@@ -385,34 +442,42 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
   // Gives a 2-D table of Bools representing match at every queue entry,
   // for each lane (so 3-D in total).
   // dimensions: (leader lane, follower lane, follower entry)
-  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
-    (io.window.elts zip io.window.mask).map { case (followers, followerValids) =>
-      // compare leader's head against follower's every queue entry
-      (followers zip followerValids.asBools).map { case (follower, followerValid) =>
-        canMatch(follower, followerValid, leader, leaderValid)
-        // FIXME: disabling halving optimization because it does not give the
-        // correct per-lane coalescable indication to the shift queue
-          // // match leader to only followers at lanes >= leader idx
-          // // this halves the number of comparators
-          // if (followerIndex < leaderIndex) false.B
-          // else canMatch(follower, followerValid, leader, leaderValid)
+  val matchTablePerLane = (leaders zip leadersValid).map {
+    case (leader, leaderValid) =>
+      (io.window.elts zip io.window.mask).map {
+        case (followers, followerValids) =>
+          // compare leader's head against follower's every queue entry
+          (followers zip followerValids.asBools).map {
+            case (follower, followerValid) =>
+              canMatch(follower, followerValid, leader, leaderValid)
+            // FIXME: disabling halving optimization because it does not give the
+            // correct per-lane coalescable indication to the shift queue
+            // // match leader to only followers at lanes >= leader idx
+            // // this halves the number of comparators
+            // if (followerIndex < leaderIndex) false.B
+            // else canMatch(follower, followerValid, leader, leaderValid)
+          }
       }
-    }
   }
 
   val matchCounts = matchTablePerLane.map(table =>
-      table.map(PopCount(_)) // sum up each column
-           .reduce(_ +& _))
+    table
+      .map(PopCount(_)) // sum up each column
+      .reduce(_ +& _)
+  )
   val canCoalesce = matchCounts.map(_ > 1.U)
 
   // Elect the leader that has the most match counts.
   // TODO: potentially expensive: magnitude comparator
   def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
-    matchCounts.zipWithIndex.map {
-      case (c, i) => (c, i.U)
-    }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
+    matchCounts.zipWithIndex
+      .map { case (c, i) =>
+        (c, i.U)
+      }
+      .reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
         (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
-    }._2
+      }
+      ._2
   }
   // Elect leader by choosing the smallest-index lane that has a valid
   // match, i.e. using priority encoder.
@@ -422,8 +487,9 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
   val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)
 
   val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
-  // matchTable for the chosen lane, but converted to a Vec[UInt]
-  val chosenMatches = VecInit(matchTablePerLane.map{ table =>
+  // matchTable for the chosen lane, but each column converted to bitflags,
+  // i.e. Vec[UInt]
+  val chosenMatches = VecInit(matchTablePerLane.map { table =>
     VecInit(table.map(VecInit(_).asUInt))
   })(chosenLeaderIdx)
   val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)
@@ -431,18 +497,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
   // coverage calculation
   def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth)
   // 2-D table flattened to 1-D
-  val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
+  val offsets =
+    io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
   val valids = chosenMatches.flatMap(_.asBools)
   // indicates for each word in the coalesced chunk whether it is accessed by
   // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
   // words in the coalesced data coming back will be accessed by some request
   // and we've reached 100% bandwidth utilization.
   val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { target =>
-    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+    (offsets zip valids)
+      .map { case (offset, valid) => valid && (offset === target.U) }
+      .reduce(_ || _)
   }
 
   // debug prints
-  when (leadersValid.reduce(_ || _)) {
+  when(leadersValid.reduce(_ || _)) {
     matchCounts.zipWithIndex.foreach { case (count, i) =>
       printf(s"lane[${i}] matchCount = %d\n", count);
     }
@@ -471,20 +540,28 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
 // coalesced request out of all possible combinations.
 //
 // Software model: coalescer.py
-class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry,
-                     config: CoalescerConfig) extends Module {
+class MultiCoalescer(
+    config: CoalescerConfig,
+    queueT: CoalShiftQueue[NonCoalescedRequest],
+    coalReqT: Request,
+) extends Module {
+  val invalidateT = Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))
   val io = IO(new Bundle {
     // coalescing window, connected to the contents of the request queues
-    val window = Input(windowT.io.cloneType)
+    val window = Input(queueT.io.cloneType)
     // generated coalesced request
     val coalReq = DecoupledIO(coalReqT.cloneType)
-    // invalidate signals going into each request queue's head
-    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
-    // whether a lane is coalescable
+    // invalidate signals going into each request queue's head.  Lanes with
+    // high invalidate bits are what became coalesced into the new request.
+    val invalidate = Output(invalidateT)
+    // whether a lane is coalescable.  This is used to output non-coalescable
+    // lanes to the arbiter so they can be flushed to downstream.
     val coalescable = Output(Vec(config.numLanes, Bool()))
   })
 
-  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  val coalescers = config.coalLogSizes.map(size =>
+    Module(new MonoCoalescer(config, size, queueT))
+  )
   coalescers.foreach(_.io.window := io.window)
 
   def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
@@ -509,9 +586,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
   val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
   val chosenValid = Wire(Bool())
   // minimum 25% coverage
-  val minCoverage = 1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
+  val minCoverage =
+    1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
 
-  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
+  when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
     chosenSizeIdx := argMax(normalizedHits)
     chosenValid := true.B
     printf("coalescing success by coverage policy\n")
@@ -541,9 +619,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
   val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)
 
   // check for word alignment in addresses
-  assert(io.window.elts.flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)).zip(
-    io.window.mask.flatMap(_.asBools)).map { case (aligned, valid) => (!valid) || aligned }.reduce(_ || _),
-    "one or more addresses used for coalescing is not word-aligned")
+  assert(
+    io.window.elts
+      .flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)) // low bits zero => word-aligned
+      .zip(io.window.mask.flatMap(_.asBools))
+      .map { case (aligned, valid) => (!valid) || aligned } // invalid entries are exempt from the check
+      .reduce(_ && _), // ALL entries must pass; OR-reduction would hold if any single entry passed
+    "one or more addresses used for coalescing is not word-aligned"
+  )
 
   // note: this is word-level coalescing. if finer granularity is needed, need to modify code
   val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt
@@ -558,18 +641,29 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
     val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
       // note: ANDing against addrMask is to conform to active byte lanes requirements
       // if aligning to LSB suffices, we should add the bitwise AND back
-      m && ((req.address(config.maxCoalLogSize - 1, config.wordSizeWidth)/* & addrMask*/) === i.U)
+      m && ((req.address(
+        config.maxCoalLogSize - 1,
+        config.wordSizeWidth
+      ) /* & addrMask*/ ) === i.U)
     }
     // TODO: SW uses priority encoder, not sure about behavior of MuxCase
-    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.data
-    })
-    mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.mask
-    })
+    data(i) := MuxCase(
+      DontCare,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.data
+      }
+    )
+    mask(i) := MuxCase(
+      0.U,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.mask
+      }
+    )
   }
 
-  val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds)))
+  val sourceGen = Module(
+    new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds))
+  )
   sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created
   sourceGen.io.reclaim.valid := false.B // not used
   sourceGen.io.reclaim.bits := DontCare // not used
@@ -587,7 +681,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
   io.invalidate.bits := chosenBundle.matchOH
   io.invalidate.valid := io.coalReq.fire // invalidate only when fire
 
-  io.coalescable := coalescers.map(_.io.results.canCoalesce.asUInt).reduce(_ | _).asBools
+  io.coalescable := coalescers
+    .map(_.io.results.canCoalesce.asUInt)
+    .reduce(_ | _)
+    .asBools
 
   dontTouch(io.invalidate) // debug
 
@@ -599,26 +696,36 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
   if (!config.enable) disable
 }
 
-class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) {
-  require(outer.cpuNode.in.length == config.numLanes,
+class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
+    extends LazyModuleImp(outer) {
+  require(
+    outer.cpuNode.in.length == config.numLanes,
     s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
-    s"config.numLanes (${config.numLanes})")
-  require(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
+      s"config.numLanes (${config.numLanes})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
     s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
-    s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})")
-  require(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
+      s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
     s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
-    s"mismatch with config.addressWidth (${config.addressWidth})")
+      s"mismatch with config.addressWidth (${config.addressWidth})"
+  )
+  require(
+    config.maxCoalLogSize <= config.dataBusWidth,
+    "multi-beat coalesced reads/writes are currently not supported"
+  )
 
-  val sourceWidth = outer.cpuNode.in.head._1.params.sourceBits
-  // note we are using word size. assuming all coalescer inputs are word sized
-  val reqQueueEntryT = new ReqQueueEntry(sourceWidth, config.wordSizeWidth,
-    config.addressWidth, (config.wordSizeInBytes * 8))
-  val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
+  val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
+  val nonCoalReqT = new NonCoalescedRequest(config)
+  val reqQueues = Module(
+    new CoalShiftQueue(nonCoalReqT, config.queueDepth, config)
+  )
 
-  val coalReqT = new ReqQueueEntry(log2Ceil(config.numNewSrcIds), log2Ceil(config.maxCoalLogSize),
-    config.addressWidth, (1 << config.maxCoalLogSize) * 8)
-  val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
+  val coalReqT = new CoalescedRequest(config)
+  val coalescer = Module(new MultiCoalescer(config, reqQueues, coalReqT))
   coalescer.io.window := reqQueues.io
   reqQueues.io.coalescable := coalescer.io.coalescable
   reqQueues.io.invalidate := coalescer.io.invalidate
@@ -634,7 +741,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
     case (((tlIn, _), (tlOut, edgeOut)), lane) =>
       // Request queue
-      val req = Wire(reqQueueEntryT)
+      val req = Wire(nonCoalReqT)
 
       req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
       req.source := tlIn.a.bits.source
@@ -691,6 +798,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
   tlCoal.e.valid := false.B
 
+  require(
+    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
+    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
+      + s" (${(1 << config.dataBusWidth) * 8})"
+  )
 
   // ===========================================================================
   // Response flow
@@ -703,8 +815,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   // coalesced request.  Upper bound is min(DEPTH, 2**sourceWidth).
   val numPerLaneReqs = config.queueDepth
 
-  val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize),
-    (1 << config.maxCoalLogSize) * 8)
+  // FIXME: no need to contain maxCoalLogSize data
+  val respQueueEntryT = new Response(
+    oldSourceWidth,
+    log2Ceil(config.maxCoalLogSize),
+    (1 << config.maxCoalLogSize) * 8
+  )
   val respQueues = Seq.tabulate(config.numLanes) { _ =>
     Module(
       new MultiPortQueue(
@@ -776,72 +892,41 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
       dontTouch(tlOut.d)
   }
 
-  // Construct new entry for the inflight table
-  // FIXME: don't instantiate inflight table entry type here.  It leaks the table's impl
-  // detail to the coalescer
-
-  // richard: I think a good idea is to pass Valid[ReqQueueEntry] generated by
-  // the coalescer directly into the uncoalescer, so that we can offload the
-  // logic to generate the Inflight Entry into the uncoalescer, where it should be.
-  // this also reduces top level clutter.
-
-  val uncoalescer = Module(new Uncoalescer(config))
-
-  val newEntry = Wire(uncoalescer.inflightTable.entryT)
-  newEntry.source := coalescer.io.coalReq.bits.source
-
-  assert (config.maxCoalLogSize <= config.dataBusWidth,
-    "multi-beat coalesced reads/writes are currently not supported")
-  assert (
-    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
-    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
-    + s" (${(1 << config.dataBusWidth) * 8})"
+  val uncoalescer = Module(
+    new Uncoalescer(config, nonCoalReqT, coalReqT)
   )
+  // connect coalesced request that is newly generated and being recorded in
+  // the uncoalescer
+  uncoalescer.io.coalReq <> coalescer.io.coalReq
+  uncoalescer.io.invalidate := coalescer.io.invalidate
   val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
-  // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
-  // coalescer to every (numLanes * queueDepth) entry in the inflight table.
-  (newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex
-    .foreach { case ((laneEntry, laneInv), lane) =>
-      (laneEntry.reqs zip laneInv.asBools).zipWithIndex
-        .foreach { case ((reqEntry, inv), i) =>
-          val req = reqQueues.io.elts(lane)(i)
-          when ((coalescer.io.invalidate.valid && inv)) {
-            printf(s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source)
-          }
-          reqEntry.valid := (coalescer.io.invalidate.valid && inv)
-          reqEntry.source := req.source
-          reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth)
-          reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
-          // TODO: load/store op
-        }
-    }
-  dontTouch(newEntry)
-
-  uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid
-  uncoalescer.io.newEntry := newEntry
+  uncoalescer.io.windowElts := reqQueues.io.elts
+  // connect coalesced response going into the uncoalescer, ready to be
+  // uncoalesced
   // Cleanup: custom <>?
   uncoalescer.io.coalResp.valid := tlCoal.d.valid
-  uncoalescer.io.coalResp.bits.source := tlCoal.d.bits.source
-  uncoalescer.io.coalResp.bits.data := tlCoal.d.bits.data
+  uncoalescer.io.coalResp.bits.fromTLD(tlCoal.d.bits)
+  // uncoalescer backpressure
   tlCoal.d.ready := uncoalescer.io.coalResp.ready
 
   // Connect uncoalescer results back into each lane's response queue
-  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) =>
-    perLaneResps.zipWithIndex.foreach { case (resp, i) =>
-      // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
-      // cache.  This should ideally not happen though.
-      assert(
-        q.io.enq(respQueueUncoalPortOffset + i).ready,
-        s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
-      )
-      q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
-      q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
-      // debug
-      // when (resp.valid) {
-      //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
-      // }
-      // dontTouch(q.io.enq(respQueueCoalPortOffset))
-    }
+  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach {
+    case ((q, perLaneResps), lane) =>
+      perLaneResps.zipWithIndex.foreach { case (resp, i) =>
+        // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
+        // cache.  This should ideally not happen though.
+        assert(
+          q.io.enq(respQueueUncoalPortOffset + i).ready,
+          s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
+        )
+        q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
+        q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
+        // debug
+        // when (resp.valid) {
+        //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
+        // }
+        // dontTouch(q.io.enq(respQueueCoalPortOffset))
+      }
   }
 
   // Debug
@@ -853,67 +938,78 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   dontTouch(tlCoal.d)
 }
 
-// Protocol-agnostic bundle that represents a coalesced response.
-//
-// Having this makes it easier to:
-//   * do unit tests -- no need to deal with TileLink in the chiseltest code
-//   * adapt coalescer to custom protocols like a custom L1 cache interface.
-//
-// FIXME: overlaps with RespQueueEntry. Trait-ify
-class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle {
-  val source = UInt(log2Ceil(config.numNewSrcIds).W)
-  val data = UInt((8 * (1 << config.maxCoalLogSize)).W)
-
-  def fromTLD(bundle:TLBundleD): Unit = {
-    this.source := bundle.source
-    this.data   := bundle.data
-  }
-
-}
-
-class Uncoalescer(config: CoalescerConfig) extends Module {
-  // notes to hansung:
-  //  val numLanes: Int, <-> config.NUM_LANES
-  //  val numPerLaneReqs: Int, <-> config.DEPTH
-  //  val sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
-  //  val sizeWidth: Int, <-> config.sizeEnum.width
-  //  val coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
-  //  val numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
+class Uncoalescer(
+    config: CoalescerConfig,
+    nonCoalReqT: NonCoalescedRequest,
+    coalReqT: CoalescedRequest,
+) extends Module {
   val inflightTable = Module(new InflightCoalReqTable(config))
   val io = IO(new Bundle {
-    val coalReqValid = Input(Bool())
-    // FIXME: receive ReqQueueEntry and construct newEntry inside uncoalescer
-    val newEntry = Input(inflightTable.entryT.cloneType)
-    val coalResp = Flipped(Decoupled(new CoalescedResponseBundle(config)))
+    // generated coalesced request, connected to the output of the coalescer.
+    val coalReq = Flipped(DecoupledIO(coalReqT.cloneType))
+    // invalidate signal coming out of coalescer.
+    val invalidate = Input(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
+    // coalescing window, connected to the contents of the request queues.
+    // Uncoalescer looks at the queue entries that got coalesced into `coalReq`
+    // in order to record which lanes this coalReq originally came from.
+    // We only care about window.elts because the coalescer would have made
+    // sure it only looked at the valid entries.
+    // TODO: duplicate type construction
+    val windowElts = Input(Vec(config.numLanes, Vec(config.queueDepth, nonCoalReqT)))
+    val coalResp = Flipped(Decoupled(new CoalescedResponse(config)))
     val uncoalResps = Output(
       Vec(
         config.numLanes,
-        Vec(
-          config.queueDepth,
-          ValidIO(
-            new RespQueueEntry(log2Ceil(config.numOldSrcIds), config.wordSizeWidth,
-              config.wordSizeInBytes * 8)
-          )
-        )
+        Vec(config.queueDepth, ValidIO(new NonCoalescedResponse(config)))
       )
     )
   })
 
-  // Populate inflight table
-  inflightTable.io.enq.valid := io.coalReqValid
-  inflightTable.io.enq.bits := io.newEntry
+  // Uncoalescer has to be always ready to accept and record new coalesced
+  // requests, so that it doesn't stall the coalescer.
+  io.coalReq.ready := true.B
+
+  // Construct a new entry for the inflight table using generated coalesced request
+  def generateInflightTableEntry: InflightCoalReqTableEntry = {
+    val newEntry = Wire(inflightTable.entryT)
+    newEntry.source := io.coalReq.bits.source
+    // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
+    // coalescer to every (numLanes * queueDepth) entry in the inflight table.
+    (newEntry.lanes zip io.invalidate.bits).zipWithIndex
+      .foreach { case ((laneEntry, laneInv), lane) =>
+        (laneEntry.reqs zip laneInv.asBools).zipWithIndex
+          .foreach { case ((reqEntry, inv), i) =>
+            val req = io.windowElts(lane)(i)
+            when((io.invalidate.valid && inv)) {
+              printf(
+                s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
+                req.source
+              )
+            }
+            reqEntry.valid := (io.invalidate.valid && inv)
+            reqEntry.source := req.source
+            reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth)
+            reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
+            // TODO: load/store op
+          }
+      }
+    assert(
+      !((io.coalReq.valid === true.B) && (io.coalResp.valid === true.B) &&
+        (newEntry.source === io.coalResp.bits.source)),
+      "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
+    )
+    dontTouch(newEntry)
+
+    newEntry
+  }
+  inflightTable.io.enq.valid := io.coalReq.valid
+  inflightTable.io.enq.bits := generateInflightTableEntry
 
   // Look up the table with incoming coalesced responses
   inflightTable.io.lookup.ready := io.coalResp.valid
   inflightTable.io.lookupSourceId := io.coalResp.bits.source
   io.coalResp.ready := true.B // FIXME, see sw model implementation
 
-  assert(
-    !((io.coalReqValid === true.B) && (io.coalResp.valid === true.B) &&
-      (io.newEntry.source === io.coalResp.bits.source)),
-    "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
-  )
-
   // Un-coalescing logic
   //
   def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = {
@@ -972,7 +1068,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module {
 // split the coalesced response back to individual per-lane responses with the
 // right metadata.
 class InflightCoalReqTable(config: CoalescerConfig) extends Module {
-  val offsetBits = config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
+  val offsetBits =
+    config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
   val entryT = new InflightCoalReqTableEntry(
     config.numLanes,
     config.queueDepth,
@@ -1019,7 +1116,7 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module {
   }
 
   val full = Wire(Bool())
-  full := (0 until entries).map( table(_).valid ).reduce( _ && _ )
+  full := (0 until entries).map(table(_).valid).reduce(_ && _)
   assert(!full, "inflight table is full and blocking coalescer")
   dontTouch(full)
 
@@ -1094,8 +1191,12 @@ object TLUtils {
 // `traceHasSource` is true if the input trace file has an additional source
 // ID column.  This is useful for using the output trace file genereated by
 // MemTraceLogger as the driver.
-class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: Boolean = false)
-  (implicit p: Parameters) extends LazyModule {
+class MemTraceDriver(
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean = false
+)(implicit p: Parameters)
+    extends LazyModule {
   // Create N client nodes together
   val laneNodes = Seq.tabulate(config.numLanes) { i =>
     val clientParam = Seq(
@@ -1113,7 +1214,8 @@ class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource:
   val node = TLIdentityNode()
   laneNodes.foreach { l => node := l }
 
-  lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource)
+  lazy val module =
+    new MemTraceDriverImp(this, config, filename, traceHasSource)
 }
 
 trait HasTraceLine {
@@ -1136,9 +1238,12 @@ class TraceLine extends Bundle with HasTraceLine {
   val data = UInt(64.W)
 }
 
-class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename: String,
-  traceHasSource: Boolean)
-    extends LazyModuleImp(outer)
+class MemTraceDriverImp(
+    outer: MemTraceDriver,
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean
+) extends LazyModuleImp(outer)
     with UnitTestModule {
   // Current cycle mark to read from trace
   val traceReadCycle = RegInit(1.U(64.W))
@@ -1176,7 +1281,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
 
   // Not all fire because trace cycle has to advance even when there is no valid
   // line in the trace.
-  when (reqQueueAllReady){
+  when(reqQueueAllReady) {
     traceReadCycle := traceReadCycle + 1.U
   }
 
@@ -1216,11 +1321,16 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
     sizeInBytes := (1.U) << req.size
     mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
     wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
-    val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
+    val wordAlignedAddress =
+      req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
     val wordAlignedSize = Mux(subword, 2.U, req.size)
 
-    val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numOldSrcIds),
-      ignoreInUse = false))
+    val sourceGen = Module(
+      new RoundRobinSourceGenerator(
+        log2Ceil(config.numOldSrcIds),
+        ignoreInUse = false
+      )
+    )
     sourceGen.io.gen := reqQ.io.deq.fire
     // assert(sourceGen.io.id.valid)
 
@@ -1229,7 +1339,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
       toAddress = hashToValidPhyAddr(wordAlignedAddress),
       lgSize = wordAlignedSize, // trace line already holds log2(size)
       // data should be aligned to beatBytes
-      data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
+      data =
+        (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
     )
     val (glegal, gbits) = edge.Get(
       fromSource = sourceGen.io.id.bits,
@@ -1240,7 +1351,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
     val bits = Mux(req.is_store, pbits, gbits)
 
     tlOut.a.valid := (reqQ.io.deq.valid && sourceGen.io.id.valid)
-    when (tlOut.a.valid) {
+    when(tlOut.a.valid) {
       assert(legal, "illegal TL req gen")
     }
     tlOut.a.bits := bits
@@ -1288,9 +1399,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
 
 class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
     extends BlackBox(
-      Map("FILENAME" -> filename,
-          "NUM_LANES" -> numLanes,
-          "HAS_SOURCE" -> (if (traceHasSource) 1 else 0))
+      Map(
+        "FILENAME" -> filename,
+        "NUM_LANES" -> numLanes,
+        "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)
+      )
     )
     with HasBlackBoxResource {
   val traceLineT = new TraceLine
@@ -1304,19 +1417,20 @@ class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
 
     // These names have to match declarations in the Verilog code, eg.
     // trace_read_address.
-    val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source
-      val ready = Input(Bool())
-      val valid = Output(UInt(numLanes.W))
-      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
-      // single wide 1D array.
-      // TODO: assumes 64-bit address.
-      val cycle = Input(UInt(64.W))
-      val address = Output(UInt((addrW * numLanes).W))
-      val is_store = Output(UInt(numLanes.W))
-      val size = Output(UInt((sizeW * numLanes).W))
-      val data = Output(UInt((dataW * numLanes).W))
-      val finished = Output(Bool())
-    }
+    val trace_read =
+      new Bundle { // can't use HasTraceLine because this doesn't have source
+        val ready = Input(Bool())
+        val valid = Output(UInt(numLanes.W))
+        // Chisel can't interface with Verilog 2D port, so flatten all lanes into
+        // single wide 1D array.
+        // TODO: assumes 64-bit address.
+        val cycle = Input(UInt(64.W))
+        val address = Output(UInt((addrW * numLanes).W))
+        val is_store = Output(UInt(numLanes.W))
+        val size = Output(UInt((sizeW * numLanes).W))
+        val data = Output(UInt((dataW * numLanes).W))
+        val finished = Output(Bool())
+      }
   })
 
   addResource("/vsrc/SimMemTrace.v")
@@ -1443,11 +1557,11 @@ class MemTraceLogger(
 
         // This assert only holds true for PutFullData and not PutPartialData,
         // where HIGH bits in the mask may not be contiguous.
-        when (tlIn.a.valid) {
+        when(tlIn.a.valid) {
           assert(
             PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
             "mask HIGH popcount do not match the TL size. " +
-            "Partial masks are not allowed for PutFull"
+              "Partial masks are not allowed for PutFull"
           )
         }
         val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
@@ -1476,17 +1590,25 @@ class MemTraceLogger(
 
     // stats
     val numReqsThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
     val numRespsThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
     val reqBytesThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneReqs
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
     val respBytesThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneResps
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
     numReqs := numReqs + numReqsThisCycle
     numResps := numResps + numRespsThisCycle
     reqBytes := reqBytes + reqBytesThisCycle
@@ -1496,7 +1618,10 @@ class MemTraceLogger(
     //
     // This is a clunky workaround of the fact that Chisel doesn't allow partial
     // assignment to a bitfield range of a wide signal.
-    def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
+    def flattenTrace(
+        simIO: Bundle with HasTraceLine,
+        perLane: Vec[TraceLine]
+    ) = {
       // these will get optimized out
       val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
       val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
@@ -1592,8 +1717,14 @@ object TLPrintf {
       tlData: UInt,
       reqData: UInt
   ) = {
-    printf(s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
-      source, address, size, mask, is_store)
+    printf(
+      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
+      source,
+      address,
+      size,
+      mask,
+      is_store
+    )
     when(is_store) {
       printf(", tlData=%x, reqData=%x", tlData, reqData)
     }
@@ -1604,7 +1735,7 @@ object TLPrintf {
 // Synthesizable unit tests
 
 class DummyDriver(config: CoalescerConfig)(implicit p: Parameters)
-  extends LazyModule {
+    extends LazyModule {
   val laneNodes = Seq.tabulate(config.numLanes) { i =>
     val clientParam = Seq(
       TLMasterParameters.v1(
@@ -1640,7 +1771,10 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
     // generate dummy traffic to coalescer to prevent it from being optimized
     // out during synthesis
     val address = Wire(UInt(config.addressWidth.W))
-    address := Cat((finishCounter + (lane.U % 3.U)), 0.U(config.wordSizeWidth.W))
+    address := Cat(
+      (finishCounter + (lane.U % 3.U)),
+      0.U(config.wordSizeWidth.W)
+    )
     val (tl, edge) = node.out(0)
     val (legal, bits) = edge.Put(
       fromSource = sourceIdCounter,
@@ -1657,11 +1791,13 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
     tl.e.valid := false.B
   }
 
-  val dataSum = outer.laneNodes.map { node =>
-    val tl = node.out(0)._1
-    val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
-    data
-  }.reduce (_ +& _)
+  val dataSum = outer.laneNodes
+    .map { node =>
+      val tl = node.out(0)._1
+      val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
+      data
+    }
+    .reduce(_ +& _)
   // this doesn't make much sense, but it prevents the entire uncoalescer from
   // being optimized away
   finishCounter := finishCounter + dataSum
@@ -1680,8 +1816,10 @@ class DummyCoalescer(implicit p: Parameters) extends LazyModule {
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
     )
   )
 
@@ -1704,7 +1842,8 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
 }
 
 // tracedriver --> coalescer --> tracelogger --> tlram
-class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
+class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
+    extends LazyModule {
   val numLanes = p(SIMTCoreKey).get.nLanes
   val config = defaultConfig.copy(numLanes = numLanes)
 
@@ -1713,14 +1852,18 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
     new MemTraceLogger(numLanes, filename, loggerName = "coreside")
   )
   val coal = LazyModule(new CoalescingUnit(config))
-  val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside"))
+  val memSideLogger = LazyModule(
+    new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")
+  )
   val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
     LazyModule(
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
     )
   )
 
@@ -1751,8 +1894,9 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
   }
 }
 
-class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
-    extends UnitTest(timeout) {
+class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit
+    p: Parameters
+) extends UnitTest(timeout) {
   val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
@@ -1770,8 +1914,10 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << defaultConfig.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << defaultConfig.dataBusWidth)
+      )
     )
   )
 
@@ -1785,13 +1931,13 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
   }
 }
 
-class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) {
+class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
+    extends UnitTest(timeout) {
   val dut = Module(LazyModule(new TLRAMCoalescer).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
 }
 
-
 ////////////
 ////////////
 ////////////
@@ -1853,26 +1999,10 @@ class CoalescerXbar(config: CoalescerConfig) (implicit p: Parameters) extends La
     val node = TLIdentityNode()
     node :=* outputXbar.node
 
-    val nonCoalEntryT = new ReqQueueEntry(
-                                log2Ceil(config.numOldSrcIds),
-                                config.wordSizeWidth,
-                                config.addressWidth,
-                                config.wordSizeInBytes * 8
-                              )
-    val coalEntryT    = new ReqQueueEntry(
-                                log2Ceil(config.numOldSrcIds),
-                                log2Ceil(config.maxCoalLogSize),
-                                config.addressWidth,
-                                (1 << config.maxCoalLogSize) * 8
-                              )
-    val respNonCoalEntryT = new RespQueueEntry(
-                                log2Ceil(config.numOldSrcIds),
-                                config.wordSizeWidth,
-                                config.wordSizeInBytes * 8
-                              )
-
-    val respCoalBundleT   = new CoalescedResponseBundle(config)
-    
+    val nonCoalEntryT = new NonCoalescedRequest(config)
+    val coalEntryT    = new CoalescedRequest(config)
+    val respNonCoalEntryT = new NonCoalescedResponse(config)
+    val respCoalBundleT   = new CoalescedResponse(config)
 
     lazy val module = new CoalescerXbarImpl(
       this, config, nonCoalEntryT, coalEntryT, respNonCoalEntryT, respCoalBundleT)
@@ -1883,10 +2013,10 @@ class CoalescerXbar(config: CoalescerConfig) (implicit p: Parameters) extends La
 
 class CoalescerXbarImpl(outer: CoalescerXbar, 
                       config: CoalescerConfig,
-                      nonCoalEntryT: ReqQueueEntry, 
-                      coalEntryT: ReqQueueEntry,
-                      respNonCoalEntryT: RespQueueEntry, 
-                      respCoalBundleT: CoalescedResponseBundle
+                      nonCoalEntryT: Request, 
+                      coalEntryT: Request,
+                      respNonCoalEntryT: Response, 
+                      respCoalBundleT: CoalescedResponse
       ) extends LazyModuleImp(outer){
 
 
@@ -1957,11 +2087,3 @@ class CoalescerXbarImpl(outer: CoalescerXbar,
 
 
   }
-
-
-
-
-
-
-
-
diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala
index 33b08f1..960b12e 100644
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -180,12 +180,12 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI
   )
 
   val reqQueueEnqReady =  peekIn(0).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType))
-  val reqQueueEnqBits =   peekIn(1).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(x.cloneType))
+  val reqQueueEnqBits =   peekIn(1).asInstanceOf[Seq[Request]].map(x => IO(x.cloneType))
   val reqQueueEnqValid =  peekIn(2).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType))
-  val reqQueueDeqBits =   peekIn(3).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(Output(x.cloneType)))
+  val reqQueueDeqBits =   peekIn(3).asInstanceOf[Seq[Request]].map(x => IO(Output(x.cloneType)))
   val reqQueueDeqValid =  peekIn(4).asInstanceOf[Seq[Bool]].map(x => IO(Output(x.cloneType)))
   val coalReqReady =      IO(Output(peekIn(5).asInstanceOf[Bool].cloneType))
-  val coalReqBits =       IO(Output(peekIn(6).asInstanceOf[ReqQueueEntry].cloneType))
+  val coalReqBits =       IO(Output(peekIn(6).asInstanceOf[Request].cloneType))
   val coalReqValid =      IO(Output(peekIn(7).asInstanceOf[Bool].cloneType))
   val coalInvalidate =    IO(Output(peekIn(8).asInstanceOf[Valid[Vec[UInt]]].cloneType))
 
@@ -759,67 +759,70 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
   }*/
 }
 
-object uncoalescerTestConfig extends CoalescerConfig(
-  enable = true,
-  numLanes = 4,
-  queueDepth = 2,
-  waitTimeout = 8,
-  addressWidth = 24,
-  dataBusWidth = 5,
-  // watermark = 2,
-  wordSizeInBytes = 4,
-  numOldSrcIds = 16,
-  numNewSrcIds = 4,
-  respQueueDepth = 4,
-  coalLogSizes = Seq(4),
-  sizeEnum = DefaultInFlightTableSizeEnum,
-  numCoalReqs = 1,
-  numArbiterOutputPorts = 4,
-  bankStrideInBytes = 64,
-)
-
 class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   behavior of "uncoalescer"
-  val numLanes = 4
-  val numPerLaneReqs = 2
-  val sourceWidth = 2
-  val sizeWidth = 2
-  // 16B coalescing size
-  val coalDataWidth = 128
-  val numInflightCoalRequests = 4
+  object uncoalescerTestConfig extends CoalescerConfig(
+    enable = true,
+    numLanes = 4,
+    queueDepth = 2,
+    waitTimeout = 8,
+    addressWidth = 24,
+    dataBusWidth = 4, // 128 bit data bus
+    wordSizeInBytes = 4,
+    numOldSrcIds = 16,
+    numNewSrcIds = 4,
+    respQueueDepth = 4,
+    coalLogSizes = Seq(4),
+    sizeEnum = DefaultInFlightTableSizeEnum,
+    numCoalReqs = 1,
+    numArbiterOutputPorts = 4,
+    bankStrideInBytes = 64,
+  )
+
+  val config = uncoalescerTestConfig
+
+  val nonCoalReqT = new NonCoalescedRequest(config)
+  val coalReqT = new CoalescedRequest(config)
 
   it should "work in general case" in {
-    test(new Uncoalescer(uncoalescerTestConfig))
+    test(new Uncoalescer(config, nonCoalReqT, coalReqT))
     // vcs helps with simulation time, but sometimes errors with
     // "mutation occurred during iteration" java error
     // .withAnnotations(Seq(VcsBackendAnnotation))
     { c =>
+      // 4 lanes, queue depth 2
+      c.io.windowElts(0)(0).op.poke(0.U)
+      c.io.windowElts(0)(0).source.poke(1.U)
+      c.io.windowElts(0)(0).address.poke(0x4.U)
+      c.io.windowElts(0)(0).size.poke(2.U)
+      c.io.windowElts(0)(1).op.poke(0.U)
+      c.io.windowElts(0)(1).source.poke(2.U)
+      c.io.windowElts(0)(1).address.poke(0x4.U) // two reqs from one lane
+      c.io.windowElts(0)(1).size.poke(2.U)
+      c.io.windowElts(2)(0).op.poke(0.U)
+      c.io.windowElts(2)(0).source.poke(2.U)
+      c.io.windowElts(2)(0).address.poke(0x8.U)
+      c.io.windowElts(2)(0).size.poke(2.U)
+      c.io.windowElts(2)(1).op.poke(0.U)
+      c.io.windowElts(2)(1).source.poke(2.U)
+      c.io.windowElts(2)(1).address.poke(0xc.U)
+      c.io.windowElts(2)(1).size.poke(2.U)
+      // indicate lane 0 and 2 are used for coalescing
+      c.io.invalidate.valid.poke(true.B)
+      c.io.invalidate.bits(0).poke(0x3.U) // 2'b11 for depth=2
+      c.io.invalidate.bits(1).poke(0x0.U)
+      c.io.invalidate.bits(2).poke(0x3.U)
+      c.io.invalidate.bits(3).poke(0x0.U)
+
       val sourceId = 0.U
-      val four = c.io.newEntry.sizeEnumT.FOUR
-      c.io.coalReqValid.poke(true.B)
-      c.io.newEntry.source.poke(sourceId)
-      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(0).source.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(1).source.poke(2.U)
-      c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) // same offset to different lanes
-      c.io.newEntry.lanes(0).reqs(1).sizeEnum.poke(four)
-      c.io.newEntry.lanes(1).reqs(0).valid.poke(false.B)
-      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(1).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(1).offset.poke(3.U)
-      c.io.newEntry.lanes(2).reqs(1).sizeEnum.poke(four)
-      c.io.newEntry.lanes(3).reqs(0).valid.poke(false.B)
+      c.io.coalReq.valid.poke(true.B)
+      c.io.coalReq.bits.source.poke(sourceId)
+      c.io.coalReq.ready.expect(true.B)
 
       c.clock.step()
 
-      c.io.coalReqValid.poke(false.B)
+      c.io.coalReq.valid.poke(false.B)
+      c.io.invalidate.valid.poke(false.B)
 
       c.clock.step()
 
@@ -848,37 +851,42 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   }
 
   it should "uncoalesce when coalesced to the same word offset" in {
-    test(new Uncoalescer(uncoalescerTestConfig))
+    test(new Uncoalescer(config, nonCoalReqT, coalReqT))
     // .withAnnotations(Seq(VcsBackendAnnotation))
     { c =>
+      // 4 lanes, queue depth 2
+      c.io.windowElts(0)(0).op.poke(0.U)
+      c.io.windowElts(0)(0).source.poke(0.U)
+      c.io.windowElts(0)(0).address.poke(0x4.U)
+      c.io.windowElts(0)(0).size.poke(2.U)
+      c.io.windowElts(1)(0).op.poke(0.U)
+      c.io.windowElts(1)(0).source.poke(1.U)
+      c.io.windowElts(1)(0).address.poke(0x4.U) // same address as lane 0, different lane
+      c.io.windowElts(1)(0).size.poke(2.U)
+      c.io.windowElts(2)(0).op.poke(0.U)
+      c.io.windowElts(2)(0).source.poke(2.U)
+      c.io.windowElts(2)(0).address.poke(0x4.U)
+      c.io.windowElts(2)(0).size.poke(2.U)
+      c.io.windowElts(3)(0).op.poke(0.U)
+      c.io.windowElts(3)(0).source.poke(3.U)
+      c.io.windowElts(3)(0).address.poke(0x4.U)
+      c.io.windowElts(3)(0).size.poke(2.U)
+      // indicate lanes used for coalescing
+      c.io.invalidate.valid.poke(true.B)
+      c.io.invalidate.bits(0).poke(0x1.U) // 2'b01 for enabling head
+      c.io.invalidate.bits(1).poke(0x1.U)
+      c.io.invalidate.bits(2).poke(0x1.U)
+      c.io.invalidate.bits(3).poke(0x1.U)
+
       val sourceId = 0.U
-      val four = c.io.newEntry.sizeEnumT.FOUR
-      c.io.coalReqValid.poke(true.B)
-      c.io.newEntry.source.poke(sourceId)
-      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(0).source.poke(0.U)
-      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(0).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(1).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(1).reqs(0).source.poke(1.U)
-      c.io.newEntry.lanes(1).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(1).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(1).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(2).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(3).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(3).reqs(0).source.poke(3.U)
-      c.io.newEntry.lanes(3).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(3).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(3).reqs(1).valid.poke(false.B)
+      c.io.coalReq.valid.poke(true.B)
+      c.io.coalReq.bits.source.poke(sourceId)
+      c.io.coalReq.ready.expect(true.B)
 
       c.clock.step()
 
-      c.io.coalReqValid.poke(false.B)
+      c.io.coalReq.valid.poke(false.B)
+      c.io.invalidate.valid.poke(false.B)
 
       c.clock.step()
 
@@ -908,138 +916,3 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
     }
   }
 }
-
-class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {
-  behavior of "inflight coalesced request table"
-  val numLanes = 4
-  val numPerLaneReqs = 2
-  val sourceWidth = 2
-  val entries = 4
-
-  val offsetBits = 4
-  val sizeBits = 2
-
-  val inflightCoalReqTableEntry =
-    new InflightCoalReqTableEntry(
-      numLanes,
-      numPerLaneReqs,
-      sourceWidth,
-      offsetBits,
-      testConfig.sizeEnum
-    )
-
-  // it should "stop enqueueing when full" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries)) { c =>
-  //     // fill up the table
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(sourceId.U)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //       c.io.lookup.ready.poke(false.B)
-  //       c.clock.step()
-  //     }
-
-  //     // now cannot enqueue any more
-  //     c.io.enq.ready.expect(false.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(0.U)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //     c.clock.step()
-  //     c.io.enq.ready.expect(false.B)
-
-  //     // try to lookup all existing entries
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.valid.poke(false.B)
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(sourceId)
-  //       c.io.lookup.valid.expect(true.B)
-  //       c.io.lookup.bits.expect(sourceId)
-  //       c.clock.step()
-  //     }
-
-  //     // now the table should be empty
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.valid.poke(false.B)
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(sourceId)
-  //       c.io.lookup.valid.expect(false.B)
-  //       c.clock.step()
-  //     }
-  //   }
-  // }
-  // it should "lookup matching entry" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries))
-  //     .withAnnotations(Seq(WriteVcdAnnotation)) { c =>
-  //       c.reset.poke(true.B)
-  //       c.clock.step(10)
-  //       c.reset.poke(false.B)
-
-  //       // enqueue one entry to not match at 0th index
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(0.U)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //       c.clock.step()
-
-  //       val targetSourceId = 1.U
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(targetSourceId)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //       c.clock.step()
-
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(targetSourceId)
-  //       c.io.lookup.valid.expect(true.B)
-  //       c.io.lookup.bits.expect(targetSourceId)
-
-  //       c.clock.step()
-
-  //       // test if matching entry dequeues after 1 cycle
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(targetSourceId)
-  //       c.io.lookup.valid.expect(false.B)
-  //     }
-  // }
-  // it should "handle lookup and enqueue at the same time" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries)) { c =>
-  //     // fill up the table
-  //     val targetSourceId = 1.U
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(0.U)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.clock.step()
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(targetSourceId)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.clock.step()
-
-  //     // do both enqueue and lookup at the same cycle
-  //     val enqSourceId = 2.U
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(enqSourceId)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.io.lookup.ready.poke(true.B)
-  //     c.io.lookupSourceId.poke(targetSourceId)
-
-  //     c.clock.step()
-  //   }
-  // }
-}