Merge branch 'graphics' of https://github.com/hansungk/rocket-chip into graphics

2023-05-11 21:50:42 -07:00
parent f0a7fd852a 9b7080a852
commit 4e4b993287
2 changed files with 498 additions and 503 deletions
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -4,7 +4,6 @@ package freechips.rocketchip.tilelink

 import chisel3._
 import chisel3.util._
-import chisel3.experimental.ChiselEnum
 import org.chipsalliance.cde.config.{Parameters, Field}
 import freechips.rocketchip.diplomacy._
 // import freechips.rocketchip.devices.tilelink.TLTestRAM
@@ -42,6 +41,13 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
  }
 }

+// Mapping to reference model param names
+//  numLanes: Int, <-> config.NUM_LANES
+//  numPerLaneReqs: Int, <-> config.DEPTH
+//  sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
+//  sizeWidth: Int, <-> config.sizeEnum.width
+//  coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
+//  numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
 case class CoalescerConfig(
  enable: Boolean,        // globally enable or disable coalescing
  numLanes: Int,          // number of lanes (or threads) in a warp
@@ -135,7 +141,11 @@ class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends La
  lazy val module = new CoalescingUnitImp(this, config)
 }

-class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int) extends Bundle {
+// Protocol-agnostic bundles that represent a request and a response to the
+// coalescer.
+
+class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int)
+    extends Bundle {
  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
  val op = UInt(1.W) // 0=READ 1=WRITE
  val address = UInt(addressWidth.W)
@@ -149,7 +159,7 @@ class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWid
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size,
-      data = this.data,
+      data = this.data
    )
    val (glegal, gbits) = edgeOut.Get(
      fromSource = this.source,
@@ -162,8 +172,24 @@ class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWid
    bits
  }
 }
+case class NonCoalescedRequest(config: CoalescerConfig)
+    extends Request(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      addressWidth = config.addressWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
+case class CoalescedRequest(config: CoalescerConfig)
+    extends Request(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      addressWidth = config.addressWidth,
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )

-class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
+class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int)
+    extends Bundle {
+  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
  val op = UInt(1.W) // 0=READ 1=WRITE
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
@@ -191,10 +217,23 @@ class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends B
    this.error := bundle.denied
  }
 }
+case class NonCoalescedResponse(config: CoalescerConfig)
+    extends Response(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
+case class CoalescedResponse(config: CoalescerConfig)
+    extends Response(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )

 // If `ignoreInUse`, just keep giving out new IDs without checking if it is in
 // use.
-class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
+class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
+    extends Module {
  val io = IO(new Bundle {
    val gen = Input(Bool())
    val reclaim = Input(Valid(UInt(sourceWidth.W)))
@@ -221,7 +260,8 @@ class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) e
  }
 }

-class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module {
+class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
+    extends Module {
  val io = IO(new Bundle {
    val queue = new Bundle {
      val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
@@ -238,7 +278,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 //  eltPrototype.valid := false.B

  val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
-  val writePtr = RegInit(VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W))))
+  val writePtr = RegInit(
+    VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
+  )
  val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))

  private def resetElts = {
@@ -265,14 +307,17 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
  // current cycle.
  //
  // shift hint is when the heads have no more coalescable left this or next cycle
-  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
+  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0)))
+    .map { case (c, inv) =>
      c && !(io.invalidate.valid && inv)
-  }.reduce(_ || _)
+    }
+    .reduce(_ || _)
  val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
  // valid && !fire means we enable enqueueing to a full queue, provided the
  // arbiter is taking away all remaining valid queue heads in the next cycle so
  // that we make space for the entire next warp.
-  val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
+  val syncedDeqValidNextCycle =
+    io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)

  for (i <- 0 until config.numLanes) {
    val enq = io.queue.enq(i)
@@ -299,7 +344,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
          elt.valid := false.B
        } else {
          elt.bits := elts(i)(j + 1).bits
-          elt.valid := elts(i)(j + 1).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
+          elt.valid := elts(i)(
+            j + 1
+          ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
        }
      }
      // reset dequeue mask when new entries are shifted in
@@ -331,7 +378,8 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
  // When doing spatial-only coalescing, queues should never drift from each
  // other, i.e. the queue heads should always contain mem requests from the
  // same instruction.
-  val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
+  val queueInSync =
+    controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
      writePtr.map(_ === writePtr.head).reduce(_ && _)
  assert(queueInSync, "shift queue lanes are not in sync")

@@ -340,18 +388,23 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 }

 // Software model: coalescer.py
-class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
-                    config: CoalescerConfig) extends Module {
+class MonoCoalescer(
+    config: CoalescerConfig,
+    coalLogSize: Int,
+    queueT: CoalShiftQueue[NonCoalescedRequest]
+) extends Module {
  val io = IO(new Bundle {
-    val window = Input(windowT.io.cloneType)
+    val window = Input(queueT.io.cloneType)
    val results = Output(new Bundle {
      val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
      val baseAddr = Output(UInt(config.addressWidth.W))
      val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
      // number of entries matched with this leader lane's head.
      // maximum is numLanes * queueDepth
-      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
-      val coverageHits = Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
+      val matchCount =
+        Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
+      val coverageHits =
+        Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
      val canCoalesce = Output(Vec(config.numLanes, Bool()))
    })
  })
@@ -366,8 +419,12 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],

  def printQueueHeads = {
    leaders.zipWithIndex.foreach { case (head, i) =>
-      printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
-        leadersValid(i), head.source, head.address)
+      printf(
+        s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
+        leadersValid(i),
+        head.source,
+        head.address
+      )
    }
  }
  // when (leadersValid.reduce(_ || _)) {
@@ -376,7 +433,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],

  val size = coalLogSize
  val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
-  def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = {
+  def canMatch(req0: Request, req0v: Bool, req1: Request, req1v: Bool): Bool = {
    (req0.op === req1.op) &&
    (req0v && req1v) &&
    ((req0.address & this.addrMask) === (req1.address & this.addrMask))
@@ -385,10 +442,13 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
  // Gives a 2-D table of Bools representing match at every queue entry,
  // for each lane (so 3-D in total).
  // dimensions: (leader lane, follower lane, follower entry)
-  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
-    (io.window.elts zip io.window.mask).map { case (followers, followerValids) =>
+  val matchTablePerLane = (leaders zip leadersValid).map {
+    case (leader, leaderValid) =>
+      (io.window.elts zip io.window.mask).map {
+        case (followers, followerValids) =>
          // compare leader's head against follower's every queue entry
-      (followers zip followerValids.asBools).map { case (follower, followerValid) =>
+          (followers zip followerValids.asBools).map {
+            case (follower, followerValid) =>
              canMatch(follower, followerValid, leader, leaderValid)
            // FIXME: disabling halving optimization because it does not give the
            // correct per-lane coalescable indication to the shift queue
@@ -401,18 +461,23 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
  }

  val matchCounts = matchTablePerLane.map(table =>
-      table.map(PopCount(_)) // sum up each column
-           .reduce(_ +& _))
+    table
+      .map(PopCount(_)) // sum up each column
+      .reduce(_ +& _)
+  )
  val canCoalesce = matchCounts.map(_ > 1.U)

  // Elect the leader that has the most match counts.
  // TODO: potentially expensive: magnitude comparator
  def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
-    matchCounts.zipWithIndex.map {
-      case (c, i) => (c, i.U)
-    }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
+    matchCounts.zipWithIndex
+      .map { case (c, i) =>
+        (c, i.U)
+      }
+      .reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
        (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
-    }._2
+      }
+      ._2
  }
  // Elect leader by choosing the smallest-index lane that has a valid
  // match, i.e. using priority encoder.
@@ -422,7 +487,8 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
  val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)

  val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
-  // matchTable for the chosen lane, but converted to a Vec[UInt]
+  // matchTable for the chosen lane, but each column converted to bitflags,
+  // i.e. Vec[UInt]
  val chosenMatches = VecInit(matchTablePerLane.map { table =>
    VecInit(table.map(VecInit(_).asUInt))
  })(chosenLeaderIdx)
@@ -431,14 +497,17 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
  // coverage calculation
  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth)
  // 2-D table flattened to 1-D
-  val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
+  val offsets =
+    io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
  val valids = chosenMatches.flatMap(_.asBools)
  // indicates for each word in the coalesced chunk whether it is accessed by
  // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
  // words in the coalesced data coming back will be accessed by some request
  // and we've reached 100% bandwidth utilization.
  val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { target =>
-    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+    (offsets zip valids)
+      .map { case (offset, valid) => valid && (offset === target.U) }
+      .reduce(_ || _)
  }

  // debug prints
@@ -471,20 +540,28 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
 // coalesced request out of all possible combinations.
 //
 // Software model: coalescer.py
-class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry,
-                     config: CoalescerConfig) extends Module {
+class MultiCoalescer(
+    config: CoalescerConfig,
+    queueT: CoalShiftQueue[NonCoalescedRequest],
+    coalReqT: Request,
+) extends Module {
+  val invalidateT = Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))
  val io = IO(new Bundle {
    // coalescing window, connected to the contents of the request queues
-    val window = Input(windowT.io.cloneType)
+    val window = Input(queueT.io.cloneType)
    // generated coalesced request
    val coalReq = DecoupledIO(coalReqT.cloneType)
-    // invalidate signals going into each request queue's head
-    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
-    // whether a lane is coalescable
+    // invalidate signals going into each request queue.  Queue entries whose
+    // invalidate bit is high are the ones coalesced into the new request.
+    val invalidate = Output(invalidateT)
+    // whether a lane is coalescable.  This is used to output non-coalescable
+    // lanes to the arbiter so that they can be flushed downstream.
    val coalescable = Output(Vec(config.numLanes, Bool()))
  })

-  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  val coalescers = config.coalLogSizes.map(size =>
+    Module(new MonoCoalescer(config, size, queueT))
+  )
  coalescers.foreach(_.io.window := io.window)

  def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
@@ -509,7 +586,8 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
  val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
  val chosenValid = Wire(Bool())
  // minimum 25% coverage
-  val minCoverage = 1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
+  val minCoverage =
+    1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))

  when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
    chosenSizeIdx := argMax(normalizedHits)
@@ -541,9 +619,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
  val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)

  // check for word alignment in addresses
-  assert(io.window.elts.flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)).zip(
-    io.window.mask.flatMap(_.asBools)).map { case (aligned, valid) => (!valid) || aligned }.reduce(_ || _),
-    "one or more addresses used for coalescing is not word-aligned")
+  assert(
+    io.window.elts
+      .flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U))
+      .zip(io.window.mask.flatMap(_.asBools))
+      .map { case (aligned, valid) => (!valid) || aligned } // entries whose mask bit is low pass trivially
+      .reduce(_ && _), // AND, not OR: a single valid misaligned entry must trip the assertion
+    "one or more addresses used for coalescing is not word-aligned"
+  )

  // note: this is word-level coalescing. if finer granularity is needed, need to modify code
  val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt
@@ -558,18 +641,29 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
    val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
      // note: ANDing against addrMask is to conform to active byte lanes requirements
      // if aligning to LSB suffices, we should add the bitwise AND back
-      m && ((req.address(config.maxCoalLogSize - 1, config.wordSizeWidth)/* & addrMask*/) === i.U)
+      m && ((req.address(
+        config.maxCoalLogSize - 1,
+        config.wordSizeWidth
+      ) /* & addrMask*/ ) === i.U)
    }
    // TODO: SW uses priority encoder, not sure about behavior of MuxCase
-    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) =>
+    data(i) := MuxCase(
+      DontCare,
+      flatReqs.zip(sel).map { case (req, s) =>
        s -> req.data
-    })
-    mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) =>
+      }
+    )
+    mask(i) := MuxCase(
+      0.U,
+      flatReqs.zip(sel).map { case (req, s) =>
        s -> req.mask
-    })
+      }
+    )
  }

-  val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds)))
+  val sourceGen = Module(
+    new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds))
+  )
  sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created
  sourceGen.io.reclaim.valid := false.B // not used
  sourceGen.io.reclaim.bits := DontCare // not used
@@ -587,7 +681,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
  io.invalidate.bits := chosenBundle.matchOH
  io.invalidate.valid := io.coalReq.fire // invalidate only when fire

-  io.coalescable := coalescers.map(_.io.results.canCoalesce.asUInt).reduce(_ | _).asBools
+  io.coalescable := coalescers
+    .map(_.io.results.canCoalesce.asUInt)
+    .reduce(_ | _)
+    .asBools

  dontTouch(io.invalidate) // debug

@@ -599,26 +696,36 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
  if (!config.enable) disable
 }

-class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) {
-  require(outer.cpuNode.in.length == config.numLanes,
+class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
+    extends LazyModuleImp(outer) {
+  require(
+    outer.cpuNode.in.length == config.numLanes,
    s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
-    s"config.numLanes (${config.numLanes})")
-  require(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
+      s"config.numLanes (${config.numLanes})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
    s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
-    s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})")
-  require(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
+      s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
    s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
-    s"mismatch with config.addressWidth (${config.addressWidth})")
+      s"mismatch with config.addressWidth (${config.addressWidth})"
+  )
+  require(
+    config.maxCoalLogSize <= config.dataBusWidth,
+    "multi-beat coalesced reads/writes are currently not supported"
+  )

-  val sourceWidth = outer.cpuNode.in.head._1.params.sourceBits
-  // note we are using word size. assuming all coalescer inputs are word sized
-  val reqQueueEntryT = new ReqQueueEntry(sourceWidth, config.wordSizeWidth,
-    config.addressWidth, (config.wordSizeInBytes * 8))
-  val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
+  val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
+  val nonCoalReqT = new NonCoalescedRequest(config)
+  val reqQueues = Module(
+    new CoalShiftQueue(nonCoalReqT, config.queueDepth, config)
+  )

-  val coalReqT = new ReqQueueEntry(log2Ceil(config.numNewSrcIds), log2Ceil(config.maxCoalLogSize),
-    config.addressWidth, (1 << config.maxCoalLogSize) * 8)
-  val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
+  val coalReqT = new CoalescedRequest(config)
+  val coalescer = Module(new MultiCoalescer(config, reqQueues, coalReqT))
  coalescer.io.window := reqQueues.io
  reqQueues.io.coalescable := coalescer.io.coalescable
  reqQueues.io.invalidate := coalescer.io.invalidate
@@ -634,7 +741,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
    case (((tlIn, _), (tlOut, edgeOut)), lane) =>
      // Request queue
-      val req = Wire(reqQueueEntryT)
+      val req = Wire(nonCoalReqT)

      req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
      req.source := tlIn.a.bits.source
@@ -691,6 +798,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
  tlCoal.e.valid := false.B

+  require(
+    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
+    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
+      + s" (${(1 << config.dataBusWidth) * 8})"
+  )

  // ===========================================================================
  // Response flow
@@ -703,8 +815,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  // coalesced request.  Upper bound is min(DEPTH, 2**sourceWidth).
  val numPerLaneReqs = config.queueDepth

-  val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize),
-    (1 << config.maxCoalLogSize) * 8)
+  // FIXME: entry need not carry a full (1 << maxCoalLogSize)-byte data field
+  val respQueueEntryT = new Response(
+    oldSourceWidth,
+    log2Ceil(config.maxCoalLogSize),
+    (1 << config.maxCoalLogSize) * 8
+  )
  val respQueues = Seq.tabulate(config.numLanes) { _ =>
    Module(
      new MultiPortQueue(
@@ -776,57 +892,26 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
      dontTouch(tlOut.d)
  }

-  // Construct new entry for the inflight table
-  // FIXME: don't instantiate inflight table entry type here.  It leaks the table's impl
-  // detail to the coalescer
-
-  // richard: I think a good idea is to pass Valid[ReqQueueEntry] generated by
-  // the coalescer directly into the uncoalescer, so that we can offload the
-  // logic to generate the Inflight Entry into the uncoalescer, where it should be.
-  // this also reduces top level clutter.
-
-  val uncoalescer = Module(new Uncoalescer(config))
-
-  val newEntry = Wire(uncoalescer.inflightTable.entryT)
-  newEntry.source := coalescer.io.coalReq.bits.source
-
-  assert (config.maxCoalLogSize <= config.dataBusWidth,
-    "multi-beat coalesced reads/writes are currently not supported")
-  assert (
-    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
-    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
-    + s" (${(1 << config.dataBusWidth) * 8})"
+  val uncoalescer = Module(
+    new Uncoalescer(config, nonCoalReqT, coalReqT)
  )
+  // connect the newly generated coalesced request so that the uncoalescer
+  // can record it
+  uncoalescer.io.coalReq <> coalescer.io.coalReq
+  uncoalescer.io.invalidate := coalescer.io.invalidate
  val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
-  // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
-  // coalescer to every (numLanes * queueDepth) entry in the inflight table.
-  (newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex
-    .foreach { case ((laneEntry, laneInv), lane) =>
-      (laneEntry.reqs zip laneInv.asBools).zipWithIndex
-        .foreach { case ((reqEntry, inv), i) =>
-          val req = reqQueues.io.elts(lane)(i)
-          when ((coalescer.io.invalidate.valid && inv)) {
-            printf(s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source)
-          }
-          reqEntry.valid := (coalescer.io.invalidate.valid && inv)
-          reqEntry.source := req.source
-          reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth)
-          reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
-          // TODO: load/store op
-        }
-    }
-  dontTouch(newEntry)
-
-  uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid
-  uncoalescer.io.newEntry := newEntry
+  uncoalescer.io.windowElts := reqQueues.io.elts
+  // connect coalesced response going into the uncoalescer, ready to be
+  // uncoalesced
  // Cleanup: custom <>?
  uncoalescer.io.coalResp.valid := tlCoal.d.valid
-  uncoalescer.io.coalResp.bits.source := tlCoal.d.bits.source
-  uncoalescer.io.coalResp.bits.data := tlCoal.d.bits.data
+  uncoalescer.io.coalResp.bits.fromTLD(tlCoal.d.bits)
+  // uncoalescer backpressure
  tlCoal.d.ready := uncoalescer.io.coalResp.ready

  // Connect uncoalescer results back into each lane's response queue
-  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) =>
+  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach {
+    case ((q, perLaneResps), lane) =>
      perLaneResps.zipWithIndex.foreach { case (resp, i) =>
        // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
        // cache.  This should ideally not happen though.
@@ -853,67 +938,78 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  dontTouch(tlCoal.d)
 }

-// Protocol-agnostic bundle that represents a coalesced response.
-//
-// Having this makes it easier to:
-//   * do unit tests -- no need to deal with TileLink in the chiseltest code
-//   * adapt coalescer to custom protocols like a custom L1 cache interface.
-//
-// FIXME: overlaps with RespQueueEntry. Trait-ify
-class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle {
-  val source = UInt(log2Ceil(config.numNewSrcIds).W)
-  val data = UInt((8 * (1 << config.maxCoalLogSize)).W)
-
-  def fromTLD(bundle:TLBundleD): Unit = {
-    this.source := bundle.source
-    this.data   := bundle.data
-  }
-
-}
-
-class Uncoalescer(config: CoalescerConfig) extends Module {
-  // notes to hansung:
-  //  val numLanes: Int, <-> config.NUM_LANES
-  //  val numPerLaneReqs: Int, <-> config.DEPTH
-  //  val sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
-  //  val sizeWidth: Int, <-> config.sizeEnum.width
-  //  val coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
-  //  val numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
+class Uncoalescer(
+    config: CoalescerConfig,
+    nonCoalReqT: NonCoalescedRequest,
+    coalReqT: CoalescedRequest,
+) extends Module {
  val inflightTable = Module(new InflightCoalReqTable(config))
  val io = IO(new Bundle {
-    val coalReqValid = Input(Bool())
-    // FIXME: receive ReqQueueEntry and construct newEntry inside uncoalescer
-    val newEntry = Input(inflightTable.entryT.cloneType)
-    val coalResp = Flipped(Decoupled(new CoalescedResponseBundle(config)))
+    // generated coalesced request, connected to the output of the coalescer.
+    val coalReq = Flipped(DecoupledIO(coalReqT.cloneType))
+    // invalidate signal coming out of coalescer.
+    val invalidate = Input(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
+    // coalescing window, connected to the contents of the request queues.
+    // Uncoalescer looks at the queue entries that got coalesced into `coalReq`
+    // in order to record which lanes this coalReq originally came from.
+    // We only care about window.elts because the coalescer would have made
+    // sure it only looked at the valid entries.
+    // TODO: duplicate type construction
+    val windowElts = Input(Vec(config.numLanes, Vec(config.queueDepth, nonCoalReqT)))
+    val coalResp = Flipped(Decoupled(new CoalescedResponse(config)))
    val uncoalResps = Output(
      Vec(
        config.numLanes,
-        Vec(
-          config.queueDepth,
-          ValidIO(
-            new RespQueueEntry(log2Ceil(config.numOldSrcIds), config.wordSizeWidth,
-              config.wordSizeInBytes * 8)
-          )
-        )
+        Vec(config.queueDepth, ValidIO(new NonCoalescedResponse(config)))
      )
    )
  })

-  // Populate inflight table
-  inflightTable.io.enq.valid := io.coalReqValid
-  inflightTable.io.enq.bits := io.newEntry
+  // The uncoalescer must always be ready to accept and record new coalesced
+  // requests, so that it does not stall the coalescer.
+  io.coalReq.ready := true.B
+
+  // Construct a new entry for the inflight table using generated coalesced request
+  def generateInflightTableEntry: InflightCoalReqTableEntry = {
+    val newEntry = Wire(inflightTable.entryT)
+    newEntry.source := io.coalReq.bits.source
+    // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
+    // coalescer to every (numLanes * queueDepth) entry in the inflight table.
+    (newEntry.lanes zip io.invalidate.bits).zipWithIndex
+      .foreach { case ((laneEntry, laneInv), lane) =>
+        (laneEntry.reqs zip laneInv.asBools).zipWithIndex
+          .foreach { case ((reqEntry, inv), i) =>
+            val req = io.windowElts(lane)(i)
+            when((io.invalidate.valid && inv)) {
+              printf(
+                s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
+                req.source
+              )
+            }
+            reqEntry.valid := (io.invalidate.valid && inv)
+            reqEntry.source := req.source
+            reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth)
+            reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
+            // TODO: load/store op
+          }
+      }
+    assert(
+      !((io.coalReq.valid === true.B) && (io.coalResp.valid === true.B) &&
+        (newEntry.source === io.coalResp.bits.source)),
+      "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
+    )
+    dontTouch(newEntry)
+
+    newEntry
+  }
+  inflightTable.io.enq.valid := io.coalReq.valid
+  inflightTable.io.enq.bits := generateInflightTableEntry

  // Look up the table with incoming coalesced responses
  inflightTable.io.lookup.ready := io.coalResp.valid
  inflightTable.io.lookupSourceId := io.coalResp.bits.source
  io.coalResp.ready := true.B // FIXME, see sw model implementation

-  assert(
-    !((io.coalReqValid === true.B) && (io.coalResp.valid === true.B) &&
-      (io.newEntry.source === io.coalResp.bits.source)),
-    "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
-  )
-
  // Un-coalescing logic
  //
  def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = {
@@ -972,7 +1068,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module {
 // split the coalesced response back to individual per-lane responses with the
 // right metadata.
 class InflightCoalReqTable(config: CoalescerConfig) extends Module {
-  val offsetBits = config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
+  val offsetBits =
+    config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
  val entryT = new InflightCoalReqTableEntry(
    config.numLanes,
    config.queueDepth,
@@ -1094,8 +1191,12 @@ object TLUtils {
 // `traceHasSource` is true if the input trace file has an additional source
 // ID column.  This is useful for using the output trace file genereated by
 // MemTraceLogger as the driver.
-class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: Boolean = false)
-  (implicit p: Parameters) extends LazyModule {
+class MemTraceDriver(
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean = false
+)(implicit p: Parameters)
+    extends LazyModule {
  // Create N client nodes together
  val laneNodes = Seq.tabulate(config.numLanes) { i =>
    val clientParam = Seq(
@@ -1113,7 +1214,8 @@ class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource:
  val node = TLIdentityNode()
  laneNodes.foreach { l => node := l }

-  lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource)
+  lazy val module =
+    new MemTraceDriverImp(this, config, filename, traceHasSource)
 }

 trait HasTraceLine {
@@ -1136,9 +1238,12 @@ class TraceLine extends Bundle with HasTraceLine {
  val data = UInt(64.W)
 }

-class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename: String,
-  traceHasSource: Boolean)
-    extends LazyModuleImp(outer)
+class MemTraceDriverImp(
+    outer: MemTraceDriver,
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean
+) extends LazyModuleImp(outer)
    with UnitTestModule {
  // Current cycle mark to read from trace
  val traceReadCycle = RegInit(1.U(64.W))
@@ -1216,11 +1321,16 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
    sizeInBytes := (1.U) << req.size
    mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
    wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
-    val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
+    val wordAlignedAddress =
+      req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
    val wordAlignedSize = Mux(subword, 2.U, req.size)

-    val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numOldSrcIds),
-      ignoreInUse = false))
+    val sourceGen = Module(
+      new RoundRobinSourceGenerator(
+        log2Ceil(config.numOldSrcIds),
+        ignoreInUse = false
+      )
+    )
    sourceGen.io.gen := reqQ.io.deq.fire
    // assert(sourceGen.io.id.valid)

@@ -1229,7 +1339,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize, // trace line already holds log2(size)
      // data should be aligned to beatBytes
-      data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
+      data =
+        (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
    )
    val (glegal, gbits) = edge.Get(
      fromSource = sourceGen.io.id.bits,
@@ -1288,9 +1399,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename

 class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
    extends BlackBox(
-      Map("FILENAME" -> filename,
+      Map(
+        "FILENAME" -> filename,
        "NUM_LANES" -> numLanes,
-          "HAS_SOURCE" -> (if (traceHasSource) 1 else 0))
+        "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)
+      )
    )
    with HasBlackBoxResource {
  val traceLineT = new TraceLine
@@ -1304,7 +1417,8 @@ class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)

    // These names have to match declarations in the Verilog code, eg.
    // trace_read_address.
-    val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source
+    val trace_read =
+      new Bundle { // can't use HasTraceLine because this doesn't have source
        val ready = Input(Bool())
        val valid = Output(UInt(numLanes.W))
        // Chisel can't interface with Verilog 2D port, so flatten all lanes into
@@ -1476,15 +1590,23 @@ class MemTraceLogger(

    // stats
    val numReqsThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
    val numRespsThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
    val reqBytesThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
+      laneReqs
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
          b0 + b1
        }
    val respBytesThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
+      laneResps
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
          b0 + b1
        }
    numReqs := numReqs + numReqsThisCycle
@@ -1496,7 +1618,10 @@ class MemTraceLogger(
    //
    // This is a clunky workaround of the fact that Chisel doesn't allow partial
    // assignment to a bitfield range of a wide signal.
-    def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
+    def flattenTrace(
+        simIO: Bundle with HasTraceLine,
+        perLane: Vec[TraceLine]
+    ) = {
      // these will get optimized out
      val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
      val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
@@ -1592,8 +1717,14 @@ object TLPrintf {
      tlData: UInt,
      reqData: UInt
  ) = {
-    printf(s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
-      source, address, size, mask, is_store)
+    printf(
+      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
+      source,
+      address,
+      size,
+      mask,
+      is_store
+    )
    when(is_store) {
      printf(", tlData=%x, reqData=%x", tlData, reqData)
    }
@@ -1640,7 +1771,10 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
    // generate dummy traffic to coalescer to prevent it from being optimized
    // out during synthesis
    val address = Wire(UInt(config.addressWidth.W))
-    address := Cat((finishCounter + (lane.U % 3.U)), 0.U(config.wordSizeWidth.W))
+    address := Cat(
+      (finishCounter + (lane.U % 3.U)),
+      0.U(config.wordSizeWidth.W)
+    )
    val (tl, edge) = node.out(0)
    val (legal, bits) = edge.Put(
      fromSource = sourceIdCounter,
@@ -1657,11 +1791,13 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
    tl.e.valid := false.B
  }

-  val dataSum = outer.laneNodes.map { node =>
+  val dataSum = outer.laneNodes
+    .map { node =>
      val tl = node.out(0)._1
      val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
      data
-  }.reduce (_ +& _)
+    }
+    .reduce(_ +& _)
  // this doesn't make much sense, but it prevents the entire uncoalescer from
  // being optimized away
  finishCounter := finishCounter + dataSum
@@ -1680,8 +1816,10 @@ class DummyCoalescer(implicit p: Parameters) extends LazyModule {
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
    )
  )

@@ -1704,7 +1842,8 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
 }

 // tracedriver --> coalescer --> tracelogger --> tlram
-class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
+class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
+    extends LazyModule {
  val numLanes = p(SIMTCoreKey).get.nLanes
  val config = defaultConfig.copy(numLanes = numLanes)

@@ -1713,14 +1852,18 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
    new MemTraceLogger(numLanes, filename, loggerName = "coreside")
  )
  val coal = LazyModule(new CoalescingUnit(config))
-  val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside"))
+  val memSideLogger = LazyModule(
+    new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")
+  )
  val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
    )
  )

@@ -1751,8 +1894,9 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
  }
 }

-class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
-    extends UnitTest(timeout) {
+class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit
+    p: Parameters
+) extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
  dut.io.start := io.start
  io.finished := dut.io.finished
@@ -1770,8 +1914,10 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << defaultConfig.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << defaultConfig.dataBusWidth)
+      )
    )
  )

@@ -1785,13 +1931,13 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
  }
 }

-class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) {
+class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
+    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescer).module)
  dut.io.start := io.start
  io.finished := dut.io.finished
 }

-
 ////////////
 ////////////
 ////////////
@@ -1853,26 +1999,10 @@ class CoalescerXbar(config: CoalescerConfig) (implicit p: Parameters) extends La
    val node = TLIdentityNode()
    node :=* outputXbar.node

-    val nonCoalEntryT = new ReqQueueEntry(
-                                log2Ceil(config.numOldSrcIds),
-                                config.wordSizeWidth,
-                                config.addressWidth,
-                                config.wordSizeInBytes * 8
-                              )
-    val coalEntryT    = new ReqQueueEntry(
-                                log2Ceil(config.numOldSrcIds),
-                                log2Ceil(config.maxCoalLogSize),
-                                config.addressWidth,
-                                (1 << config.maxCoalLogSize) * 8
-                              )
-    val respNonCoalEntryT = new RespQueueEntry(
-                                log2Ceil(config.numOldSrcIds),
-                                config.wordSizeWidth,
-                                config.wordSizeInBytes * 8
-                              )
-
-    val respCoalBundleT   = new CoalescedResponseBundle(config)
-    
+    val nonCoalEntryT = new NonCoalescedRequest(config)
+    val coalEntryT    = new CoalescedRequest(config)
+    val respNonCoalEntryT = new NonCoalescedResponse(config)
+    val respCoalBundleT   = new CoalescedResponse(config)

    lazy val module = new CoalescerXbarImpl(
      this, config, nonCoalEntryT, coalEntryT, respNonCoalEntryT, respCoalBundleT)
@@ -1883,10 +2013,10 @@ class CoalescerXbar(config: CoalescerConfig) (implicit p: Parameters) extends La

 class CoalescerXbarImpl(outer: CoalescerXbar, 
                      config: CoalescerConfig,
-                      nonCoalEntryT: ReqQueueEntry, 
-                      coalEntryT: ReqQueueEntry,
-                      respNonCoalEntryT: RespQueueEntry, 
-                      respCoalBundleT: CoalescedResponseBundle
+                      nonCoalEntryT: Request, 
+                      coalEntryT: Request,
+                      respNonCoalEntryT: Response, 
+                      respCoalBundleT: CoalescedResponse
      ) extends LazyModuleImp(outer){


@@ -1957,11 +2087,3 @@ class CoalescerXbarImpl(outer: CoalescerXbar,


  }
-
-
-
-
-
-
-
-
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -180,12 +180,12 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI
  )

  val reqQueueEnqReady =  peekIn(0).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType))
-  val reqQueueEnqBits =   peekIn(1).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(x.cloneType))
+  val reqQueueEnqBits =   peekIn(1).asInstanceOf[Seq[Request]].map(x => IO(x.cloneType))
  val reqQueueEnqValid =  peekIn(2).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType))
-  val reqQueueDeqBits =   peekIn(3).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(Output(x.cloneType)))
+  val reqQueueDeqBits =   peekIn(3).asInstanceOf[Seq[Request]].map(x => IO(Output(x.cloneType)))
  val reqQueueDeqValid =  peekIn(4).asInstanceOf[Seq[Bool]].map(x => IO(Output(x.cloneType)))
  val coalReqReady =      IO(Output(peekIn(5).asInstanceOf[Bool].cloneType))
-  val coalReqBits =       IO(Output(peekIn(6).asInstanceOf[ReqQueueEntry].cloneType))
+  val coalReqBits =       IO(Output(peekIn(6).asInstanceOf[Request].cloneType))
  val coalReqValid =      IO(Output(peekIn(7).asInstanceOf[Bool].cloneType))
  val coalInvalidate =    IO(Output(peekIn(8).asInstanceOf[Valid[Vec[UInt]]].cloneType))

@@ -759,14 +759,15 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
  }*/
 }

+class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
+  behavior of "uncoalescer"
  object uncoalescerTestConfig extends CoalescerConfig(
    enable = true,
    numLanes = 4,
    queueDepth = 2,
    waitTimeout = 8,
    addressWidth = 24,
-  dataBusWidth = 5,
-  // watermark = 2,
+    dataBusWidth = 4, // 128 bit data bus
    wordSizeInBytes = 4,
    numOldSrcIds = 16,
    numNewSrcIds = 4,
@@ -778,48 +779,50 @@ object uncoalescerTestConfig extends CoalescerConfig(
    bankStrideInBytes = 64,
  )

-class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
-  behavior of "uncoalescer"
-  val numLanes = 4
-  val numPerLaneReqs = 2
-  val sourceWidth = 2
-  val sizeWidth = 2
-  // 16B coalescing size
-  val coalDataWidth = 128
-  val numInflightCoalRequests = 4
+  val config = uncoalescerTestConfig
+
+  val nonCoalReqT = new NonCoalescedRequest(config)
+  val coalReqT = new CoalescedRequest(config)

  it should "work in general case" in {
-    test(new Uncoalescer(uncoalescerTestConfig))
+    test(new Uncoalescer(config, nonCoalReqT, coalReqT))
    // vcs helps with simulation time, but sometimes errors with
    // "mutation occurred during iteration" java error
    // .withAnnotations(Seq(VcsBackendAnnotation))
    { c =>
+      // 4 lanes, queue depth 2
+      c.io.windowElts(0)(0).op.poke(0.U)
+      c.io.windowElts(0)(0).source.poke(1.U)
+      c.io.windowElts(0)(0).address.poke(0x4.U)
+      c.io.windowElts(0)(0).size.poke(2.U)
+      c.io.windowElts(0)(1).op.poke(0.U)
+      c.io.windowElts(0)(1).source.poke(2.U)
+      c.io.windowElts(0)(1).address.poke(0x4.U) // two reqs from one lane
+      c.io.windowElts(0)(1).size.poke(2.U)
+      c.io.windowElts(2)(0).op.poke(0.U)
+      c.io.windowElts(2)(0).source.poke(2.U)
+      c.io.windowElts(2)(0).address.poke(0x8.U)
+      c.io.windowElts(2)(0).size.poke(2.U)
+      c.io.windowElts(2)(1).op.poke(0.U)
+      c.io.windowElts(2)(1).source.poke(2.U)
+      c.io.windowElts(2)(1).address.poke(0xc.U)
+      c.io.windowElts(2)(1).size.poke(2.U)
+      // indicate that lanes 0 and 2 are used for coalescing
+      c.io.invalidate.valid.poke(true.B)
+      c.io.invalidate.bits(0).poke(0x3.U) // 2'b11 for depth=2
+      c.io.invalidate.bits(1).poke(0x0.U)
+      c.io.invalidate.bits(2).poke(0x3.U)
+      c.io.invalidate.bits(3).poke(0x0.U)
+
      val sourceId = 0.U
-      val four = c.io.newEntry.sizeEnumT.FOUR
-      c.io.coalReqValid.poke(true.B)
-      c.io.newEntry.source.poke(sourceId)
-      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(0).source.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(1).source.poke(2.U)
-      c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) // same offset to different lanes
-      c.io.newEntry.lanes(0).reqs(1).sizeEnum.poke(four)
-      c.io.newEntry.lanes(1).reqs(0).valid.poke(false.B)
-      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(1).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(1).offset.poke(3.U)
-      c.io.newEntry.lanes(2).reqs(1).sizeEnum.poke(four)
-      c.io.newEntry.lanes(3).reqs(0).valid.poke(false.B)
+      c.io.coalReq.valid.poke(true.B)
+      c.io.coalReq.bits.source.poke(sourceId)
+      c.io.coalReq.ready.expect(true.B)

      c.clock.step()

-      c.io.coalReqValid.poke(false.B)
+      c.io.coalReq.valid.poke(false.B)
+      c.io.invalidate.valid.poke(false.B)

      c.clock.step()

@@ -848,37 +851,42 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
  }

  it should "uncoalesce when coalesced to the same word offset" in {
-    test(new Uncoalescer(uncoalescerTestConfig))
+    test(new Uncoalescer(config, nonCoalReqT, coalReqT))
    // .withAnnotations(Seq(VcsBackendAnnotation))
    { c =>
+      // 4 lanes, queue depth 2
+      c.io.windowElts(0)(0).op.poke(0.U)
+      c.io.windowElts(0)(0).source.poke(0.U)
+      c.io.windowElts(0)(0).address.poke(0x4.U)
+      c.io.windowElts(0)(0).size.poke(2.U)
+      c.io.windowElts(1)(0).op.poke(0.U)
+      c.io.windowElts(1)(0).source.poke(1.U)
+      c.io.windowElts(1)(0).address.poke(0x4.U) // same word offset as lane 0
+      c.io.windowElts(1)(0).size.poke(2.U)
+      c.io.windowElts(2)(0).op.poke(0.U)
+      c.io.windowElts(2)(0).source.poke(2.U)
+      c.io.windowElts(2)(0).address.poke(0x4.U)
+      c.io.windowElts(2)(0).size.poke(2.U)
+      c.io.windowElts(3)(0).op.poke(0.U)
+      c.io.windowElts(3)(0).source.poke(3.U)
+      c.io.windowElts(3)(0).address.poke(0x4.U)
+      c.io.windowElts(3)(0).size.poke(2.U)
+      // indicate lanes used for coalescing
+      c.io.invalidate.valid.poke(true.B)
+      c.io.invalidate.bits(0).poke(0x1.U) // 2'b01 for enabling head
+      c.io.invalidate.bits(1).poke(0x1.U)
+      c.io.invalidate.bits(2).poke(0x1.U)
+      c.io.invalidate.bits(3).poke(0x1.U)
+
      val sourceId = 0.U
-      val four = c.io.newEntry.sizeEnumT.FOUR
-      c.io.coalReqValid.poke(true.B)
-      c.io.newEntry.source.poke(sourceId)
-      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(0).reqs(0).source.poke(0.U)
-      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(0).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(1).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(1).reqs(0).source.poke(1.U)
-      c.io.newEntry.lanes(1).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(1).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(1).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
-      c.io.newEntry.lanes(2).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(2).reqs(1).valid.poke(false.B)
-      c.io.newEntry.lanes(3).reqs(0).valid.poke(true.B)
-      c.io.newEntry.lanes(3).reqs(0).source.poke(3.U)
-      c.io.newEntry.lanes(3).reqs(0).offset.poke(1.U)
-      c.io.newEntry.lanes(3).reqs(0).sizeEnum.poke(four)
-      c.io.newEntry.lanes(3).reqs(1).valid.poke(false.B)
+      c.io.coalReq.valid.poke(true.B)
+      c.io.coalReq.bits.source.poke(sourceId)
+      c.io.coalReq.ready.expect(true.B)

      c.clock.step()

-      c.io.coalReqValid.poke(false.B)
+      c.io.coalReq.valid.poke(false.B)
+      c.io.invalidate.valid.poke(false.B)

      c.clock.step()

@@ -908,138 +916,3 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
    }
  }
 }
-
-class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {
-  behavior of "inflight coalesced request table"
-  val numLanes = 4
-  val numPerLaneReqs = 2
-  val sourceWidth = 2
-  val entries = 4
-
-  val offsetBits = 4
-  val sizeBits = 2
-
-  val inflightCoalReqTableEntry =
-    new InflightCoalReqTableEntry(
-      numLanes,
-      numPerLaneReqs,
-      sourceWidth,
-      offsetBits,
-      testConfig.sizeEnum
-    )
-
-  // it should "stop enqueueing when full" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries)) { c =>
-  //     // fill up the table
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(sourceId.U)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //       c.io.lookup.ready.poke(false.B)
-  //       c.clock.step()
-  //     }
-
-  //     // now cannot enqueue any more
-  //     c.io.enq.ready.expect(false.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(0.U)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //     c.clock.step()
-  //     c.io.enq.ready.expect(false.B)
-
-  //     // try to lookup all existing entries
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.valid.poke(false.B)
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(sourceId)
-  //       c.io.lookup.valid.expect(true.B)
-  //       c.io.lookup.bits.expect(sourceId)
-  //       c.clock.step()
-  //     }
-
-  //     // now the table should be empty
-  //     for (i <- 0 until entries) {
-  //       val sourceId = i
-  //       c.io.enq.valid.poke(false.B)
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(sourceId)
-  //       c.io.lookup.valid.expect(false.B)
-  //       c.clock.step()
-  //     }
-  //   }
-  // }
-  // it should "lookup matching entry" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries))
-  //     .withAnnotations(Seq(WriteVcdAnnotation)) { c =>
-  //       c.reset.poke(true.B)
-  //       c.clock.step(10)
-  //       c.reset.poke(false.B)
-
-  //       // enqueue one entry to not match at 0th index
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(0.U)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //       c.clock.step()
-
-  //       val targetSourceId = 1.U
-  //       c.io.enq.ready.expect(true.B)
-  //       c.io.enq.valid.poke(true.B)
-  //       c.io.enq.bits.fromLane.poke(0.U)
-  //       c.io.enq.bits.respSourceId.poke(targetSourceId)
-  //       c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-
-  //       c.clock.step()
-
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(targetSourceId)
-  //       c.io.lookup.valid.expect(true.B)
-  //       c.io.lookup.bits.expect(targetSourceId)
-
-  //       c.clock.step()
-
-  //       // test if matching entry dequeues after 1 cycle
-  //       c.io.lookup.ready.poke(true.B)
-  //       c.io.lookupSourceId.poke(targetSourceId)
-  //       c.io.lookup.valid.expect(false.B)
-  //     }
-  // }
-  // it should "handle lookup and enqueue at the same time" in {
-  //   test(new InflightCoalReqTable(numLanes, sourceWidth, entries)) { c =>
-  //     // fill up the table
-  //     val targetSourceId = 1.U
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(0.U)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.clock.step()
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(targetSourceId)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.clock.step()
-
-  //     // do both enqueue and lookup at the same cycle
-  //     val enqSourceId = 2.U
-  //     c.io.enq.ready.expect(true.B)
-  //     c.io.enq.valid.poke(true.B)
-  //     c.io.enq.bits.fromLane.poke(0.U)
-  //     c.io.enq.bits.respSourceId.poke(enqSourceId)
-  //     c.io.enq.bits.reqSourceIds.foreach { id => id.poke(0.U) }
-  //     c.io.lookup.ready.poke(true.B)
-  //     c.io.lookupSourceId.poke(targetSourceId)
-
-  //     c.clock.step()
-  //   }
-  // }
-}