Merge branch 'graphics' of https://github.com/hansungk/rocket-chip into graphics

2023-04-28 20:51:08 -07:00
parent c655874470 fec788d648
commit 12d2912368
2 changed files with 474 additions and 224 deletions
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -36,29 +36,29 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
 }

 case class CoalescerConfig(
-  numLanes: Int,        // number of lanes (or threads) in a warp
-  maxSize: Int,         // maximum burst size (64 bytes)
-  queueDepth: Int,      // request window per lane
-  waitTimeout: Int,     // max cycles to wait before forced fifo dequeue, per lane
-  addressWidth: Int,    // assume <= 32
-  dataBusWidth: Int,    // memory-side downstream TileLink data bus size
-                        // this has to be at least larger than the word size for
-                        // the coalescer to perform well
-  // watermark = 2,     // minimum buffer occupancy to start coalescing
-  wordSizeInBytes: Int, // 32-bit system
-  wordWidth: Int,       // log(WORD_SIZE)
-  numOldSrcIds: Int,    // num of outstanding requests per lane, from processor
-  numNewSrcIds: Int,    // num of outstanding coalesced requests
-  respQueueDepth: Int,  // depth of the response fifo queues
-  coalSizes: Seq[Int],  // list of coalescer sizes to try in the MonoCoalescers
-                        // must be power of 2's
-  sizeEnum: InFlightTableSizeEnum
-)
+  numLanes: Int,          // number of lanes (or threads) in a warp
+  queueDepth: Int,        // request window per lane
+  waitTimeout: Int,       // max cycles to wait before forced fifo dequeue, per lane
+  addressWidth: Int,      // assume <= 32
+  dataBusWidth: Int,      // log2 of the memory-side downstream TileLink data bus
+                          // size in bytes; the bus has to be wider than a word
+                          // for the coalescer to perform well
+  // watermark = 2,       // minimum buffer occupancy to start coalescing
+  wordSizeInBytes: Int,   // word size in bytes (4 for a 32-bit system)
+  wordWidth: Int,         // log2(wordSizeInBytes)
+  numOldSrcIds: Int,      // num of outstanding requests per lane, from processor
+  numNewSrcIds: Int,      // num of outstanding coalesced requests
+  respQueueDepth: Int,    // depth of the response fifo queues
+  coalLogSizes: Seq[Int], // coalesced request sizes to try, one MonoCoalescer per
+                          // entry; each size is log2(size in bytes)
+  sizeEnum: InFlightTableSizeEnum,
+) {
+  // log2 of the largest coalesced request size, in bytes
+  def maxCoalLogSize: Int = coalLogSizes.max
+}

 object defaultConfig extends CoalescerConfig(
  numLanes = 4,
-  // TODO: bigger size
-  maxSize = 3,
  queueDepth = 1,
  waitTimeout = 8,
  addressWidth = 24,
@@ -69,7 +69,7 @@ object defaultConfig extends CoalescerConfig(
  numOldSrcIds = 16,
  numNewSrcIds = 4,
  respQueueDepth = 4,
-  coalSizes = Seq(3),
+  coalLogSizes = Seq(3),
  sizeEnum = DefaultInFlightTableSizeEnum
 )

@@ -153,10 +153,14 @@ class ReqSourceGen(sourceWidth: Int) extends Module {
 // A shift-register queue implementation that supports invalidating entries
 // and exposing queue contents as output IO. (TODO: support deadline)
 // Initially copied from freechips.rocketchip.util.ShiftQueue.
-// If `pipe` is true, support enqueueing to a full queue when also dequeueing.
+// The queue only shifts down while `allowShift` is asserted.  Dequeueing
+// still works when `allowShift` is deasserted, but the queue head then stays
+// invalid after the dequeue.  This option exists in order to synchronize the
+// shifting of the queues between lanes to model the SIMD behavior.
+// If `pipe` is true, support enqueueing to a full queue when the head is being
+// dequeued at the next cycle.
 // Software model: window.py
-class CoalShiftQueue[T <: Data](
-                                 gen: T,
+class CoalShiftQueue[T <: Data]( gen: T,
                                 val entries: Int,
                                 pipe: Boolean = true,
                                 flow: Boolean = false
@@ -164,6 +168,7 @@ class CoalShiftQueue[T <: Data](
  val io = IO(new Bundle {
    val queue = new QueueIO(gen, entries)
    val invalidate = Input(Valid(UInt(entries.W)))
+    val allowShift = Input(Bool())
    val mask = Output(UInt(entries.W))
    val elts = Output(Vec(entries, gen))
    // 'QueueIO' provides io.count, but we might not want to use it in the
@@ -192,7 +197,7 @@ class CoalShiftQueue[T <: Data](
  def paddedUsed = pad({ i: Int => used(i) })
  def validAfterInv(i: Int) = valid(i) && (!io.invalidate.valid || !io.invalidate.bits(i))

-  val shift = (used =/= 0.U) && (io.queue.deq.ready || !validAfterInv(0))
+  val shift = io.allowShift && (used =/= 0.U) && (io.queue.deq.fire || !validAfterInv(0))
  for (i <- 0 until entries) {
    val wdata = if (i == entries - 1) io.queue.enq.bits else Mux(!used(i + 1), io.queue.enq.bits, elts(i + 1))
    val wen = Mux(
@@ -208,27 +213,28 @@ class CoalShiftQueue[T <: Data](
      (io.queue.enq.fire && !paddedUsed(i + 1) && used(i)) || pad(validAfterInv)(i + 1),
      (io.queue.enq.fire && paddedUsed(i - 1) && !used(i)) || validAfterInv(i)
    )
+    // additionally, head entry should get invalidated when dequeue fired
+    // but queue didn't shift (e.g. because allowShift was false)
+    when (io.queue.deq.fire && !shift) {
+      valid(0) := false.B
+    }
  }

  when(io.queue.enq.fire) {
-    when(!io.queue.deq.fire) {
+    when(!shift) {
      used := (used << 1.U) | 1.U
    }
-  }.elsewhen(io.queue.deq.fire) {
+  }.elsewhen(shift) {
    used := used >> 1.U
  }

  io.queue.enq.ready := !valid(entries - 1)
-  // We don't want to invalidate deq.valid response right away even when
-  // io.invalidate(head) is true.
-  // Coalescing unit consumes queue head's validity, and produces its new
-  // validity.  Deasserting deq.valid right away will result in a combinational
-  // cycle.
-  io.queue.deq.valid := valid(0)
+  io.queue.deq.valid := validAfterInv(0)
  io.queue.deq.bits := elts.head

  assert(!flow, "flow-through is not implemented")
  if (flow) {
+    // FIXME old code
    when(io.queue.enq.valid) { io.queue.deq.valid := true.B }
    when(!valid(0)) { io.queue.deq.bits := io.queue.enq.bits }
  }
@@ -243,7 +249,7 @@ class CoalShiftQueue[T <: Data](
 }

 // Software model: coalescer.py
-class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
+class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
                    config: CoalescerConfig) extends Module {
  val io = IO(new Bundle {
    val window = Input(Vec(config.numLanes, windowT.io.cloneType))
@@ -251,8 +257,10 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
      val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
      val baseAddr = Output(UInt(config.addressWidth.W))
      val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
-      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth).W))
-      val coverageHits = Output(UInt((1 << config.maxSize).W))
+      // number of entries matched with this leader lane's head.
+      // maximum is numLanes * queueDepth
+      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
+      val coverageHits = Output(UInt((1 << config.maxCoalLogSize).W))
    })
  })

@@ -277,14 +285,12 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
        leadersValid(i), head.source, head.address)
    }
  }
-
-  // debug assertions and prints
  when (leadersValid.reduce(_ || _)) {
    assert(testNoQueueDrift, "unexpected drift between lane request queues")
-    printQueueHeads
+    // printQueueHeads
  }

-  val size = coalSize
+  val size = coalLogSize
  val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
  def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = {
    (req0.op === req1.op) &&
@@ -294,18 +300,24 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],

  // Gives a 2-D table of Bools representing match at every queue entry,
  // for each lane (so 3-D in total).
-  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
-    // TODO: match leader to only lanes >= leader idx
-    io.window.map { followerLane =>
-      // compare leader's head against follower's every queue entry
-      (followerLane.elts zip followerLane.mask.asBools).map { case (follower, followerValid) =>
-        canMatch(follower, followerValid, leader, leaderValid)
+  val matchTablePerLane = (leaders zip leadersValid).zipWithIndex
+    .map { case ((leader, leaderValid), leaderIndex) =>
+      io.window.zipWithIndex.map { case (followerQueue, followerIndex) =>
+        // compare leader's head against follower's every queue entry
+        (followerQueue.elts zip followerQueue.mask.asBools)
+          .map { case (follower, followerValid) =>
+            // match leader to only followers at lanes >= leader idx
+            // this halves the number of comparators
+            if (followerIndex < leaderIndex) false.B
+            else canMatch(follower, followerValid, leader, leaderValid)
+          }
      }
    }
-  }

  // TODO: potentially expensive: popcount & adder
-  val matchCounts = matchTablePerLane.map(leader => leader.map(PopCount(_)).reduce(_ +& _))
+  val matchCounts = matchTablePerLane.map(table =>
+      table.map(PopCount(_)) // sum up each column
+           .reduce(_ +& _))
  val canCoalesce = matchCounts.map(_ > 1.U)

  // TODO: potentially expensive
@@ -323,6 +335,18 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
  })(chosenLeaderIdx)
  val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)

+  // coverage calculation
+  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth)
+  // 2-D table flattened to 1-D
+  val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address)))
+  val valids = io.window.map(_.mask).flatMap(_.asBools)
+  // indicates whether each word in the coalesced chunk is accessed by any of the
+  // queue entries. e.g. if [ 1 1 1 1 ], all four words in the coalesced data
+  // have been accessed and we've reached 100% utilization.
+  val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target =>
+    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+  }
+
  // debug prints
  when (leadersValid.reduce(_ || _)) {
    matchCounts.zipWithIndex.foreach { case (count, i) =>
@@ -334,14 +358,13 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
      printf("%d ", m)
    }
    printf("]\n")
-  }
+    printf("chosenMatchCount = %d\n", chosenMatchCount)

-  // coverage calculation
-  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth)
-  val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address)))
-  val valids = io.window.map(_.mask).flatMap(_.asBools)
-  val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target =>
-    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+    printf("hits = [ ")
+    hits.foreach { m =>
+      printf("%d ", m)
+    }
+    printf("]\n")
  }

  io.results.leaderIdx := chosenLeaderIdx
@@ -354,19 +377,21 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
 // Software model: coalescer.py
 class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry,
                     config: CoalescerConfig) extends Module {
-
  val io = IO(new Bundle {
+    // coalescing window, connected to the contents of the request queues
    val window = Input(Vec(config.numLanes, windowT.io.cloneType))
-    val outReq = DecoupledIO(coalReqT.cloneType)
+    // generated coalesced request
+    val coalReq = DecoupledIO(coalReqT.cloneType)
+    // invalidate signals going into each request queue (one bit per queue entry)
    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
  })

-  val coalescers = config.coalSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
  coalescers.foreach(_.io.window := io.window)

-  def normalize(x: Seq[UInt]): Seq[UInt] = {
-    x.zip(config.coalSizes).map { case (hits, size) =>
-      (hits << (config.maxSize - size).U).asUInt
+  def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
+    (valPerSize zip config.coalLogSizes).map { case (hits, size) =>
+      (hits << (config.maxCoalLogSize - size).U).asUInt
    }
  }

@@ -378,27 +403,40 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
    }._2
  }

+  // normalize to maximum coalescing size so that we can do fair comparisons
+  // between coalescing results of different sizes
  val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount))
  val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits))

-  val chosenIdx = Wire(UInt(log2Ceil(config.coalSizes.size).W))
+  val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
  val chosenValid = Wire(Bool())
  // minimum 25% coverage
-  val minCoverage = 1.max(1 << (config.maxSize - 4))
+  val minCoverage = 1 << (config.maxCoalLogSize - config.wordWidth - 2).max(0) // a quarter of the words in the largest chunk, at least 1
+
  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
-    chosenIdx := argMax(normalizedHits)
+    chosenSizeIdx := argMax(normalizedHits)
    chosenValid := true.B
+    printf("coalescing success by coverage policy\n")
  }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) {
-    chosenIdx := argMax(normalizedMatches)
+    chosenSizeIdx := argMax(normalizedMatches)
    chosenValid := true.B
+    printf("coalescing success by matches policy\n")
  }.otherwise {
-    chosenIdx := DontCare
+    chosenSizeIdx := DontCare
    chosenValid := false.B
  }

+  def debugPolicyPrint() = {
+    printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount)
+    printf("normalizedMatches[0]=%d\n", normalizedMatches(0))
+    printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits)
+    printf("normalizedHits[0]=%d\n", normalizedHits(0))
+    printf("minCoverage=%d\n", minCoverage.U)
+  }
+
  // create coalesced request
-  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenIdx)
-  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenIdx)
+  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx)
+  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx)

  // flatten requests and matches
  val flatReqs = io.window.flatMap(_.elts)
@@ -411,8 +449,8 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE

  // note: this is word-level coalescing. if finer granularity is needed, need to modify code
  val numWords = (1.U << (chosenSize - config.wordWidth.U)).asUInt
-  val maxWords = 1 << (config.maxSize - config.wordWidth)
-  val addrMask = Wire(UInt(config.maxSize.W))
+  val maxWords = 1 << (config.maxCoalLogSize - config.wordWidth)
+  val addrMask = Wire(UInt(config.maxCoalLogSize.W))
  addrMask := (1.U << chosenSize).asUInt - 1.U

  val data = Wire(Vec(maxWords, UInt((config.wordSizeInBytes * 8).W)))
@@ -420,7 +458,7 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE

  for (i <- 0 until maxWords) {
    val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
-      m && ((req.address(config.maxSize - 1, 0) & addrMask) === i.U)
+      m && ((req.address(config.maxCoalLogSize - 1, 0) & addrMask) === i.U)
    }
    // TODO: SW uses priority encoder, not sure about behavior of MuxCase
    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) =>
@@ -435,18 +473,20 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
  }

  val sourceGen = Module(new ReqSourceGen(log2Ceil(config.numNewSrcIds)))
-  sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created
+  sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created

-  io.outReq.bits.source := sourceGen.io.id.bits
-  io.outReq.bits.mask := mask.asUInt
-  io.outReq.bits.data := data.asUInt
-  io.outReq.bits.size := chosenSize
-  io.outReq.bits.address := chosenBundle.baseAddr
-  io.outReq.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op
-  io.outReq.valid := chosenValid && sourceGen.io.id.valid
+  val coalesceValid = chosenValid && sourceGen.io.id.valid
+
+  io.coalReq.bits.source := sourceGen.io.id.bits
+  io.coalReq.bits.mask := mask.asUInt
+  io.coalReq.bits.data := data.asUInt
+  io.coalReq.bits.size := chosenSize
+  io.coalReq.bits.address := chosenBundle.baseAddr
+  io.coalReq.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op
+  io.coalReq.valid := coalesceValid

  io.invalidate.bits := chosenBundle.matchOH
-  io.invalidate.valid := io.outReq.fire // invalidate only when fire
+  io.invalidate.valid := io.coalReq.fire // invalidate only when fire

  dontTouch(io.invalidate) // debug

@@ -471,7 +511,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
    Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth))
  }

-  val coalReqT = new ReqQueueEntry(sourceWidth, log2Ceil(config.maxSize), config.addressWidth, config.maxSize)
+  val coalReqT = new ReqQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize + 1), config.addressWidth, config.maxCoalLogSize) // NOTE(review): presumably arg 2 is the bit width of `size`; +1 lets it hold maxCoalLogSize itself -- confirm
  val coalescer = Module(new MultiCoalescer(reqQueues.head, coalReqT, config))
  coalescer.io.window := reqQueues.map(_.io)

@@ -511,20 +551,26 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
      assert(reqQueue.io.queue.enq.ready, "reqQueue is supposed to be always ready")
      reqQueue.io.queue.enq.valid := tlIn.a.valid
      reqQueue.io.queue.enq.bits := req
-      // TODO: deq.ready should respect downstream ready
+      // TODO: deq.ready should respect downstream arbiter
      reqQueue.io.queue.deq.ready := true.B
+      // invalidate queue entries that contain original core requests that got
+      // coalesced into a wider one
      reqQueue.io.invalidate.bits := coalescer.io.invalidate.bits(lane)
      reqQueue.io.invalidate.valid := coalescer.io.invalidate.valid
+      reqQueue.io.allowShift := true.B

+      // NOTE: this relies on CoalShiftQueue combinationally deasserting
+      // deq.valid in the same cycle that the invalidate signal for its head
+      // is asserted.
      tlOut.a.valid := reqQueue.io.queue.deq.valid
      tlOut.a.bits := reqQueue.io.queue.deq.bits.toTLA(edgeOut)
  }

  val (tlCoal, edgeCoal) = outer.coalescerNode.out(0)

-  tlCoal.a.valid := coalescer.io.outReq.valid
-  tlCoal.a.bits := coalescer.io.outReq.bits.toTLA(edgeCoal)
-  coalescer.io.outReq.ready := tlCoal.a.ready
+  tlCoal.a.valid := coalescer.io.coalReq.valid
+  tlCoal.a.bits := coalescer.io.coalReq.bits.toTLA(edgeCoal)
+  coalescer.io.coalReq.ready := tlCoal.a.ready
  tlCoal.b.ready := true.B
  tlCoal.c.valid := false.B
  // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
@@ -541,7 +587,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  // coalesced request.  Upper bound is min(DEPTH, 2**sourceWidth).
  val numPerLaneReqs = config.queueDepth

-  val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxSize), config.maxSize)
+  val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize), config.maxCoalLogSize)
  val respQueues = Seq.tabulate(config.numLanes) { _ =>
    Module(
      new MultiPortQueue(
@@ -550,6 +596,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
        // requests that didn't get coalesced, and M is the maximum number of
        // single-lane requests that can go into a coalesced request.
        // (`numPerLaneReqs`).
+        // TODO: potentially expensive, because this generates more FFs.
+        // Rather than enqueueing all responses in a single cycle, consider
+        // enqueueing one by one (at the cost of possibly stalling downstream).
        1 + numPerLaneReqs,
        // deq_lanes = 1 because we're serializing all responses to 1 port that
        // goes back to the core.
@@ -566,7 +615,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
    )
  }
  val respQueueNoncoalPort = 0
-  val respQueueCoalPortOffset = 1
+  val respQueueUncoalPortOffset = 1

  (outer.node.in zip outer.node.out).zipWithIndex.foreach {
    case (((tlIn, edgeIn), (tlOut, _)), 0) => // TODO: not necessarily 1 master edge
@@ -645,51 +694,40 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  // logic to generate the Inflight Entry into the uncoalescer, where it should be.
  // this also reduces top level clutter.

-  val offsetBits = 4 // FIXME hardcoded
-  // but the width of the size enum
-  val newEntry = Wire(
-    new InflightCoalReqTableEntry(
-      config.numLanes,
-      numPerLaneReqs,
-      sourceWidth,
-      offsetBits,
-      config.sizeEnum
-    )
-  )
-  println(s"=========== table sourceWidth: ${sourceWidth}")
-  // println(s"=========== table sizeEnumBits: ${newEntry.sizeEnumBits}")
-  newEntry.source := coalescer.io.outReq.bits.source
+  val uncoalescer = Module(new Uncoalescer(config))
+
+  val newEntry = Wire(uncoalescer.inflightTable.entryT)
+  newEntry.source := coalescer.io.coalReq.bits.source

  // TODO: richard to write table fill logic
-  // FIXME: this assertion used to say 1 << config.MAX_SIZE
-  // I changed this to say DATA BUS SIZE. We need another assertion
-  // to assert that MAX_SIZE is <= DATA_BUS_SIZE because we do not support
-  // multi-beat writes currently
-  assert(
+  assert (config.maxCoalLogSize <= config.dataBusWidth,
+    "multi-beat coalesced reads/writes are currently not supported")
+  assert (
    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
-    s"tlCoal param dataBits (${tlCoal.params.dataBits}) mismatch coalescer constant"
+    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
    + s" (${(1 << config.dataBusWidth) * 8})"
  )
-  val origReqs = reqQueues.map(q => q.io.queue.deq.bits)
-  newEntry.lanes.foreach { l =>
-    l.reqs.zipWithIndex.foreach { case (r, i) =>
-      // TODO: this part needs the actual coalescing logic to work
-      r.valid := false.B
-      r.source := origReqs(i).source
-      r.offset := (origReqs(i).address % (1 << config.maxSize).U) >> config.wordWidth
-      r.sizeEnum := config.sizeEnum.logSizeToEnum(origReqs(i).size)
+  val reqQueueHeads = reqQueues.map(q => q.io.queue.deq.bits)
+  // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
+  // coalescer to every (numLanes * queueDepth) entry in the inflight table.
+  (newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex
+    .foreach { case ((laneEntry, laneInv), lane) =>
+      (laneEntry.reqs zip laneInv.asBools).zipWithIndex
+        .foreach { case ((reqEntry, inv), i) =>
+          val req = reqQueues(lane).io.elts(i)
+          when ((coalescer.io.invalidate.valid && inv)) {
+            printf(s"coalescer: reqQueue(${lane})(${i}) got invalidated (source=%d)\n", req.source)
+          }
+          reqEntry.valid := (coalescer.io.invalidate.valid && inv)
+          reqEntry.source := req.source
+          reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordWidth)
+          reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
+          // TODO: load/store op
+        }
    }
-  }
-  newEntry.lanes(0).reqs(0).valid := true.B
-  newEntry.lanes(1).reqs(0).valid := true.B
-  newEntry.lanes(2).reqs(0).valid := true.B
-  newEntry.lanes(3).reqs(0).valid := true.B
  dontTouch(newEntry)

-  // Uncoalescer module uncoalesces responses back to each lane
-  val uncoalescer = Module(new UncoalescingUnit(config))
-
-  uncoalescer.io.coalReqValid := coalescer.io.outReq.valid
+  uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid
  uncoalescer.io.newEntry := newEntry
  // Cleanup: custom <>?
  uncoalescer.io.coalResp.valid := tlCoal.d.valid
@@ -698,22 +736,26 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  tlCoal.d.ready := uncoalescer.io.coalResp.ready

  // Queue up synthesized uncoalesced responses into each lane's response queue
-  (respQueues zip uncoalescer.io.uncoalResps).foreach { case (q, lanes) =>
-    lanes.zipWithIndex.foreach { case (resp, i) =>
+  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) =>
+    perLaneResps.zipWithIndex.foreach { case (resp, i) =>
      // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
      // cache.  This should ideally not happen though.
      assert(
-        q.io.enq(respQueueCoalPortOffset + i).ready,
-        s"respQueue: enq port for 0-th coalesced response is blocked"
+        q.io.enq(respQueueUncoalPortOffset + i).ready,
+        s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
      )
-      q.io.enq(respQueueCoalPortOffset + i).valid := resp.valid
-      q.io.enq(respQueueCoalPortOffset + i).bits := resp.bits
+      q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
+      q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
+      // debug
+      // when (resp.valid) {
+      //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
+      // }
      // dontTouch(q.io.enq(respQueueCoalPortOffset))
    }
  }

  // Debug
-  dontTouch(coalescer.io.outReq)
+  dontTouch(coalescer.io.coalReq)
  val coalRespData = tlCoal.d.bits.data
  dontTouch(coalRespData)

@@ -730,10 +772,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
 // FIXME: overlaps with RespQueueEntry. Trait-ify
 class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle {
  val source = UInt(log2Ceil(config.numNewSrcIds).W)
-  val data = UInt((8 * (1 << config.maxSize)).W)
+  val data = UInt((8 * (1 << config.maxCoalLogSize)).W)
 }

-class UncoalescingUnit(config: CoalescerConfig) extends Module {
+class Uncoalescer(config: CoalescerConfig) extends Module {
  // notes to hansung:
  //  val numLanes: Int, <-> config.NUM_LANES
  //  val numPerLaneReqs: Int, <-> config.DEPTH
@@ -833,19 +875,21 @@ class UncoalescingUnit(config: CoalescerConfig) extends Module {
 // split the coalesced response back to individual per-lane responses with the
 // right metadata.
 class InflightCoalReqTable(config: CoalescerConfig) extends Module {
-  val offsetBits = 4 // FIXME hardcoded
-  val sizeBits = 2 // FIXME hardcoded
+  val offsetBits = config.maxCoalLogSize - config.wordWidth // assumes word offset
  val entryT = new InflightCoalReqTableEntry(
    config.numLanes,
    config.queueDepth,
    log2Ceil(config.numOldSrcIds),
-    config.maxSize,
+    config.maxCoalLogSize,
    config.sizeEnum
  )

  val entries = config.numNewSrcIds
  val sourceWidth = log2Ceil(config.numOldSrcIds)

+  println(s"=========== table sourceWidth: ${sourceWidth}")
+  println(s"=========== table sizeEnumBits: ${entryT.sizeEnumT.getWidth}")
+
  val io = IO(new Bundle {
    val enq = Flipped(Decoupled(entryT))
    // TODO: return actual stuff
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -35,26 +35,46 @@ class MultiPortQueueUnitTest extends AnyFlatSpec with ChiselScalatestTester {

 class DummyCoalescingUnitTB(implicit p: Parameters) extends LazyModule {
  val cpuNodes = Seq.tabulate(testConfig.numLanes) { _ =>
-    TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters(
-      name = "processor-nodes",
-      sourceId = IdRange(0, testConfig.numOldSrcIds),
-//      requestFifo = true,
-      visibility = Seq(AddressSet(0x0, 0xffffff))))))) // 24 bit address space (TODO probably use testConfig)
+    TLClientNode(
+      Seq(
+        TLMasterPortParameters.v1(
+          Seq(
+            TLClientParameters(
+              name = "processor-nodes",
+              sourceId = IdRange(0, testConfig.numOldSrcIds),
+              visibility = Seq(AddressSet(0x0, 0xffffff))
+            )
+          )
+        )
+      )
+    ) // 24 bit address space (TODO probably use testConfig)
  }

  val device = new SimpleDevice("dummy", Seq("dummy"))
  val beatBytes = 1 << testConfig.dataBusWidth // 256 bit bus
  val l2Nodes = Seq.tabulate(5) { _ =>
-    TLManagerNode(Seq(TLSlavePortParameters.v1(Seq(TLManagerParameters(
-      address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode
-      resources = device.reg,
-      regionType = RegionType.UNCACHED,
-      executable = true,
-      supportsGet = TransferSizes(1, beatBytes),
-      supportsPutFull = TransferSizes(1, beatBytes),
-      supportsPutPartial = TransferSizes(1, beatBytes),
-      supportsHint = TransferSizes(1, beatBytes),
-      fifoId = Some(0))), beatBytes)))
+    TLManagerNode(
+      Seq(
+        TLSlavePortParameters.v1(
+          Seq(
+            TLManagerParameters(
+              address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode
+              resources = device.reg,
+              regionType = RegionType.UNCACHED,
+              executable = true,
+              supportsArithmetic = TransferSizes(1, beatBytes),
+              supportsLogical = TransferSizes(1, beatBytes),
+              supportsGet = TransferSizes(1, beatBytes),
+              supportsPutFull = TransferSizes(1, beatBytes),
+              supportsPutPartial = TransferSizes(1, beatBytes),
+              supportsHint = TransferSizes(1, beatBytes),
+              fifoId = Some(0)
+            )
+          ),
+          beatBytes
+        )
+      )
+    )
  }

  val dut = LazyModule(new CoalescingUnit(testConfig))
@@ -81,84 +101,116 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI
 //  val coalMasterNode = coal.coalescerNode.makeIOs()
 }

+object testConfig extends CoalescerConfig(
+  numLanes = 4,
+  queueDepth = 1,
+  waitTimeout = 8,
+  addressWidth = 24,
+  dataBusWidth = 5,
+  // watermark = 2,
+  wordSizeInBytes = 4,
+  wordWidth = 2,
+  numOldSrcIds = 16,
+  numNewSrcIds = 4,
+  respQueueDepth = 4,
+  coalLogSizes = Seq(3),
+  sizeEnum = DefaultInFlightTableSizeEnum
+)
+
 class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
  behavior of "multi- and mono-coalescers"

-  it should "coalesce fully consecutive accesses at size 4, only once" in {
-    implicit val p: Parameters = Parameters.empty
+  implicit val p: Parameters = Parameters.empty

-    val tb = LazyModule(new DummyCoalescingUnitTB())
-//    val outer = LazyModule(new CoalescingUnit(testConfig))
-
-    val coal = tb.dut
-
-    test(tb.module).withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c =>
-      val nodes = c.coalIOs.map(_.head)
-//      val nodes = c.cpuNodesImp.map(_.out.head._1)
-//      val nodes = c.coal.node.in.map(_._1)
-//      val nodes = c.mitmNodesImp.map(_.in.head._1)
-
-      def pokeA(nodes: Seq[TLBundle], idx: Int, op: Int, size: Int, source: Int, addr: Int, mask: Int, data: Int): Unit = {
-        val node = nodes(idx)
+  def pokeA(
+      nodes: Seq[TLBundle],
+      idx: Int,
+      op: Int,
+      size: Int,
+      source: Int,
+      addr: Int,
+      mask: Int,
+      data: Int
+  ): Unit = {
+    val node = nodes(idx)
 //        node.a.ready.expect(true.B) // FIXME: this fails currently
-        node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get)
-        node.a.bits.param.poke(0.U)
-        node.a.bits.size.poke(size.U)
-        node.a.bits.source.poke(source.U)
-        node.a.bits.address.poke(addr.U)
-        node.a.bits.mask.poke(mask.U)
-        node.a.bits.data.poke(data.U)
-        node.a.bits.corrupt.poke(false.B)
-        node.a.valid.poke(true.B)
-      }
+    node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get)
+    node.a.bits.param.poke(0.U)
+    node.a.bits.size.poke(size.U)
+    node.a.bits.source.poke(source.U)
+    node.a.bits.address.poke(addr.U)
+    node.a.bits.mask.poke(mask.U)
+    node.a.bits.data.poke(data.U)
+    node.a.bits.corrupt.poke(false.B)
+    node.a.valid.poke(true.B)
+  }

-      def unsetA(): Unit = {
-        nodes.foreach { node =>
-          node.a.valid.poke(false.B)
-        }
-      }
+  def unsetA(nodes: Seq[TLBundle]): Unit = {
+    nodes.foreach { node =>
+      node.a.valid.poke(false.B)
+    }
+  }

-      // always ready to take coalesced requests
-//      c.coalMasterNode.head.a.ready.poke(true.B)
-//      c.coal.module.coalescer.io.outReq.ready.poke(true.B)
+  // it should "coalesce fully consecutive accesses at size 4, only once" in {
+  //   test(makeTb().module)
+  //   .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation))
+  //   { c =>
+  //     println(s"coalIO length = ${c.coalIOs(0).length}")
+  //     val nodes = c.coalIOs.map(_.head)
+// //      val nodes = c.cpuNodesImp.map(_.out.head._1)
+// //      val nodes = c.coal.node.in.map(_._1)
+// //      val nodes = c.mitmNodesImp.map(_.in.head._1)

-      pokeA(nodes, idx=0, op=1, size=2, source=0, addr=0x10, mask=0xf, data=0x1111)
-      pokeA(nodes, idx=1, op=1, size=2, source=0, addr=0x14, mask=0xf, data=0x2222)
-      pokeA(nodes, idx=2, op=1, size=2, source=0, addr=0x18, mask=0xf, data=0x3333)
-      pokeA(nodes, idx=3, op=1, size=2, source=0, addr=0x1c, mask=0xf, data=0x4444)
+  //     // always ready to take coalesced requests
+// //      c.coalMasterNode.head.a.ready.poke(true.B)
+// //      c.coal.module.coalescer.io.outReq.ready.poke(true.B)
+
+  //     pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111)
+  //     pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222)
+  //     pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333)
+  //     pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444)
+
+  //     c.clock.step()
+
+  //     unsetA(nodes)
+
+  //     c.clock.step()
+  //     c.clock.step()
+  //   }
+  // }
+
+  it should "coalesce identical addresses (stride of 0)" in {
+    test(LazyModule(new DummyCoalescingUnitTB()).module)
+    .withAnnotations(Seq(VcsBackendAnnotation))
+    { c =>
+      println(s"coalIO length = ${c.coalIOs(0).length}")
+      val nodes = c.coalIOs.map(_.head)
+
+      pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x1111)
+      pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x2222)
+      pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333)
+      pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x4444)

      c.clock.step()

-      unsetA()
+      unsetA(nodes)

      c.clock.step()
      c.clock.step()
    }
  }

-  it should "coalesce strided accesses at size 6" in {
+  it should "coalesce strided accesses at size 6" in {}

-  }
+  it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in {}

-  it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in {
+  it should "not touch uncoalescable requests" in {}

-  }
+  it should "allow temporal coalescing when depth >=2" in {}

-  it should "not touch uncoalescable requests" in {
+  it should "select the most coverage mono-coalescer" in {}

-  }
-
-  it should "allow temporal coalescing when depth >=2" in {
-
-  }
-
-  it should "select the most coverage mono-coalescer" in {
-
-  }
-
-  it should "resort to the backup policy when coverage is below average" in {
-
-  }
+  it should "resort to the backup policy when coverage is below average" in {}
 }

 class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
@@ -167,6 +219,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
  it should "work like normal shiftqueue when no invalidate" in {
    test(new CoalShiftQueue(UInt(8.W), 4)) { c =>
      c.io.queue.deq.ready.poke(false.B)
+      c.io.allowShift.poke(true.B)

      c.io.queue.enq.ready.expect(true.B)
      c.io.queue.enq.valid.poke(true.B)
@@ -215,6 +268,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
  it should "work when enqueing and dequeueing simultaneously" in {
    test(new CoalShiftQueue(UInt(8.W), 4)) { c =>
      c.io.invalidate.valid.poke(false.B)
+      c.io.allowShift.poke(true.B)

      // prepare
      c.io.queue.deq.ready.poke(true.B)
@@ -243,9 +297,47 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
    }
  }

-  it should "work when enqueing and dequeueing simultaneously to a full queue" in {
+  it should "not shift entries when allowShift is false" in {
+    test(new CoalShiftQueue(UInt(8.W), 4)) { c =>
+      c.io.invalidate.valid.poke(false.B)
+      c.io.queue.deq.ready.poke(false.B)
+
+      c.io.allowShift.poke(false.B)
+
+      // prepare
+      c.io.queue.enq.ready.expect(true.B)
+      c.io.queue.enq.valid.poke(true.B)
+      c.io.queue.enq.bits.poke(0x12.U)
+      c.clock.step()
+      c.io.queue.enq.ready.expect(true.B)
+      c.io.queue.enq.valid.poke(true.B)
+      c.io.queue.enq.bits.poke(0x34.U)
+      c.clock.step()
+      c.io.queue.enq.valid.poke(false.B)
+
+      // dequeueing should work normally when allowShift is false...
+      c.io.queue.deq.ready.poke(true.B)
+      c.io.queue.deq.valid.expect(true.B)
+      c.io.queue.deq.bits.expect(0x12.U)
+      c.clock.step()
+      // but should stop there and not dequeue the next entry
+      c.io.queue.deq.ready.poke(true.B)
+      c.io.queue.deq.valid.expect(false.B)
+      c.clock.step()
+      // when allowShift is back on, dequeueing should start working from the
+      // next cycle
+      c.io.allowShift.poke(true.B)
+      c.clock.step()
+      c.io.queue.deq.ready.poke(true.B)
+      c.io.queue.deq.valid.expect(true.B)
+      c.io.queue.deq.bits.expect(0x34.U)
+    }
+  }
+
+  it should "work when enqueing and dequeueing simultaneously to a depth=1 queue" in {
    test(new CoalShiftQueue(UInt(8.W), 1)) { c =>
      c.io.invalidate.valid.poke(false.B)
+      c.io.allowShift.poke(true.B)

      // prepare
      c.io.queue.deq.ready.poke(true.B)
@@ -282,9 +374,47 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
    }
  }

-  it should "invalidate head being dequeued" in {
+  it should "work when invalidating and enqueueing to a depth=1 queue" in {
+    test(new CoalShiftQueue(UInt(8.W), 1)) { c =>
+      c.io.invalidate.valid.poke(false.B)
+      c.io.allowShift.poke(true.B)
+      // no dequeueing
+      c.io.queue.deq.ready.poke(false.B)
+
+      // prepare
+      c.io.queue.enq.ready.expect(true.B)
+      c.io.queue.enq.valid.poke(true.B)
+      c.io.queue.enq.bits.poke(0x12.U)
+      c.clock.step()
+      // invalidate, but don't allow shift
+      c.io.allowShift.poke(false.B)
+      c.io.invalidate.valid.poke(true.B)
+      c.io.invalidate.bits.poke(0x1.U)
+      // TODO: we might be able to enqueue to a full depth=1 queue whose only
+      // entry just got invalidated, so that enq.ready is true here, but
+      // it is a niche case
+      c.io.queue.enq.ready.expect(false.B)
+      c.clock.step()
+      // now that we have space, try enqueueing
+      c.io.allowShift.poke(true.B)
+      c.io.invalidate.valid.poke(false.B)
+      c.io.queue.enq.ready.expect(true.B)
+      c.io.queue.enq.valid.poke(true.B)
+      c.io.queue.enq.bits.poke(0x34.U)
+      c.io.queue.deq.valid.expect(false.B)
+      c.clock.step()
+      // see if it comes out right next cycle
+      c.io.queue.enq.valid.poke(false.B)
+      c.io.queue.deq.ready.poke(true.B)
+      c.io.queue.deq.valid.expect(true.B)
+      c.io.queue.deq.bits.expect(0x34.U)
+    }
+  }
+
+  it should "invalidate head that is also being dequeued" in {
    test(new CoalShiftQueue(UInt(8.W), 4)) { c =>
      c.io.invalidate.valid.poke(false.B)
+      c.io.allowShift.poke(true.B)

      // prepare
      c.io.queue.deq.ready.poke(false.B)
@@ -300,12 +430,11 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
      c.io.queue.enq.valid.poke(false.B)

      // invalidate should work for the head just being dequeued at the same
-      // cycle.  However, it should not change deq.valid right away to avoid
-      // combinational cycles (see definition).
+      // cycle
      c.io.invalidate.valid.poke(true.B)
      c.io.invalidate.bits.poke(0x1.U)
      c.io.queue.deq.ready.poke(true.B)
-      c.io.queue.deq.valid.expect(true.B)
+      c.io.queue.deq.valid.expect(false.B)
      c.clock.step()
      // 0x12 should have been dequeued
      c.io.invalidate.valid.poke(false.B)
@@ -315,10 +444,12 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
    }
  }

-  it should "dequeue invalidated entries by itself" in {
+  it should "dequeue invalidated head on its own when allowShift" in {
    test(new CoalShiftQueue(gen = UInt(8.W), entries = 4)) { c =>
      c.io.invalidate.valid.poke(false.B)

+      c.io.allowShift.poke(true.B)
+
      // prepare
      c.io.queue.deq.ready.poke(false.B)
      c.io.queue.enq.ready.expect(true.B)
@@ -338,19 +469,33 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
      // invalidate two entries at head
      c.io.invalidate.valid.poke(true.B)
      c.io.invalidate.bits.poke(0x3.U)
+      c.io.queue.deq.ready.poke(false.B)
      // [ 0x56 | 0x34(inv) | 0x12(inv) ]
      c.clock.step()
-      // [ 0x56 | 0x34(inv) ]
+      //             [ 0x56 | 0x34(inv) ]
      c.io.invalidate.valid.poke(false.B)
      c.io.queue.deq.ready.poke(false.B)
      c.clock.step()
-      // [ 0x56 ]
+      //                         [ 0x56 ]
      c.io.queue.deq.ready.poke(true.B)
      c.io.queue.deq.valid.expect(true.B)
      c.io.queue.deq.bits.expect(0x56.U)
      c.clock.step()
      c.io.queue.deq.ready.poke(true.B)
      c.io.queue.deq.valid.expect(false.B)
+      c.clock.step()
+
+      // do one more enqueue-then-dequeue to see if used bit was properly cleared
+      c.io.queue.deq.ready.poke(false.B)
+      c.io.queue.enq.ready.expect(true.B)
+      c.io.queue.enq.valid.poke(true.B)
+      c.io.queue.enq.bits.poke(0x78.U)
+      c.clock.step()
+      // should dequeue right away
+      c.io.queue.enq.valid.poke(false.B)
+      c.io.queue.deq.ready.poke(true.B)
+      c.io.queue.deq.valid.expect(true.B)
+      c.io.queue.deq.bits.expect(0x78.U)
    }
  }

@@ -358,6 +503,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
    test(new CoalShiftQueue(UInt(8.W), 4)) { c =>
      c.io.invalidate.valid.poke(false.B)
      c.io.invalidate.bits.poke(0.U)
+      c.io.allowShift.poke(true.B)

      // prepare
      c.io.queue.deq.ready.poke(false.B)
@@ -383,24 +529,23 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
  }
 }

-object testConfig extends CoalescerConfig(
-  maxSize = 5,
+object uncoalescerTestConfig extends CoalescerConfig(
+  numLanes = 4,
  queueDepth = 2,
  waitTimeout = 8,
  addressWidth = 24,
  dataBusWidth = 5,
-  numLanes = 4,
  // watermark = 2,
  wordSizeInBytes = 4,
  wordWidth = 2,
  numOldSrcIds = 16,
  numNewSrcIds = 4,
  respQueueDepth = 4,
-  coalSizes = Seq(4, 5),
+  coalLogSizes = Seq(4),
  sizeEnum = DefaultInFlightTableSizeEnum
 )

-class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester {
+class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
  behavior of "uncoalescer"
  val numLanes = 4
  val numPerLaneReqs = 2
@@ -410,8 +555,8 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester {
  val coalDataWidth = 128
  val numInflightCoalRequests = 4

-  it should "work" in {
-    test(new UncoalescingUnit(testConfig))
+  it should "work in general case" in {
+    test(new Uncoalescer(uncoalescerTestConfig))
    // vcs helps with simulation time, but sometimes errors with
    // "mutation occurred during iteration" java error
    // .withAnnotations(Seq(VcsBackendAnnotation))
@@ -426,7 +571,7 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester {
      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
      c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B)
      c.io.newEntry.lanes(0).reqs(1).source.poke(2.U)
-      c.io.newEntry.lanes(0).reqs(1).offset.poke(0.U)
+      c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) // same offset to different lanes
      c.io.newEntry.lanes(0).reqs(1).sizeEnum.poke(four)
      c.io.newEntry.lanes(1).reqs(0).valid.poke(false.B)
      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
@@ -460,7 +605,7 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester {
      // offset is counting from LSB
      c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
      c.io.uncoalResps(0)(0).bits.source.expect(1.U)
-      c.io.uncoalResps(0)(1).bits.data.expect(0xdeadbeefL.U)
+      c.io.uncoalResps(0)(1).bits.data.expect(0x5ca1ab1eL.U)
      c.io.uncoalResps(0)(1).bits.source.expect(2.U)
      c.io.uncoalResps(2)(0).bits.data.expect(0x89abcdefL.U)
      c.io.uncoalResps(2)(0).bits.source.expect(2.U)
@@ -468,6 +613,67 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester {
      c.io.uncoalResps(2)(1).bits.source.expect(2.U)
    }
  }
+
+  it should "uncoalesce when coalesced to the same word offset" in {
+    test(new Uncoalescer(uncoalescerTestConfig))
+    // .withAnnotations(Seq(VcsBackendAnnotation))
+    { c =>
+      val sourceId = 0.U
+      val four = c.io.newEntry.sizeEnumT.FOUR
+      c.io.coalReqValid.poke(true.B)
+      c.io.newEntry.source.poke(sourceId)
+      c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B)
+      c.io.newEntry.lanes(0).reqs(0).source.poke(0.U)
+      c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U)
+      c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four)
+      c.io.newEntry.lanes(0).reqs(1).valid.poke(false.B)
+      c.io.newEntry.lanes(1).reqs(0).valid.poke(true.B)
+      c.io.newEntry.lanes(1).reqs(0).source.poke(1.U)
+      c.io.newEntry.lanes(1).reqs(0).offset.poke(1.U)
+      c.io.newEntry.lanes(1).reqs(0).sizeEnum.poke(four)
+      c.io.newEntry.lanes(1).reqs(1).valid.poke(false.B)
+      c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B)
+      c.io.newEntry.lanes(2).reqs(0).source.poke(2.U)
+      c.io.newEntry.lanes(2).reqs(0).offset.poke(1.U)
+      c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four)
+      c.io.newEntry.lanes(2).reqs(1).valid.poke(false.B)
+      c.io.newEntry.lanes(3).reqs(0).valid.poke(true.B)
+      c.io.newEntry.lanes(3).reqs(0).source.poke(3.U)
+      c.io.newEntry.lanes(3).reqs(0).offset.poke(1.U)
+      c.io.newEntry.lanes(3).reqs(0).sizeEnum.poke(four)
+      c.io.newEntry.lanes(3).reqs(1).valid.poke(false.B)
+
+      c.clock.step()
+
+      c.io.coalReqValid.poke(false.B)
+
+      c.clock.step()
+
+      c.io.coalResp.valid.poke(true.B)
+      c.io.coalResp.bits.source.poke(sourceId)
+      val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL)
+      c.io.coalResp.bits.data.poke(lit.U)
+
+      // table lookup is combinational at the same cycle
+      // offset is counting from LSB
+      c.io.uncoalResps(0)(0).valid.expect(true.B)
+      c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(0)(0).bits.source.expect(0.U)
+      c.io.uncoalResps(0)(1).valid.expect(false.B)
+      c.io.uncoalResps(1)(0).valid.expect(true.B)
+      c.io.uncoalResps(1)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(1)(0).bits.source.expect(1.U)
+      c.io.uncoalResps(1)(1).valid.expect(false.B)
+      c.io.uncoalResps(2)(0).valid.expect(true.B)
+      c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(2)(0).bits.source.expect(2.U)
+      c.io.uncoalResps(2)(1).valid.expect(false.B)
+      c.io.uncoalResps(3)(0).valid.expect(true.B)
+      c.io.uncoalResps(3)(0).bits.data.expect(0x5ca1ab1eL.U)
+      c.io.uncoalResps(3)(0).bits.source.expect(3.U)
+      c.io.uncoalResps(3)(1).valid.expect(false.B)
+    }
+  }
 }

 class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester {