From 0c8909cb43d3d48a9a2f59736e943831e914982b Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 11 May 2023 16:11:39 -0700
Subject: [PATCH] scalafmt

---
 src/main/scala/tilelink/Coalescing.scala | 480 ++++++++++++++---------
 1 file changed, 302 insertions(+), 178 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 373337a..0e72dae 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -152,7 +152,7 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
       fromSource = this.source,
       toAddress = this.address,
       lgSize = this.size,
-      data = this.data,
+      data = this.data
     )
     val (glegal, gbits) = edgeOut.Get(
       fromSource = this.source,
@@ -166,17 +166,22 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
   }
 }
 case class NonCoalescedRequest(config: CoalescerConfig)
-extends Request(sourceWidth = log2Ceil(config.numOldSrcIds),
-  sizeWidth = config.wordSizeWidth,
-  addressWidth = config.addressWidth,
-  dataWidth = config.wordSizeInBytes * 8)
+    extends Request(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      addressWidth = config.addressWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
 case class CoalescedRequest(config: CoalescerConfig)
-extends Request(sourceWidth = log2Ceil(config.numNewSrcIds),
-  sizeWidth = log2Ceil(config.maxCoalLogSize),
-  addressWidth = config.addressWidth,
-  dataWidth = (8 * (1 << config.maxCoalLogSize)))
+    extends Request(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      addressWidth = config.addressWidth,
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )
 
-class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
+class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int)
+    extends Bundle {
   val op = UInt(1.W) // 0=READ 1=WRITE
   val size = UInt(sizeWidth.W)
   val source = UInt(sourceWidth.W)
@@ -205,17 +210,22 @@ class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle
   }
 }
 case class NonCoalescedResponse(config: CoalescerConfig)
-extends Response(sourceWidth = log2Ceil(config.numOldSrcIds),
-  sizeWidth = config.wordSizeWidth,
-  dataWidth = config.wordSizeInBytes * 8)
+    extends Response(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
 case class CoalescedResponse(config: CoalescerConfig)
-extends Response(sourceWidth = log2Ceil(config.numNewSrcIds),
-  sizeWidth = log2Ceil(config.maxCoalLogSize),
-  dataWidth = (8 * (1 << config.maxCoalLogSize)))
+    extends Response(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )
 
 // If `ignoreInUse`, just keep giving out new IDs without checking if it is in
 // use.
-class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
+class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
+    extends Module {
   val io = IO(new Bundle {
     val gen = Input(Bool())
     val reclaim = Input(Valid(UInt(sourceWidth.W)))
@@ -234,15 +244,16 @@ class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) e
 
   io.id.valid := (if (ignoreInUse) true.B else !occupancyTable(head).valid)
   io.id.bits := head
-  when (io.gen && io.id.valid /* fire */) {
+  when(io.gen && io.id.valid /* fire */ ) {
     occupancyTable(io.id.bits).valid := true.B // mark in use
   }
-  when (io.reclaim.valid) {
+  when(io.reclaim.valid) {
     occupancyTable(io.reclaim.bits).valid := false.B // mark freed
   }
 }
 
-class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module {
+class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
+    extends Module {
   val io = IO(new Bundle {
     val queue = new Bundle {
       val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
@@ -259,7 +270,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 //  eltPrototype.valid := false.B
 
   val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
-  val writePtr = RegInit(VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W))))
+  val writePtr = RegInit(
+    VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
+  )
   val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))
 
   private def resetElts = {
@@ -270,7 +283,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
       }
     }
   }
-  when (reset.asBool) {
+  when(reset.asBool) {
     resetElts
   }
 
@@ -286,14 +299,17 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
   // current cycle.
   //
   // shift hint is when the heads have no more coalescable left this or next cycle
-  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
-    c && !(io.invalidate.valid && inv)
-  }.reduce(_ || _)
+  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0)))
+    .map { case (c, inv) =>
+      c && !(io.invalidate.valid && inv)
+    }
+    .reduce(_ || _)
   val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
   // valid && !fire means we enable enqueueing to a full queue, provided the
   // arbiter is taking away all remaining valid queue heads in the next cycle so
   // that we make space for the entire next warp.
-  val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
+  val syncedDeqValidNextCycle =
+    io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
 
   for (i <- 0 until config.numLanes) {
     val enq = io.queue.enq(i)
@@ -313,20 +329,22 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
     // can take new entries if not empty, or if full but shifting
     enq.ready := (!ctrl.full) || ctrl.shift
 
-    when (ctrl.shift) {
+    when(ctrl.shift) {
       // shift, invalidate tail, invalidate coalesced requests
       elts(i).zipWithIndex.foreach { case (elt, j) =>
         if (j == entries - 1) { // tail
           elt.valid := false.B
         } else {
           elt.bits := elts(i)(j + 1).bits
-          elt.valid := elts(i)(j + 1).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
+          elt.valid := elts(i)(
+            j + 1
+          ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
         }
       }
       // reset dequeue mask when new entries are shifted in
       deqDone(i) := false.B
       // enqueue
-      when (enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
+      when(enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
         elts(i)(writePtr(i) - 1.U).bits := enq.bits
         elts(i)(writePtr(i) - 1.U).valid := enq.valid
       }.otherwise {
@@ -334,13 +352,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
       }
     }.otherwise {
       // invalidate coalesced requests
-      when (io.invalidate.valid) {
+      when(io.invalidate.valid) {
         (elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) =>
           elt.valid := elt.valid && !inv
         }
       }
       // enqueue
-      when (enq.ready && syncedEnqValid) {
+      when(enq.ready && syncedEnqValid) {
         elts(i)(writePtr(i)).bits := enq.bits
         elts(i)(writePtr(i)).valid := enq.valid
         writePtr(i) := writePtr(i) + 1.U
@@ -352,8 +370,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
   // When doing spatial-only coalescing, queues should never drift from each
   // other, i.e. the queue heads should always contain mem requests from the
   // same instruction.
-  val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
-    writePtr.map(_ === writePtr.head).reduce(_ && _)
+  val queueInSync =
+    controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
+      writePtr.map(_ === writePtr.head).reduce(_ && _)
   assert(queueInSync, "shift queue lanes are not in sync")
 
   io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt)
@@ -361,8 +380,11 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 }
 
 // Software model: coalescer.py
-class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedRequest],
-                    config: CoalescerConfig) extends Module {
+class MonoCoalescer(
+    coalLogSize: Int,
+    windowT: CoalShiftQueue[NonCoalescedRequest],
+    config: CoalescerConfig
+) extends Module {
   val io = IO(new Bundle {
     val window = Input(windowT.io.cloneType)
     val results = Output(new Bundle {
@@ -371,8 +393,10 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
       val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
       // number of entries matched with this leader lane's head.
       // maximum is numLanes * queueDepth
-      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
-      val coverageHits = Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
+      val matchCount =
+        Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
+      val coverageHits =
+        Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
       val canCoalesce = Output(Vec(config.numLanes, Bool()))
     })
   })
@@ -386,9 +410,13 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
   val leadersValid = io.window.mask.map(_.asBools.head)
 
   def printQueueHeads = {
-    leaders.zipWithIndex.foreach{ case (head, i) =>
-      printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
-        leadersValid(i), head.source, head.address)
+    leaders.zipWithIndex.foreach { case (head, i) =>
+      printf(
+        s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
+        leadersValid(i),
+        head.source,
+        head.address
+      )
     }
   }
   // when (leadersValid.reduce(_ || _)) {
@@ -406,34 +434,42 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
   // Gives a 2-D table of Bools representing match at every queue entry,
   // for each lane (so 3-D in total).
   // dimensions: (leader lane, follower lane, follower entry)
-  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
-    (io.window.elts zip io.window.mask).map { case (followers, followerValids) =>
-      // compare leader's head against follower's every queue entry
-      (followers zip followerValids.asBools).map { case (follower, followerValid) =>
-        canMatch(follower, followerValid, leader, leaderValid)
-        // FIXME: disabling halving optimization because it does not give the
-        // correct per-lane coalescable indication to the shift queue
-          // // match leader to only followers at lanes >= leader idx
-          // // this halves the number of comparators
-          // if (followerIndex < leaderIndex) false.B
-          // else canMatch(follower, followerValid, leader, leaderValid)
+  val matchTablePerLane = (leaders zip leadersValid).map {
+    case (leader, leaderValid) =>
+      (io.window.elts zip io.window.mask).map {
+        case (followers, followerValids) =>
+          // compare leader's head against follower's every queue entry
+          (followers zip followerValids.asBools).map {
+            case (follower, followerValid) =>
+              canMatch(follower, followerValid, leader, leaderValid)
+            // FIXME: disabling halving optimization because it does not give the
+            // correct per-lane coalescable indication to the shift queue
+            // // match leader to only followers at lanes >= leader idx
+            // // this halves the number of comparators
+            // if (followerIndex < leaderIndex) false.B
+            // else canMatch(follower, followerValid, leader, leaderValid)
+          }
       }
-    }
   }
 
   val matchCounts = matchTablePerLane.map(table =>
-      table.map(PopCount(_)) // sum up each column
-           .reduce(_ +& _))
+    table
+      .map(PopCount(_)) // sum up each column
+      .reduce(_ +& _)
+  )
   val canCoalesce = matchCounts.map(_ > 1.U)
 
   // Elect the leader that has the most match counts.
   // TODO: potentially expensive: magnitude comparator
   def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
-    matchCounts.zipWithIndex.map {
-      case (c, i) => (c, i.U)
-    }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
+    matchCounts.zipWithIndex
+      .map { case (c, i) =>
+        (c, i.U)
+      }
+      .reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
         (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
-    }._2
+      }
+      ._2
   }
   // Elect leader by choosing the smallest-index lane that has a valid
   // match, i.e. using priority encoder.
@@ -444,7 +480,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
 
   val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
   // matchTable for the chosen lane, but converted to a Vec[UInt]
-  val chosenMatches = VecInit(matchTablePerLane.map{ table =>
+  val chosenMatches = VecInit(matchTablePerLane.map { table =>
     VecInit(table.map(VecInit(_).asUInt))
   })(chosenLeaderIdx)
   val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)
@@ -452,18 +488,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
   // coverage calculation
   def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth)
   // 2-D table flattened to 1-D
-  val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
+  val offsets =
+    io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
   val valids = chosenMatches.flatMap(_.asBools)
   // indicates for each word in the coalesced chunk whether it is accessed by
   // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
   // words in the coalesced data coming back will be accessed by some request
   // and we've reached 100% bandwidth utilization.
   val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { target =>
-    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+    (offsets zip valids)
+      .map { case (offset, valid) => valid && (offset === target.U) }
+      .reduce(_ || _)
   }
 
   // debug prints
-  when (leadersValid.reduce(_ || _)) {
+  when(leadersValid.reduce(_ || _)) {
     matchCounts.zipWithIndex.foreach { case (count, i) =>
       printf(s"lane[${i}] matchCount = %d\n", count);
     }
@@ -492,20 +531,26 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
 // coalesced request out of all possible combinations.
 //
 // Software model: coalescer.py
-class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Request,
-                     config: CoalescerConfig) extends Module {
+class MultiCoalescer(
+    windowT: CoalShiftQueue[NonCoalescedRequest],
+    coalReqT: Request,
+    config: CoalescerConfig
+) extends Module {
   val io = IO(new Bundle {
     // coalescing window, connected to the contents of the request queues
     val window = Input(windowT.io.cloneType)
     // generated coalesced request
     val coalReq = DecoupledIO(coalReqT.cloneType)
     // invalidate signals going into each request queue's head
-    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
+    val invalidate =
+      Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
     // whether a lane is coalescable
     val coalescable = Output(Vec(config.numLanes, Bool()))
   })
 
-  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  val coalescers = config.coalLogSizes.map(size =>
+    Module(new MonoCoalescer(size, windowT, config))
+  )
   coalescers.foreach(_.io.window := io.window)
 
   def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
@@ -530,9 +575,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
   val chosenValid = Wire(Bool())
   // minimum 25% coverage
-  val minCoverage = 1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
+  val minCoverage =
+    1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
 
-  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
+  when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
     chosenSizeIdx := argMax(normalizedHits)
     chosenValid := true.B
     printf("coalescing success by coverage policy\n")
@@ -562,9 +608,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)
 
   // check for word alignment in addresses
-  assert(io.window.elts.flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)).zip(
-    io.window.mask.flatMap(_.asBools)).map { case (aligned, valid) => (!valid) || aligned }.reduce(_ || _),
-    "one or more addresses used for coalescing is not word-aligned")
+  assert(
+    io.window.elts
+      .flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U))
+      .zip(io.window.mask.flatMap(_.asBools))
+      .map { case (aligned, valid) => (!valid) || aligned }
+      .reduce(_ && _), // every valid entry must be aligned, so AND the flags
+    "one or more addresses used for coalescing is not word-aligned"
+  )
 
   // note: this is word-level coalescing. if finer granularity is needed, need to modify code
   val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt
@@ -579,18 +630,29 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
     val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
       // note: ANDing against addrMask is to conform to active byte lanes requirements
       // if aligning to LSB suffices, we should add the bitwise AND back
-      m && ((req.address(config.maxCoalLogSize - 1, config.wordSizeWidth)/* & addrMask*/) === i.U)
+      m && ((req.address(
+        config.maxCoalLogSize - 1,
+        config.wordSizeWidth
+      ) /* & addrMask*/ ) === i.U)
     }
     // TODO: SW uses priority encoder, not sure about behavior of MuxCase
-    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.data
-    })
-    mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.mask
-    })
+    data(i) := MuxCase(
+      DontCare,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.data
+      }
+    )
+    mask(i) := MuxCase(
+      0.U,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.mask
+      }
+    )
   }
 
-  val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds)))
+  val sourceGen = Module(
+    new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds))
+  )
   sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created
   sourceGen.io.reclaim.valid := false.B // not used
   sourceGen.io.reclaim.bits := DontCare // not used
@@ -608,7 +670,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   io.invalidate.bits := chosenBundle.matchOH
   io.invalidate.valid := io.coalReq.fire // invalidate only when fire
 
-  io.coalescable := coalescers.map(_.io.results.canCoalesce.asUInt).reduce(_ | _).asBools
+  io.coalescable := coalescers
+    .map(_.io.results.canCoalesce.asUInt)
+    .reduce(_ | _)
+    .asBools
 
   dontTouch(io.invalidate) // debug
 
@@ -620,21 +685,30 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
   if (!config.enable) disable
 }
 
-class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) {
-  require(outer.cpuNode.in.length == config.numLanes,
+class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
+    extends LazyModuleImp(outer) {
+  require(
+    outer.cpuNode.in.length == config.numLanes,
     s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
-    s"config.numLanes (${config.numLanes})")
-  require(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
+      s"config.numLanes (${config.numLanes})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
     s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
-    s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})")
-  require(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
+      s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
     s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
-    s"mismatch with config.addressWidth (${config.addressWidth})")
+      s"mismatch with config.addressWidth (${config.addressWidth})"
+  )
 
   val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
   // note we are using word size. assuming all coalescer inputs are word sized
   val reqQueueEntryT = new NonCoalescedRequest(config)
-  val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
+  val reqQueues = Module(
+    new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config)
+  )
 
   val coalReqT = new CoalescedRequest(config)
   val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
@@ -710,7 +784,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
   tlCoal.e.valid := false.B
 
-
   // ===========================================================================
   // Response flow
   // ===========================================================================
@@ -723,8 +796,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   val numPerLaneReqs = config.queueDepth
 
   // FIXME: no need to contain maxCoalLogSize data
-  val respQueueEntryT = new Response(oldSourceWidth, log2Ceil(config.maxCoalLogSize),
-    (1 << config.maxCoalLogSize) * 8)
+  val respQueueEntryT = new Response(
+    oldSourceWidth,
+    log2Ceil(config.maxCoalLogSize),
+    (1 << config.maxCoalLogSize) * 8
+  )
   val respQueues = Seq.tabulate(config.numLanes) { _ =>
     Module(
       new MultiPortQueue(
@@ -810,12 +886,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   val newEntry = Wire(uncoalescer.inflightTable.entryT)
   newEntry.source := coalescer.io.coalReq.bits.source
 
-  assert (config.maxCoalLogSize <= config.dataBusWidth,
-    "multi-beat coalesced reads/writes are currently not supported")
-  assert (
+  assert(
+    config.maxCoalLogSize <= config.dataBusWidth,
+    "multi-beat coalesced reads/writes are currently not supported"
+  )
+  assert(
     tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
     s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
-    + s" (${(1 << config.dataBusWidth) * 8})"
+      + s" (${(1 << config.dataBusWidth) * 8})"
   )
   val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
   // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
@@ -825,8 +903,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
       (laneEntry.reqs zip laneInv.asBools).zipWithIndex
         .foreach { case ((reqEntry, inv), i) =>
           val req = reqQueues.io.elts(lane)(i)
-          when ((coalescer.io.invalidate.valid && inv)) {
-            printf(s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source)
+          when((coalescer.io.invalidate.valid && inv)) {
+            printf(
+              s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
+              req.source
+            )
           }
           reqEntry.valid := (coalescer.io.invalidate.valid && inv)
           reqEntry.source := req.source
@@ -845,22 +926,23 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   tlCoal.d.ready := uncoalescer.io.coalResp.ready
 
   // Connect uncoalescer results back into each lane's response queue
-  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) =>
-    perLaneResps.zipWithIndex.foreach { case (resp, i) =>
-      // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
-      // cache.  This should ideally not happen though.
-      assert(
-        q.io.enq(respQueueUncoalPortOffset + i).ready,
-        s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
-      )
-      q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
-      q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
+  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach {
+    case ((q, perLaneResps), lane) =>
+      perLaneResps.zipWithIndex.foreach { case (resp, i) =>
+        // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
+        // cache.  This should ideally not happen though.
+        assert(
+          q.io.enq(respQueueUncoalPortOffset + i).ready,
+          s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
+        )
+        q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
+        q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
       // debug
       // when (resp.valid) {
       //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
       // }
       // dontTouch(q.io.enq(respQueueCoalPortOffset))
-    }
+      }
   }
 
   // Debug
@@ -972,7 +1054,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module {
 // split the coalesced response back to individual per-lane responses with the
 // right metadata.
 class InflightCoalReqTable(config: CoalescerConfig) extends Module {
-  val offsetBits = config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
+  val offsetBits =
+    config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
   val entryT = new InflightCoalReqTableEntry(
     config.numLanes,
     config.queueDepth,
@@ -1019,7 +1102,7 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module {
   }
 
   val full = Wire(Bool())
-  full := (0 until entries).map( table(_).valid ).reduce( _ && _ )
+  full := (0 until entries).map(table(_).valid).reduce(_ && _)
   assert(!full, "inflight table is full and blocking coalescer")
   dontTouch(full)
 
@@ -1094,8 +1177,12 @@ object TLUtils {
 // `traceHasSource` is true if the input trace file has an additional source
 // ID column.  This is useful for using the output trace file genereated by
 // MemTraceLogger as the driver.
-class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: Boolean = false)
-  (implicit p: Parameters) extends LazyModule {
+class MemTraceDriver(
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean = false
+)(implicit p: Parameters)
+    extends LazyModule {
   // Create N client nodes together
   val laneNodes = Seq.tabulate(config.numLanes) { i =>
     val clientParam = Seq(
@@ -1113,7 +1200,8 @@ class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource:
   val node = TLIdentityNode()
   laneNodes.foreach { l => node := l }
 
-  lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource)
+  lazy val module =
+    new MemTraceDriverImp(this, config, filename, traceHasSource)
 }
 
 trait HasTraceLine {
@@ -1136,9 +1224,12 @@ class TraceLine extends Bundle with HasTraceLine {
   val data = UInt(64.W)
 }
 
-class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename: String,
-  traceHasSource: Boolean)
-    extends LazyModuleImp(outer)
+class MemTraceDriverImp(
+    outer: MemTraceDriver,
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean
+) extends LazyModuleImp(outer)
     with UnitTestModule {
   // Current cycle mark to read from trace
   val traceReadCycle = RegInit(1.U(64.W))
@@ -1176,7 +1267,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
 
   // Not all fire because trace cycle has to advance even when there is no valid
   // line in the trace.
-  when (reqQueueAllReady){
+  when(reqQueueAllReady) {
     traceReadCycle := traceReadCycle + 1.U
   }
 
@@ -1216,11 +1307,16 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
     sizeInBytes := (1.U) << req.size
     mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
     wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
-    val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
+    val wordAlignedAddress =
+      req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
     val wordAlignedSize = Mux(subword, 2.U, req.size)
 
-    val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numOldSrcIds),
-      ignoreInUse = false))
+    val sourceGen = Module(
+      new RoundRobinSourceGenerator(
+        log2Ceil(config.numOldSrcIds),
+        ignoreInUse = false
+      )
+    )
     sourceGen.io.gen := reqQ.io.deq.fire
     // assert(sourceGen.io.id.valid)
 
@@ -1229,7 +1325,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
       toAddress = hashToValidPhyAddr(wordAlignedAddress),
       lgSize = wordAlignedSize, // trace line already holds log2(size)
       // data should be aligned to beatBytes
-      data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
+      data =
+        (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
     )
     val (glegal, gbits) = edge.Get(
       fromSource = sourceGen.io.id.bits,
@@ -1240,7 +1337,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
     val bits = Mux(req.is_store, pbits, gbits)
 
     tlOut.a.valid := (reqQ.io.deq.valid && sourceGen.io.id.valid)
-    when (tlOut.a.valid) {
+    when(tlOut.a.valid) {
       assert(legal, "illegal TL req gen")
     }
     tlOut.a.bits := bits
@@ -1288,9 +1385,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
 
 class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
     extends BlackBox(
-      Map("FILENAME" -> filename,
-          "NUM_LANES" -> numLanes,
-          "HAS_SOURCE" -> (if (traceHasSource) 1 else 0))
+      Map(
+        "FILENAME" -> filename,
+        "NUM_LANES" -> numLanes,
+        "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)
+      )
     )
     with HasBlackBoxResource {
   val traceLineT = new TraceLine
@@ -1304,19 +1403,20 @@ class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
 
     // These names have to match declarations in the Verilog code, eg.
     // trace_read_address.
-    val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source
-      val ready = Input(Bool())
-      val valid = Output(UInt(numLanes.W))
-      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
-      // single wide 1D array.
-      // TODO: assumes 64-bit address.
-      val cycle = Input(UInt(64.W))
-      val address = Output(UInt((addrW * numLanes).W))
-      val is_store = Output(UInt(numLanes.W))
-      val size = Output(UInt((sizeW * numLanes).W))
-      val data = Output(UInt((dataW * numLanes).W))
-      val finished = Output(Bool())
-    }
+    val trace_read =
+      new Bundle { // can't use HasTraceLine because this doesn't have source
+        val ready = Input(Bool())
+        val valid = Output(UInt(numLanes.W))
+        // Chisel can't interface with Verilog 2D port, so flatten all lanes into
+        // single wide 1D array.
+        // TODO: assumes 64-bit address.
+        val cycle = Input(UInt(64.W))
+        val address = Output(UInt((addrW * numLanes).W))
+        val is_store = Output(UInt(numLanes.W))
+        val size = Output(UInt((sizeW * numLanes).W))
+        val data = Output(UInt((dataW * numLanes).W))
+        val finished = Output(Bool())
+      }
   })
 
   addResource("/vsrc/SimMemTrace.v")
@@ -1443,11 +1543,11 @@ class MemTraceLogger(
 
         // This assert only holds true for PutFullData and not PutPartialData,
         // where HIGH bits in the mask may not be contiguous.
-        when (tlIn.a.valid) {
+        when(tlIn.a.valid) {
           assert(
             PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
             "mask HIGH popcount do not match the TL size. " +
-            "Partial masks are not allowed for PutFull"
+              "Partial masks are not allowed for PutFull"
           )
         }
         val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
@@ -1476,17 +1576,25 @@ class MemTraceLogger(
 
     // stats
     val numReqsThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
     val numRespsThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
     val reqBytesThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneReqs
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
     val respBytesThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneResps
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
     numReqs := numReqs + numReqsThisCycle
     numResps := numResps + numRespsThisCycle
     reqBytes := reqBytes + reqBytesThisCycle
@@ -1496,7 +1604,10 @@ class MemTraceLogger(
     //
     // This is a clunky workaround of the fact that Chisel doesn't allow partial
     // assignment to a bitfield range of a wide signal.
-    def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
+    def flattenTrace(
+        simIO: Bundle with HasTraceLine,
+        perLane: Vec[TraceLine]
+    ) = {
       // these will get optimized out
       val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
       val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
@@ -1592,8 +1703,14 @@ object TLPrintf {
       tlData: UInt,
       reqData: UInt
   ) = {
-    printf(s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
-      source, address, size, mask, is_store)
+    printf(
+      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
+      source,
+      address,
+      size,
+      mask,
+      is_store
+    )
     when(is_store) {
       printf(", tlData=%x, reqData=%x", tlData, reqData)
     }
@@ -1604,7 +1721,7 @@ object TLPrintf {
 // Synthesizable unit tests
 
 class DummyDriver(config: CoalescerConfig)(implicit p: Parameters)
-  extends LazyModule {
+    extends LazyModule {
   val laneNodes = Seq.tabulate(config.numLanes) { i =>
     val clientParam = Seq(
       TLMasterParameters.v1(
@@ -1640,7 +1757,10 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
     // generate dummy traffic to coalescer to prevent it from being optimized
     // out during synthesis
     val address = Wire(UInt(config.addressWidth.W))
-    address := Cat((finishCounter + (lane.U % 3.U)), 0.U(config.wordSizeWidth.W))
+    address := Cat(
+      (finishCounter + (lane.U % 3.U)),
+      0.U(config.wordSizeWidth.W)
+    )
     val (tl, edge) = node.out(0)
     val (legal, bits) = edge.Put(
       fromSource = sourceIdCounter,
@@ -1657,11 +1777,13 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
     tl.e.valid := false.B
   }
 
-  val dataSum = outer.laneNodes.map { node =>
-    val tl = node.out(0)._1
-    val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
-    data
-  }.reduce (_ +& _)
+  val dataSum = outer.laneNodes
+    .map { node =>
+      val tl = node.out(0)._1
+      val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
+      data
+    }
+    .reduce(_ +& _)
   // this doesn't make much sense, but it prevents the entire uncoalescer from
   // being optimized away
   finishCounter := finishCounter + dataSum
@@ -1680,8 +1802,10 @@ class DummyCoalescer(implicit p: Parameters) extends LazyModule {
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
     )
   )
 
@@ -1704,7 +1828,8 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
 }
 
 // tracedriver --> coalescer --> tracelogger --> tlram
-class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
+class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
+    extends LazyModule {
   val numLanes = p(SIMTCoreKey).get.nLanes
   val config = defaultConfig.copy(numLanes = numLanes)
 
@@ -1713,14 +1838,18 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
     new MemTraceLogger(numLanes, filename, loggerName = "coreside")
   )
   val coal = LazyModule(new CoalescingUnit(config))
-  val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside"))
+  val memSideLogger = LazyModule(
+    new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")
+  )
   val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
     LazyModule(
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
     )
   )
 
@@ -1751,8 +1880,9 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
   }
 }
 
-class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
-    extends UnitTest(timeout) {
+class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit
+    p: Parameters
+) extends UnitTest(timeout) {
   val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
@@ -1770,8 +1900,10 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
       // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
       // edges globally, by way of Diplomacy communicating the TL slave
       // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << defaultConfig.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << defaultConfig.dataBusWidth)
+      )
     )
   )
 
@@ -1785,13 +1917,13 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
   }
 }
 
-class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) {
+class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
+    extends UnitTest(timeout) {
   val dut = Module(LazyModule(new TLRAMCoalescer).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
 }
 
-
 ////////////
 ////////////
 ////////////
@@ -1941,11 +2073,3 @@ class CoalescerXbarImpl(outer: CoalescerXbar,
 
 
   }
-
-
-
-
-
-
-
-