scalafmt

2023-05-11 16:11:39 -07:00
parent 7fa6be4a8b
commit 0c8909cb43
1 changed files with 302 additions and 178 deletions
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -152,7 +152,7 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size,
-      data = this.data,
+      data = this.data
    )
    val (glegal, gbits) = edgeOut.Get(
      fromSource = this.source,
@@ -166,17 +166,22 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In
  }
 }
 case class NonCoalescedRequest(config: CoalescerConfig)
-extends Request(sourceWidth = log2Ceil(config.numOldSrcIds),
-  sizeWidth = config.wordSizeWidth,
-  addressWidth = config.addressWidth,
-  dataWidth = config.wordSizeInBytes * 8)
+    extends Request(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      addressWidth = config.addressWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
 case class CoalescedRequest(config: CoalescerConfig)
-extends Request(sourceWidth = log2Ceil(config.numNewSrcIds),
-  sizeWidth = log2Ceil(config.maxCoalLogSize),
-  addressWidth = config.addressWidth,
-  dataWidth = (8 * (1 << config.maxCoalLogSize)))
+    extends Request(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      addressWidth = config.addressWidth,
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )

-class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
+class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int)
+    extends Bundle {
  val op = UInt(1.W) // 0=READ 1=WRITE
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
@@ -205,17 +210,22 @@ class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle
  }
 }
 case class NonCoalescedResponse(config: CoalescerConfig)
-extends Response(sourceWidth = log2Ceil(config.numOldSrcIds),
-  sizeWidth = config.wordSizeWidth,
-  dataWidth = config.wordSizeInBytes * 8)
+    extends Response(
+      sourceWidth = log2Ceil(config.numOldSrcIds),
+      sizeWidth = config.wordSizeWidth,
+      dataWidth = config.wordSizeInBytes * 8
+    )
 case class CoalescedResponse(config: CoalescerConfig)
-extends Response(sourceWidth = log2Ceil(config.numNewSrcIds),
-  sizeWidth = log2Ceil(config.maxCoalLogSize),
-  dataWidth = (8 * (1 << config.maxCoalLogSize)))
+    extends Response(
+      sourceWidth = log2Ceil(config.numNewSrcIds),
+      sizeWidth = log2Ceil(config.maxCoalLogSize),
+      dataWidth = (8 * (1 << config.maxCoalLogSize))
+    )

 // If `ignoreInUse`, just keep giving out new IDs without checking if it is in
 // use.
-class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
+class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
+    extends Module {
  val io = IO(new Bundle {
    val gen = Input(Bool())
    val reclaim = Input(Valid(UInt(sourceWidth.W)))
@@ -234,15 +244,16 @@ class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) e

  io.id.valid := (if (ignoreInUse) true.B else !occupancyTable(head).valid)
  io.id.bits := head
-  when (io.gen && io.id.valid /* fire */) {
+  when(io.gen && io.id.valid /* fire */ ) {
    occupancyTable(io.id.bits).valid := true.B // mark in use
  }
-  when (io.reclaim.valid) {
+  when(io.reclaim.valid) {
    occupancyTable(io.reclaim.bits).valid := false.B // mark freed
  }
 }

-class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module {
+class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
+    extends Module {
  val io = IO(new Bundle {
    val queue = new Bundle {
      val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
@@ -259,7 +270,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 //  eltPrototype.valid := false.B

  val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
-  val writePtr = RegInit(VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W))))
+  val writePtr = RegInit(
+    VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
+  )
  val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))

  private def resetElts = {
@@ -270,7 +283,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
      }
    }
  }
-  when (reset.asBool) {
+  when(reset.asBool) {
    resetElts
  }

@@ -286,14 +299,17 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
  // current cycle.
  //
  // shift hint is when the heads have no more coalescable left this or next cycle
-  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
-    c && !(io.invalidate.valid && inv)
-  }.reduce(_ || _)
+  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0)))
+    .map { case (c, inv) =>
+      c && !(io.invalidate.valid && inv)
+    }
+    .reduce(_ || _)
  val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
  // valid && !fire means we enable enqueueing to a full queue, provided the
  // arbiter is taking away all remaining valid queue heads in the next cycle so
  // that we make space for the entire next warp.
-  val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
+  val syncedDeqValidNextCycle =
+    io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)

  for (i <- 0 until config.numLanes) {
    val enq = io.queue.enq(i)
@@ -313,20 +329,22 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
    // can take new entries if not empty, or if full but shifting
    enq.ready := (!ctrl.full) || ctrl.shift

-    when (ctrl.shift) {
+    when(ctrl.shift) {
      // shift, invalidate tail, invalidate coalesced requests
      elts(i).zipWithIndex.foreach { case (elt, j) =>
        if (j == entries - 1) { // tail
          elt.valid := false.B
        } else {
          elt.bits := elts(i)(j + 1).bits
-          elt.valid := elts(i)(j + 1).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
+          elt.valid := elts(i)(
+            j + 1
+          ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
        }
      }
      // reset dequeue mask when new entries are shifted in
      deqDone(i) := false.B
      // enqueue
-      when (enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
+      when(enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
        elts(i)(writePtr(i) - 1.U).bits := enq.bits
        elts(i)(writePtr(i) - 1.U).valid := enq.valid
      }.otherwise {
@@ -334,13 +352,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
      }
    }.otherwise {
      // invalidate coalesced requests
-      when (io.invalidate.valid) {
+      when(io.invalidate.valid) {
        (elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) =>
          elt.valid := elt.valid && !inv
        }
      }
      // enqueue
-      when (enq.ready && syncedEnqValid) {
+      when(enq.ready && syncedEnqValid) {
        elts(i)(writePtr(i)).bits := enq.bits
        elts(i)(writePtr(i)).valid := enq.valid
        writePtr(i) := writePtr(i) + 1.U
@@ -352,8 +370,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
  // When doing spatial-only coalescing, queues should never drift from each
  // other, i.e. the queue heads should always contain mem requests from the
  // same instruction.
-  val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
-    writePtr.map(_ === writePtr.head).reduce(_ && _)
+  val queueInSync =
+    controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
+      writePtr.map(_ === writePtr.head).reduce(_ && _)
  assert(queueInSync, "shift queue lanes are not in sync")

  io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt)
@@ -361,8 +380,11 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
 }

 // Software model: coalescer.py
-class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedRequest],
-                    config: CoalescerConfig) extends Module {
+class MonoCoalescer(
+    coalLogSize: Int,
+    windowT: CoalShiftQueue[NonCoalescedRequest],
+    config: CoalescerConfig
+) extends Module {
  val io = IO(new Bundle {
    val window = Input(windowT.io.cloneType)
    val results = Output(new Bundle {
@@ -371,8 +393,10 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
      val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
      // number of entries matched with this leader lane's head.
      // maximum is numLanes * queueDepth
-      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
-      val coverageHits = Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
+      val matchCount =
+        Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
+      val coverageHits =
+        Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
      val canCoalesce = Output(Vec(config.numLanes, Bool()))
    })
  })
@@ -386,9 +410,13 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
  val leadersValid = io.window.mask.map(_.asBools.head)

  def printQueueHeads = {
-    leaders.zipWithIndex.foreach{ case (head, i) =>
-      printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
-        leadersValid(i), head.source, head.address)
+    leaders.zipWithIndex.foreach { case (head, i) =>
+      printf(
+        s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
+        leadersValid(i),
+        head.source,
+        head.address
+      )
    }
  }
  // when (leadersValid.reduce(_ || _)) {
@@ -406,34 +434,42 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
  // Gives a 2-D table of Bools representing match at every queue entry,
  // for each lane (so 3-D in total).
  // dimensions: (leader lane, follower lane, follower entry)
-  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
-    (io.window.elts zip io.window.mask).map { case (followers, followerValids) =>
-      // compare leader's head against follower's every queue entry
-      (followers zip followerValids.asBools).map { case (follower, followerValid) =>
-        canMatch(follower, followerValid, leader, leaderValid)
-        // FIXME: disabling halving optimization because it does not give the
-        // correct per-lane coalescable indication to the shift queue
-          // // match leader to only followers at lanes >= leader idx
-          // // this halves the number of comparators
-          // if (followerIndex < leaderIndex) false.B
-          // else canMatch(follower, followerValid, leader, leaderValid)
+  val matchTablePerLane = (leaders zip leadersValid).map {
+    case (leader, leaderValid) =>
+      (io.window.elts zip io.window.mask).map {
+        case (followers, followerValids) =>
+          // compare leader's head against follower's every queue entry
+          (followers zip followerValids.asBools).map {
+            case (follower, followerValid) =>
+              canMatch(follower, followerValid, leader, leaderValid)
+            // FIXME: disabling halving optimization because it does not give the
+            // correct per-lane coalescable indication to the shift queue
+            // // match leader to only followers at lanes >= leader idx
+            // // this halves the number of comparators
+            // if (followerIndex < leaderIndex) false.B
+            // else canMatch(follower, followerValid, leader, leaderValid)
+          }
      }
-    }
  }

  val matchCounts = matchTablePerLane.map(table =>
-      table.map(PopCount(_)) // sum up each column
-           .reduce(_ +& _))
+    table
+      .map(PopCount(_)) // sum up each column
+      .reduce(_ +& _)
+  )
  val canCoalesce = matchCounts.map(_ > 1.U)

  // Elect the leader that has the most match counts.
  // TODO: potentially expensive: magnitude comparator
  def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
-    matchCounts.zipWithIndex.map {
-      case (c, i) => (c, i.U)
-    }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
+    matchCounts.zipWithIndex
+      .map { case (c, i) =>
+        (c, i.U)
+      }
+      .reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
        (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
-    }._2
+      }
+      ._2
  }
  // Elect leader by choosing the smallest-index lane that has a valid
  // match, i.e. using priority encoder.
@@ -444,7 +480,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques

  val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
  // matchTable for the chosen lane, but converted to a Vec[UInt]
-  val chosenMatches = VecInit(matchTablePerLane.map{ table =>
+  val chosenMatches = VecInit(matchTablePerLane.map { table =>
    VecInit(table.map(VecInit(_).asUInt))
  })(chosenLeaderIdx)
  val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)
@@ -452,18 +488,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
  // coverage calculation
  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth)
  // 2-D table flattened to 1-D
-  val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
+  val offsets =
+    io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
  val valids = chosenMatches.flatMap(_.asBools)
  // indicates for each word in the coalesced chunk whether it is accessed by
  // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
  // words in the coalesced data coming back will be accessed by some request
  // and we've reached 100% bandwidth utilization.
  val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { target =>
-    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+    (offsets zip valids)
+      .map { case (offset, valid) => valid && (offset === target.U) }
+      .reduce(_ || _)
  }

  // debug prints
-  when (leadersValid.reduce(_ || _)) {
+  when(leadersValid.reduce(_ || _)) {
    matchCounts.zipWithIndex.foreach { case (count, i) =>
      printf(s"lane[${i}] matchCount = %d\n", count);
    }
@@ -492,20 +531,26 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques
 // coalesced request out of all possible combinations.
 //
 // Software model: coalescer.py
-class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Request,
-                     config: CoalescerConfig) extends Module {
+class MultiCoalescer(
+    windowT: CoalShiftQueue[NonCoalescedRequest],
+    coalReqT: Request,
+    config: CoalescerConfig
+) extends Module {
  val io = IO(new Bundle {
    // coalescing window, connected to the contents of the request queues
    val window = Input(windowT.io.cloneType)
    // generated coalesced request
    val coalReq = DecoupledIO(coalReqT.cloneType)
    // invalidate signals going into each request queue's head
-    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
+    val invalidate =
+      Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
    // whether a lane is coalescable
    val coalescable = Output(Vec(config.numLanes, Bool()))
  })

-  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  val coalescers = config.coalLogSizes.map(size =>
+    Module(new MonoCoalescer(size, windowT, config))
+  )
  coalescers.foreach(_.io.window := io.window)

  def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
@@ -530,9 +575,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
  val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
  val chosenValid = Wire(Bool())
  // minimum 25% coverage
-  val minCoverage = 1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))
+  val minCoverage =
+    1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))

-  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
+  when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
    chosenSizeIdx := argMax(normalizedHits)
    chosenValid := true.B
    printf("coalescing success by coverage policy\n")
@@ -562,9 +608,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
  val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)

  // check for word alignment in addresses
-  assert(io.window.elts.flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)).zip(
-    io.window.mask.flatMap(_.asBools)).map { case (aligned, valid) => (!valid) || aligned }.reduce(_ || _),
-    "one or more addresses used for coalescing is not word-aligned")
+  assert(
+    io.window.elts
+      .flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U))
+      .zip(io.window.mask.flatMap(_.asBools))
+      .map { case (aligned, valid) => (!valid) || aligned }
+      .reduce(_ || _),
+    "one or more addresses used for coalescing is not word-aligned"
+  )

  // note: this is word-level coalescing. if finer granularity is needed, need to modify code
  val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt
@@ -579,18 +630,29 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
    val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
      // note: ANDing against addrMask is to conform to active byte lanes requirements
      // if aligning to LSB suffices, we should add the bitwise AND back
-      m && ((req.address(config.maxCoalLogSize - 1, config.wordSizeWidth)/* & addrMask*/) === i.U)
+      m && ((req.address(
+        config.maxCoalLogSize - 1,
+        config.wordSizeWidth
+      ) /* & addrMask*/ ) === i.U)
    }
    // TODO: SW uses priority encoder, not sure about behavior of MuxCase
-    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.data
-    })
-    mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) =>
-      s -> req.mask
-    })
+    data(i) := MuxCase(
+      DontCare,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.data
+      }
+    )
+    mask(i) := MuxCase(
+      0.U,
+      flatReqs.zip(sel).map { case (req, s) =>
+        s -> req.mask
+      }
+    )
  }

-  val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds)))
+  val sourceGen = Module(
+    new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds))
+  )
  sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created
  sourceGen.io.reclaim.valid := false.B // not used
  sourceGen.io.reclaim.bits := DontCare // not used
@@ -608,7 +670,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
  io.invalidate.bits := chosenBundle.matchOH
  io.invalidate.valid := io.coalReq.fire // invalidate only when fire

-  io.coalescable := coalescers.map(_.io.results.canCoalesce.asUInt).reduce(_ | _).asBools
+  io.coalescable := coalescers
+    .map(_.io.results.canCoalesce.asUInt)
+    .reduce(_ | _)
+    .asBools

  dontTouch(io.invalidate) // debug

@@ -620,21 +685,30 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req
  if (!config.enable) disable
 }

-class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) {
-  require(outer.cpuNode.in.length == config.numLanes,
+class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
+    extends LazyModuleImp(outer) {
+  require(
+    outer.cpuNode.in.length == config.numLanes,
    s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
-    s"config.numLanes (${config.numLanes})")
-  require(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
+      s"config.numLanes (${config.numLanes})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
    s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
-    s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})")
-  require(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
+      s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})"
+  )
+  require(
+    outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
    s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
-    s"mismatch with config.addressWidth (${config.addressWidth})")
+      s"mismatch with config.addressWidth (${config.addressWidth})"
+  )

  val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
  // note we are using word size. assuming all coalescer inputs are word sized
  val reqQueueEntryT = new NonCoalescedRequest(config)
-  val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
+  val reqQueues = Module(
+    new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config)
+  )

  val coalReqT = new CoalescedRequest(config)
  val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
@@ -710,7 +784,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
  tlCoal.e.valid := false.B

-
  // ===========================================================================
  // Response flow
  // ===========================================================================
@@ -723,8 +796,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  val numPerLaneReqs = config.queueDepth

  // FIXME: no need to contain maxCoalLogSize data
-  val respQueueEntryT = new Response(oldSourceWidth, log2Ceil(config.maxCoalLogSize),
-    (1 << config.maxCoalLogSize) * 8)
+  val respQueueEntryT = new Response(
+    oldSourceWidth,
+    log2Ceil(config.maxCoalLogSize),
+    (1 << config.maxCoalLogSize) * 8
+  )
  val respQueues = Seq.tabulate(config.numLanes) { _ =>
    Module(
      new MultiPortQueue(
@@ -810,12 +886,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  val newEntry = Wire(uncoalescer.inflightTable.entryT)
  newEntry.source := coalescer.io.coalReq.bits.source

-  assert (config.maxCoalLogSize <= config.dataBusWidth,
-    "multi-beat coalesced reads/writes are currently not supported")
-  assert (
+  assert(
+    config.maxCoalLogSize <= config.dataBusWidth,
+    "multi-beat coalesced reads/writes are currently not supported"
+  )
+  assert(
    tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
    s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
-    + s" (${(1 << config.dataBusWidth) * 8})"
+      + s" (${(1 << config.dataBusWidth) * 8})"
  )
  val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
  // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
@@ -825,8 +903,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
      (laneEntry.reqs zip laneInv.asBools).zipWithIndex
        .foreach { case ((reqEntry, inv), i) =>
          val req = reqQueues.io.elts(lane)(i)
-          when ((coalescer.io.invalidate.valid && inv)) {
-            printf(s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source)
+          when((coalescer.io.invalidate.valid && inv)) {
+            printf(
+              s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
+              req.source
+            )
          }
          reqEntry.valid := (coalescer.io.invalidate.valid && inv)
          reqEntry.source := req.source
@@ -845,22 +926,23 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
  tlCoal.d.ready := uncoalescer.io.coalResp.ready

  // Connect uncoalescer results back into each lane's response queue
-  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) =>
-    perLaneResps.zipWithIndex.foreach { case (resp, i) =>
-      // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
-      // cache.  This should ideally not happen though.
-      assert(
-        q.io.enq(respQueueUncoalPortOffset + i).ready,
-        s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
-      )
-      q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
-      q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
+  (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach {
+    case ((q, perLaneResps), lane) =>
+      perLaneResps.zipWithIndex.foreach { case (resp, i) =>
+        // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream
+        // cache.  This should ideally not happen though.
+        assert(
+          q.io.enq(respQueueUncoalPortOffset + i).ready,
+          s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
+        )
+        q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
+        q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
      // debug
      // when (resp.valid) {
      //   printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
      // }
      // dontTouch(q.io.enq(respQueueCoalPortOffset))
-    }
+      }
  }

  // Debug
@@ -972,7 +1054,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module {
 // split the coalesced response back to individual per-lane responses with the
 // right metadata.
 class InflightCoalReqTable(config: CoalescerConfig) extends Module {
-  val offsetBits = config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
+  val offsetBits =
+    config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
  val entryT = new InflightCoalReqTableEntry(
    config.numLanes,
    config.queueDepth,
@@ -1019,7 +1102,7 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module {
  }

  val full = Wire(Bool())
-  full := (0 until entries).map( table(_).valid ).reduce( _ && _ )
+  full := (0 until entries).map(table(_).valid).reduce(_ && _)
  assert(!full, "inflight table is full and blocking coalescer")
  dontTouch(full)

@@ -1094,8 +1177,12 @@ object TLUtils {
 // `traceHasSource` is true if the input trace file has an additional source
 // ID column.  This is useful for using the output trace file genereated by
 // MemTraceLogger as the driver.
-class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: Boolean = false)
-  (implicit p: Parameters) extends LazyModule {
+class MemTraceDriver(
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean = false
+)(implicit p: Parameters)
+    extends LazyModule {
  // Create N client nodes together
  val laneNodes = Seq.tabulate(config.numLanes) { i =>
    val clientParam = Seq(
@@ -1113,7 +1200,8 @@ class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource:
  val node = TLIdentityNode()
  laneNodes.foreach { l => node := l }

-  lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource)
+  lazy val module =
+    new MemTraceDriverImp(this, config, filename, traceHasSource)
 }

 trait HasTraceLine {
@@ -1136,9 +1224,12 @@ class TraceLine extends Bundle with HasTraceLine {
  val data = UInt(64.W)
 }

-class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename: String,
-  traceHasSource: Boolean)
-    extends LazyModuleImp(outer)
+class MemTraceDriverImp(
+    outer: MemTraceDriver,
+    config: CoalescerConfig,
+    filename: String,
+    traceHasSource: Boolean
+) extends LazyModuleImp(outer)
    with UnitTestModule {
  // Current cycle mark to read from trace
  val traceReadCycle = RegInit(1.U(64.W))
@@ -1176,7 +1267,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename

  // Not all fire because trace cycle has to advance even when there is no valid
  // line in the trace.
-  when (reqQueueAllReady){
+  when(reqQueueAllReady) {
    traceReadCycle := traceReadCycle + 1.U
  }

@@ -1216,11 +1307,16 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
    sizeInBytes := (1.U) << req.size
    mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
    wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
-    val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
+    val wordAlignedAddress =
+      req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
    val wordAlignedSize = Mux(subword, 2.U, req.size)

-    val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numOldSrcIds),
-      ignoreInUse = false))
+    val sourceGen = Module(
+      new RoundRobinSourceGenerator(
+        log2Ceil(config.numOldSrcIds),
+        ignoreInUse = false
+      )
+    )
    sourceGen.io.gen := reqQ.io.deq.fire
    // assert(sourceGen.io.id.valid)

@@ -1229,7 +1325,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize, // trace line already holds log2(size)
      // data should be aligned to beatBytes
-      data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
+      data =
+        (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
    )
    val (glegal, gbits) = edge.Get(
      fromSource = sourceGen.io.id.bits,
@@ -1240,7 +1337,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename
    val bits = Mux(req.is_store, pbits, gbits)

    tlOut.a.valid := (reqQ.io.deq.valid && sourceGen.io.id.valid)
-    when (tlOut.a.valid) {
+    when(tlOut.a.valid) {
      assert(legal, "illegal TL req gen")
    }
    tlOut.a.bits := bits
@@ -1288,9 +1385,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename

 class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
    extends BlackBox(
-      Map("FILENAME" -> filename,
-          "NUM_LANES" -> numLanes,
-          "HAS_SOURCE" -> (if (traceHasSource) 1 else 0))
+      Map(
+        "FILENAME" -> filename,
+        "NUM_LANES" -> numLanes,
+        "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)
+      )
    )
    with HasBlackBoxResource {
  val traceLineT = new TraceLine
@@ -1304,19 +1403,20 @@ class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)

    // These names have to match declarations in the Verilog code, eg.
    // trace_read_address.
-    val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source
-      val ready = Input(Bool())
-      val valid = Output(UInt(numLanes.W))
-      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
-      // single wide 1D array.
-      // TODO: assumes 64-bit address.
-      val cycle = Input(UInt(64.W))
-      val address = Output(UInt((addrW * numLanes).W))
-      val is_store = Output(UInt(numLanes.W))
-      val size = Output(UInt((sizeW * numLanes).W))
-      val data = Output(UInt((dataW * numLanes).W))
-      val finished = Output(Bool())
-    }
+    val trace_read =
+      new Bundle { // can't use HasTraceLine because this doesn't have source
+        val ready = Input(Bool())
+        val valid = Output(UInt(numLanes.W))
+        // Chisel can't interface with Verilog 2D port, so flatten all lanes into
+        // single wide 1D array.
+        // TODO: assumes 64-bit address.
+        val cycle = Input(UInt(64.W))
+        val address = Output(UInt((addrW * numLanes).W))
+        val is_store = Output(UInt(numLanes.W))
+        val size = Output(UInt((sizeW * numLanes).W))
+        val data = Output(UInt((dataW * numLanes).W))
+        val finished = Output(Bool())
+      }
  })

  addResource("/vsrc/SimMemTrace.v")
@@ -1443,11 +1543,11 @@ class MemTraceLogger(

        // This assert only holds true for PutFullData and not PutPartialData,
        // where HIGH bits in the mask may not be contiguous.
-        when (tlIn.a.valid) {
+        when(tlIn.a.valid) {
          assert(
            PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
            "mask HIGH popcount do not match the TL size. " +
-            "Partial masks are not allowed for PutFull"
+              "Partial masks are not allowed for PutFull"
          )
        }
        val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
@@ -1476,17 +1576,25 @@ class MemTraceLogger(

    // stats
    val numReqsThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
    val numRespsThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
+      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce {
+        (v0, v1) => v0 + v1
+      }
    val reqBytesThisCycle =
-      laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneReqs
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
    val respBytesThisCycle =
-      laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
-        b0 + b1
-      }
+      laneResps
+        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .reduce { (b0, b1) =>
+          b0 + b1
+        }
    numReqs := numReqs + numReqsThisCycle
    numResps := numResps + numRespsThisCycle
    reqBytes := reqBytes + reqBytesThisCycle
@@ -1496,7 +1604,10 @@ class MemTraceLogger(
    //
    // This is a clunky workaround of the fact that Chisel doesn't allow partial
    // assignment to a bitfield range of a wide signal.
-    def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
+    def flattenTrace(
+        simIO: Bundle with HasTraceLine,
+        perLane: Vec[TraceLine]
+    ) = {
      // these will get optimized out
      val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
      val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
@@ -1592,8 +1703,14 @@ object TLPrintf {
      tlData: UInt,
      reqData: UInt
  ) = {
-    printf(s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
-      source, address, size, mask, is_store)
+    printf(
+      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
+      source,
+      address,
+      size,
+      mask,
+      is_store
+    )
    when(is_store) {
      printf(", tlData=%x, reqData=%x", tlData, reqData)
    }
@@ -1604,7 +1721,7 @@ object TLPrintf {
 // Synthesizable unit tests

 class DummyDriver(config: CoalescerConfig)(implicit p: Parameters)
-  extends LazyModule {
+    extends LazyModule {
  val laneNodes = Seq.tabulate(config.numLanes) { i =>
    val clientParam = Seq(
      TLMasterParameters.v1(
@@ -1640,7 +1757,10 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
    // generate dummy traffic to coalescer to prevent it from being optimized
    // out during synthesis
    val address = Wire(UInt(config.addressWidth.W))
-    address := Cat((finishCounter + (lane.U % 3.U)), 0.U(config.wordSizeWidth.W))
+    address := Cat(
+      (finishCounter + (lane.U % 3.U)),
+      0.U(config.wordSizeWidth.W)
+    )
    val (tl, edge) = node.out(0)
    val (legal, bits) = edge.Put(
      fromSource = sourceIdCounter,
@@ -1657,11 +1777,13 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
    tl.e.valid := false.B
  }

-  val dataSum = outer.laneNodes.map { node =>
-    val tl = node.out(0)._1
-    val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
-    data
-  }.reduce (_ +& _)
+  val dataSum = outer.laneNodes
+    .map { node =>
+      val tl = node.out(0)._1
+      val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
+      data
+    }
+    .reduce(_ +& _)
  // this doesn't make much sense, but it prevents the entire uncoalescer from
  // being optimized away
  finishCounter := finishCounter + dataSum
@@ -1680,8 +1802,10 @@ class DummyCoalescer(implicit p: Parameters) extends LazyModule {
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
    )
  )

@@ -1704,7 +1828,8 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
 }

 // tracedriver --> coalescer --> tracelogger --> tlram
-class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
+class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
+    extends LazyModule {
  val numLanes = p(SIMTCoreKey).get.nLanes
  val config = defaultConfig.copy(numLanes = numLanes)

@@ -1713,14 +1838,18 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
    new MemTraceLogger(numLanes, filename, loggerName = "coreside")
  )
  val coal = LazyModule(new CoalescingUnit(config))
-  val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside"))
+  val memSideLogger = LazyModule(
+    new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")
+  )
  val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << config.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << config.dataBusWidth)
+      )
    )
  )

@@ -1751,8 +1880,9 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz
  }
 }

-class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
-    extends UnitTest(timeout) {
+class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit
+    p: Parameters
+) extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
  dut.io.start := io.start
  io.finished := dut.io.finished
@@ -1770,8 +1900,10 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
-      new TLRAM(address = AddressSet(0x0000, 0xffffff),
-        beatBytes = (1 << defaultConfig.dataBusWidth))
+      new TLRAM(
+        address = AddressSet(0x0000, 0xffffff),
+        beatBytes = (1 << defaultConfig.dataBusWidth)
+      )
    )
  )

@@ -1785,13 +1917,13 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
  }
 }

-class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) {
+class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
+    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescer).module)
  dut.io.start := io.start
  io.finished := dut.io.finished
 }

-
 ////////////
 ////////////
 ////////////
@@ -1941,11 +2073,3 @@ class CoalescerXbarImpl(outer: CoalescerXbar,


  }
-
-
-
-
-
-
-
-