From 0c8909cb43d3d48a9a2f59736e943831e914982b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 11 May 2023 16:11:39 -0700 Subject: [PATCH] scalafmt --- src/main/scala/tilelink/Coalescing.scala | 480 ++++++++++++++--------- 1 file changed, 302 insertions(+), 178 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 373337a..0e72dae 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -152,7 +152,7 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In fromSource = this.source, toAddress = this.address, lgSize = this.size, - data = this.data, + data = this.data ) val (glegal, gbits) = edgeOut.Get( fromSource = this.source, @@ -166,17 +166,22 @@ class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: In } } case class NonCoalescedRequest(config: CoalescerConfig) -extends Request(sourceWidth = log2Ceil(config.numOldSrcIds), - sizeWidth = config.wordSizeWidth, - addressWidth = config.addressWidth, - dataWidth = config.wordSizeInBytes * 8) + extends Request( + sourceWidth = log2Ceil(config.numOldSrcIds), + sizeWidth = config.wordSizeWidth, + addressWidth = config.addressWidth, + dataWidth = config.wordSizeInBytes * 8 + ) case class CoalescedRequest(config: CoalescerConfig) -extends Request(sourceWidth = log2Ceil(config.numNewSrcIds), - sizeWidth = log2Ceil(config.maxCoalLogSize), - addressWidth = config.addressWidth, - dataWidth = (8 * (1 << config.maxCoalLogSize))) + extends Request( + sourceWidth = log2Ceil(config.numNewSrcIds), + sizeWidth = log2Ceil(config.maxCoalLogSize), + addressWidth = config.addressWidth, + dataWidth = (8 * (1 << config.maxCoalLogSize)) + ) -class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle { +class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) + extends Bundle { val op = UInt(1.W) // 0=READ 1=WRITE val size = UInt(sizeWidth.W) val source = UInt(sourceWidth.W) @@ -205,17 +210,22 @@ class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle } } case class NonCoalescedResponse(config: CoalescerConfig) -extends Response(sourceWidth = log2Ceil(config.numOldSrcIds), - sizeWidth = config.wordSizeWidth, - dataWidth = config.wordSizeInBytes * 8) + extends Response( + sourceWidth = log2Ceil(config.numOldSrcIds), + sizeWidth = config.wordSizeWidth, + dataWidth = config.wordSizeInBytes * 8 + ) case class CoalescedResponse(config: CoalescerConfig) -extends Response(sourceWidth = log2Ceil(config.numNewSrcIds), - sizeWidth = log2Ceil(config.maxCoalLogSize), - dataWidth = (8 * (1 << config.maxCoalLogSize))) + extends Response( + sourceWidth = log2Ceil(config.numNewSrcIds), + sizeWidth = log2Ceil(config.maxCoalLogSize), + dataWidth = (8 * (1 << config.maxCoalLogSize)) + ) // If `ignoreInUse`, just keep giving out new IDs without checking if it is in // use. -class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module { +class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) + extends Module { val io = IO(new Bundle { val gen = Input(Bool()) val reclaim = Input(Valid(UInt(sourceWidth.W))) @@ -234,15 +244,16 @@ class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) e io.id.valid := (if (ignoreInUse) true.B else !occupancyTable(head).valid) io.id.bits := head - when (io.gen && io.id.valid /* fire */) { + when(io.gen && io.id.valid /* fire */ ) { occupancyTable(io.id.bits).valid := true.B // mark in use } - when (io.reclaim.valid) { + when(io.reclaim.valid) { occupancyTable(io.reclaim.bits).valid := false.B // mark freed } } -class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module { +class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) + extends Module { val io = IO(new Bundle { val queue = new Bundle { val enq = Vec(config.numLanes, DeqIO(gen.cloneType)) @@ -259,7 +270,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e // eltPrototype.valid := false.B val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen)))) - val writePtr = RegInit(VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))) + val writePtr = RegInit( + VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W))) + ) val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B))) private def resetElts = { @@ -270,7 +283,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e } } } - when (reset.asBool) { + when(reset.asBool) { resetElts } @@ -286,14 +299,17 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e // current cycle. // // shift hint is when the heads have no more coalescable left this or next cycle - val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) => - c && !(io.invalidate.valid && inv) - }.reduce(_ || _) + val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))) + .map { case (c, inv) => + c && !(io.invalidate.valid && inv) + } + .reduce(_ || _) val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _) // valid && !fire means we enable enqueueing to a full queue, provided the // arbiter is taking away all remaining valid queue heads in the next cycle so // that we make space for the entire next warp. - val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) + val syncedDeqValidNextCycle = + io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) for (i <- 0 until config.numLanes) { val enq = io.queue.enq(i) @@ -313,20 +329,22 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e // can take new entries if not empty, or if full but shifting enq.ready := (!ctrl.full) || ctrl.shift - when (ctrl.shift) { + when(ctrl.shift) { // shift, invalidate tail, invalidate coalesced requests elts(i).zipWithIndex.foreach { case (elt, j) => if (j == entries - 1) { // tail elt.valid := false.B } else { elt.bits := elts(i)(j + 1).bits - elt.valid := elts(i)(j + 1).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1)) + elt.valid := elts(i)( + j + 1 + ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1)) } } // reset dequeue mask when new entries are shifted in deqDone(i) := false.B // enqueue - when (enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire + when(enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire elts(i)(writePtr(i) - 1.U).bits := enq.bits elts(i)(writePtr(i) - 1.U).valid := enq.valid }.otherwise { @@ -334,13 +352,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e } }.otherwise { // invalidate coalesced requests - when (io.invalidate.valid) { + when(io.invalidate.valid) { (elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) => elt.valid := elt.valid && !inv } } // enqueue - when (enq.ready && syncedEnqValid) { + when(enq.ready && syncedEnqValid) { elts(i)(writePtr(i)).bits := enq.bits elts(i)(writePtr(i)).valid := enq.valid writePtr(i) := writePtr(i) + 1.U @@ -352,8 +370,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e // When doing spatial-only coalescing, queues should never drift from each // other, i.e. the queue heads should always contain mem requests from the // same instruction. - val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) && - writePtr.map(_ === writePtr.head).reduce(_ && _) + val queueInSync = + controlSignals.map(_ === controlSignals.head).reduce(_ && _) && + writePtr.map(_ === writePtr.head).reduce(_ && _) assert(queueInSync, "shift queue lanes are not in sync") io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt) @@ -361,8 +380,11 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e } // Software model: coalescer.py -class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedRequest], - config: CoalescerConfig) extends Module { +class MonoCoalescer( + coalLogSize: Int, + windowT: CoalShiftQueue[NonCoalescedRequest], + config: CoalescerConfig +) extends Module { val io = IO(new Bundle { val window = Input(windowT.io.cloneType) val results = Output(new Bundle { @@ -371,8 +393,10 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W))) // number of entries matched with this leader lane's head. // maximum is numLanes * queueDepth - val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W)) - val coverageHits = Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W)) + val matchCount = + Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W)) + val coverageHits = + Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W)) val canCoalesce = Output(Vec(config.numLanes, Bool())) }) }) @@ -386,9 +410,13 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques val leadersValid = io.window.mask.map(_.asBools.head) def printQueueHeads = { - leaders.zipWithIndex.foreach{ case (head, i) => - printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n", - leadersValid(i), head.source, head.address) + leaders.zipWithIndex.foreach { case (head, i) => + printf( + s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n", + leadersValid(i), + head.source, + head.address + ) } } // when (leadersValid.reduce(_ || _)) { @@ -406,34 +434,42 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques // Gives a 2-D table of Bools representing match at every queue entry, // for each lane (so 3-D in total). // dimensions: (leader lane, follower lane, follower entry) - val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) => - (io.window.elts zip io.window.mask).map { case (followers, followerValids) => - // compare leader's head against follower's every queue entry - (followers zip followerValids.asBools).map { case (follower, followerValid) => - canMatch(follower, followerValid, leader, leaderValid) - // FIXME: disabling halving optimization because it does not give the - // correct per-lane coalescable indication to the shift queue - // // match leader to only followers at lanes >= leader idx - // // this halves the number of comparators - // if (followerIndex < leaderIndex) false.B - // else canMatch(follower, followerValid, leader, leaderValid) + val matchTablePerLane = (leaders zip leadersValid).map { + case (leader, leaderValid) => + (io.window.elts zip io.window.mask).map { + case (followers, followerValids) => + // compare leader's head against follower's every queue entry + (followers zip followerValids.asBools).map { + case (follower, followerValid) => + canMatch(follower, followerValid, leader, leaderValid) + // FIXME: disabling halving optimization because it does not give the + // correct per-lane coalescable indication to the shift queue + // // match leader to only followers at lanes >= leader idx + // // this halves the number of comparators + // if (followerIndex < leaderIndex) false.B + // else canMatch(follower, followerValid, leader, leaderValid) + } } - } } val matchCounts = matchTablePerLane.map(table => - table.map(PopCount(_)) // sum up each column - .reduce(_ +& _)) + table + .map(PopCount(_)) // sum up each column + .reduce(_ +& _) + ) val canCoalesce = matchCounts.map(_ > 1.U) // Elect the leader that has the most match counts. // TODO: potentially expensive: magnitude comparator def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = { - matchCounts.zipWithIndex.map { - case (c, i) => (c, i.U) - }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => + matchCounts.zipWithIndex + .map { case (c, i) => + (c, i.U) + } + .reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j)) - }._2 + } + ._2 } // Elect leader by choosing the smallest-index lane that has a valid // match, i.e. using priority encoder. @@ -444,7 +480,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux // matchTable for the chosen lane, but converted to a Vec[UInt] - val chosenMatches = VecInit(matchTablePerLane.map{ table => + val chosenMatches = VecInit(matchTablePerLane.map { table => VecInit(table.map(VecInit(_).asUInt)) })(chosenLeaderIdx) val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx) @@ -452,18 +488,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques // coverage calculation def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth) // 2-D table flattened to 1-D - val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address))) + val offsets = + io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address))) val valids = chosenMatches.flatMap(_.asBools) // indicates for each word in the coalesced chunk whether it is accessed by // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four // words in the coalesced data coming back will be accessed by some request // and we've reached 100% bandwidth utilization. val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { target => - (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _) + (offsets zip valids) + .map { case (offset, valid) => valid && (offset === target.U) } + .reduce(_ || _) } // debug prints - when (leadersValid.reduce(_ || _)) { + when(leadersValid.reduce(_ || _)) { matchCounts.zipWithIndex.foreach { case (count, i) => printf(s"lane[${i}] matchCount = %d\n", count); } @@ -492,20 +531,26 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[NonCoalescedReques // coalesced request out of all possible combinations. // // Software model: coalescer.py -class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Request, - config: CoalescerConfig) extends Module { +class MultiCoalescer( + windowT: CoalShiftQueue[NonCoalescedRequest], + coalReqT: Request, + config: CoalescerConfig +) extends Module { val io = IO(new Bundle { // coalescing window, connected to the contents of the request queues val window = Input(windowT.io.cloneType) // generated coalesced request val coalReq = DecoupledIO(coalReqT.cloneType) // invalidate signals going into each request queue's head - val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))) + val invalidate = + Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))) // whether a lane is coalescable val coalescable = Output(Vec(config.numLanes, Bool())) }) - val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config))) + val coalescers = config.coalLogSizes.map(size => + Module(new MonoCoalescer(size, windowT, config)) + ) coalescers.foreach(_.io.window := io.window) def normalize(valPerSize: Seq[UInt]): Seq[UInt] = { @@ -530,9 +575,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W)) val chosenValid = Wire(Bool()) // minimum 25% coverage - val minCoverage = 1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2)) + val minCoverage = + 1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2)) - when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) { + when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) { chosenSizeIdx := argMax(normalizedHits) chosenValid := true.B printf("coalescing success by coverage policy\n") @@ -562,9 +608,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req val flatMatches = chosenBundle.matchOH.flatMap(_.asBools) // check for word alignment in addresses - assert(io.window.elts.flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)).zip( - io.window.mask.flatMap(_.asBools)).map { case (aligned, valid) => (!valid) || aligned }.reduce(_ || _), - "one or more addresses used for coalescing is not word-aligned") + assert( + io.window.elts + .flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U)) + .zip(io.window.mask.flatMap(_.asBools)) + .map { case (aligned, valid) => (!valid) || aligned } + .reduce(_ || _), + "one or more addresses used for coalescing is not word-aligned" + ) // note: this is word-level coalescing. if finer granularity is needed, need to modify code val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt @@ -579,18 +630,29 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req val sel = flatReqs.zip(flatMatches).map { case (req, m) => // note: ANDing against addrMask is to conform to active byte lanes requirements // if aligning to LSB suffices, we should add the bitwise AND back - m && ((req.address(config.maxCoalLogSize - 1, config.wordSizeWidth)/* & addrMask*/) === i.U) + m && ((req.address( + config.maxCoalLogSize - 1, + config.wordSizeWidth + ) /* & addrMask*/ ) === i.U) } // TODO: SW uses priority encoder, not sure about behavior of MuxCase - data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) => - s -> req.data - }) - mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) => - s -> req.mask - }) + data(i) := MuxCase( + DontCare, + flatReqs.zip(sel).map { case (req, s) => + s -> req.data + } + ) + mask(i) := MuxCase( + 0.U, + flatReqs.zip(sel).map { case (req, s) => + s -> req.mask + } + ) } - val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds))) + val sourceGen = Module( + new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds)) + ) sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created sourceGen.io.reclaim.valid := false.B // not used sourceGen.io.reclaim.bits := DontCare // not used @@ -608,7 +670,10 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req io.invalidate.bits := chosenBundle.matchOH io.invalidate.valid := io.coalReq.fire // invalidate only when fire - io.coalescable := coalescers.map(_.io.results.canCoalesce.asUInt).reduce(_ | _).asBools + io.coalescable := coalescers + .map(_.io.results.canCoalesce.asUInt) + .reduce(_ | _) + .asBools dontTouch(io.invalidate) // debug @@ -620,21 +685,30 @@ class MultiCoalescer(windowT: CoalShiftQueue[NonCoalescedRequest], coalReqT: Req if (!config.enable) disable } -class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) { - require(outer.cpuNode.in.length == config.numLanes, +class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) + extends LazyModuleImp(outer) { + require( + outer.cpuNode.in.length == config.numLanes, s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " + - s"config.numLanes (${config.numLanes})") - require(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds), + s"config.numLanes (${config.numLanes})" + ) + require( + outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds), s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " + - s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})") - require(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth, + s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})" + ) + require( + outer.cpuNode.in.head._1.params.addressBits == config.addressWidth, s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " + - s"mismatch with config.addressWidth (${config.addressWidth})") + s"mismatch with config.addressWidth (${config.addressWidth})" + ) val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits // note we are using word size. assuming all coalescer inputs are word sized val reqQueueEntryT = new NonCoalescedRequest(config) - val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config)) + val reqQueues = Module( + new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config) + ) val coalReqT = new CoalescedRequest(config) val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config)) @@ -710,7 +784,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below. tlCoal.e.valid := false.B - // =========================================================================== // Response flow // =========================================================================== @@ -723,8 +796,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends val numPerLaneReqs = config.queueDepth // FIXME: no need to contain maxCoalLogSize data - val respQueueEntryT = new Response(oldSourceWidth, log2Ceil(config.maxCoalLogSize), - (1 << config.maxCoalLogSize) * 8) + val respQueueEntryT = new Response( + oldSourceWidth, + log2Ceil(config.maxCoalLogSize), + (1 << config.maxCoalLogSize) * 8 + ) val respQueues = Seq.tabulate(config.numLanes) { _ => Module( new MultiPortQueue( @@ -810,12 +886,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends val newEntry = Wire(uncoalescer.inflightTable.entryT) newEntry.source := coalescer.io.coalReq.bits.source - assert (config.maxCoalLogSize <= config.dataBusWidth, - "multi-beat coalesced reads/writes are currently not supported") - assert ( + assert( + config.maxCoalLogSize <= config.dataBusWidth, + "multi-beat coalesced reads/writes are currently not supported" + ) + assert( tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8, s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant" - + s" (${(1 << config.dataBusWidth) * 8})" + + s" (${(1 << config.dataBusWidth) * 8})" ) val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits) // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the @@ -825,8 +903,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends (laneEntry.reqs zip laneInv.asBools).zipWithIndex .foreach { case ((reqEntry, inv), i) => val req = reqQueues.io.elts(lane)(i) - when ((coalescer.io.invalidate.valid && inv)) { - printf(s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source) + when((coalescer.io.invalidate.valid && inv)) { + printf( + s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", + req.source + ) } reqEntry.valid := (coalescer.io.invalidate.valid && inv) reqEntry.source := req.source @@ -845,22 +926,23 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends tlCoal.d.ready := uncoalescer.io.coalResp.ready // Connect uncoalescer results back into each lane's response queue - (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) => - perLaneResps.zipWithIndex.foreach { case (resp, i) => - // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream - // cache. This should ideally not happen though. - assert( - q.io.enq(respQueueUncoalPortOffset + i).ready, - s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}" - ) - q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid - q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits + (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { + case ((q, perLaneResps), lane) => + perLaneResps.zipWithIndex.foreach { case (resp, i) => + // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream + // cache. This should ideally not happen though. + assert( + q.io.enq(respQueueUncoalPortOffset + i).ready, + s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}" + ) + q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid + q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits // debug // when (resp.valid) { // printf(s"${i}-th uncoalesced response came back from lane ${lane}\n") // } // dontTouch(q.io.enq(respQueueCoalPortOffset)) - } + } } // Debug @@ -972,7 +1054,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module { // split the coalesced response back to individual per-lane responses with the // right metadata. class InflightCoalReqTable(config: CoalescerConfig) extends Module { - val offsetBits = config.maxCoalLogSize - config.wordSizeWidth // assumes word offset + val offsetBits = + config.maxCoalLogSize - config.wordSizeWidth // assumes word offset val entryT = new InflightCoalReqTableEntry( config.numLanes, config.queueDepth, @@ -1019,7 +1102,7 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module { } val full = Wire(Bool()) - full := (0 until entries).map( table(_).valid ).reduce( _ && _ ) + full := (0 until entries).map(table(_).valid).reduce(_ && _) assert(!full, "inflight table is full and blocking coalescer") dontTouch(full) @@ -1094,8 +1177,12 @@ object TLUtils { // `traceHasSource` is true if the input trace file has an additional source // ID column. This is useful for using the output trace file genereated by // MemTraceLogger as the driver. -class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: Boolean = false) - (implicit p: Parameters) extends LazyModule { +class MemTraceDriver( + config: CoalescerConfig, + filename: String, + traceHasSource: Boolean = false +)(implicit p: Parameters) + extends LazyModule { // Create N client nodes together val laneNodes = Seq.tabulate(config.numLanes) { i => val clientParam = Seq( @@ -1113,7 +1200,8 @@ class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: val node = TLIdentityNode() laneNodes.foreach { l => node := l } - lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource) + lazy val module = + new MemTraceDriverImp(this, config, filename, traceHasSource) } trait HasTraceLine { @@ -1136,9 +1224,12 @@ class TraceLine extends Bundle with HasTraceLine { val data = UInt(64.W) } -class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename: String, - traceHasSource: Boolean) - extends LazyModuleImp(outer) +class MemTraceDriverImp( + outer: MemTraceDriver, + config: CoalescerConfig, + filename: String, + traceHasSource: Boolean +) extends LazyModuleImp(outer) with UnitTestModule { // Current cycle mark to read from trace val traceReadCycle = RegInit(1.U(64.W)) @@ -1176,7 +1267,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename // Not all fire because trace cycle has to advance even when there is no valid // line in the trace. - when (reqQueueAllReady){ + when(reqQueueAllReady) { traceReadCycle := traceReadCycle + 1.U } @@ -1216,11 +1307,16 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename sizeInBytes := (1.U) << req.size mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U) wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data) - val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W) + val wordAlignedAddress = + req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W) val wordAlignedSize = Mux(subword, 2.U, req.size) - val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numOldSrcIds), - ignoreInUse = false)) + val sourceGen = Module( + new RoundRobinSourceGenerator( + log2Ceil(config.numOldSrcIds), + ignoreInUse = false + ) + ) sourceGen.io.gen := reqQ.io.deq.fire // assert(sourceGen.io.id.valid) @@ -1229,7 +1325,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename toAddress = hashToValidPhyAddr(wordAlignedAddress), lgSize = wordAlignedSize, // trace line already holds log2(size) // data should be aligned to beatBytes - data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt + data = + (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt ) val (glegal, gbits) = edge.Get( fromSource = sourceGen.io.id.bits, @@ -1240,7 +1337,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename val bits = Mux(req.is_store, pbits, gbits) tlOut.a.valid := (reqQ.io.deq.valid && sourceGen.io.id.valid) - when (tlOut.a.valid) { + when(tlOut.a.valid) { assert(legal, "illegal TL req gen") } tlOut.a.bits := bits @@ -1288,9 +1385,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean) extends BlackBox( - Map("FILENAME" -> filename, - "NUM_LANES" -> numLanes, - "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)) + Map( + "FILENAME" -> filename, + "NUM_LANES" -> numLanes, + "HAS_SOURCE" -> (if (traceHasSource) 1 else 0) + ) ) with HasBlackBoxResource { val traceLineT = new TraceLine @@ -1304,19 +1403,20 @@ class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean) // These names have to match declarations in the Verilog code, eg. // trace_read_address. - val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source - val ready = Input(Bool()) - val valid = Output(UInt(numLanes.W)) - // Chisel can't interface with Verilog 2D port, so flatten all lanes into - // single wide 1D array. - // TODO: assumes 64-bit address. - val cycle = Input(UInt(64.W)) - val address = Output(UInt((addrW * numLanes).W)) - val is_store = Output(UInt(numLanes.W)) - val size = Output(UInt((sizeW * numLanes).W)) - val data = Output(UInt((dataW * numLanes).W)) - val finished = Output(Bool()) - } + val trace_read = + new Bundle { // can't use HasTraceLine because this doesn't have source + val ready = Input(Bool()) + val valid = Output(UInt(numLanes.W)) + // Chisel can't interface with Verilog 2D port, so flatten all lanes into + // single wide 1D array. + // TODO: assumes 64-bit address. + val cycle = Input(UInt(64.W)) + val address = Output(UInt((addrW * numLanes).W)) + val is_store = Output(UInt(numLanes.W)) + val size = Output(UInt((sizeW * numLanes).W)) + val data = Output(UInt((dataW * numLanes).W)) + val finished = Output(Bool()) + } }) addResource("/vsrc/SimMemTrace.v") @@ -1443,11 +1543,11 @@ class MemTraceLogger( // This assert only holds true for PutFullData and not PutPartialData, // where HIGH bits in the mask may not be contiguous. - when (tlIn.a.valid) { + when(tlIn.a.valid) { assert( PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), "mask HIGH popcount do not match the TL size. " + - "Partial masks are not allowed for PutFull" + "Partial masks are not allowed for PutFull" ) } val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) @@ -1476,17 +1576,25 @@ class MemTraceLogger( // stats val numReqsThisCycle = - laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 } + laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { + (v0, v1) => v0 + v1 + } val numRespsThisCycle = - laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 } + laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { + (v0, v1) => v0 + v1 + } val reqBytesThisCycle = - laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) => - b0 + b1 - } + laneReqs + .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) } + .reduce { (b0, b1) => + b0 + b1 + } val respBytesThisCycle = - laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) => - b0 + b1 - } + laneResps + .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) } + .reduce { (b0, b1) => + b0 + b1 + } numReqs := numReqs + numReqsThisCycle numResps := numResps + numRespsThisCycle reqBytes := reqBytes + reqBytesThisCycle @@ -1496,7 +1604,10 @@ class MemTraceLogger( // // This is a clunky workaround of the fact that Chisel doesn't allow partial // assignment to a bitfield range of a wide signal. - def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { + def flattenTrace( + simIO: Bundle with HasTraceLine, + perLane: Vec[TraceLine] + ) = { // these will get optimized out val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid))) val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source))) @@ -1592,8 +1703,14 @@ object TLPrintf { tlData: UInt, reqData: UInt ) = { - printf(s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d", - source, address, size, mask, is_store) + printf( + s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d", + source, + address, + size, + mask, + is_store + ) when(is_store) { printf(", tlData=%x, reqData=%x", tlData, reqData) } @@ -1604,7 +1721,7 @@ object TLPrintf { // Synthesizable unit tests class DummyDriver(config: CoalescerConfig)(implicit p: Parameters) - extends LazyModule { + extends LazyModule { val laneNodes = Seq.tabulate(config.numLanes) { i => val clientParam = Seq( TLMasterParameters.v1( @@ -1640,7 +1757,10 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig) // generate dummy traffic to coalescer to prevent it from being optimized // out during synthesis val address = Wire(UInt(config.addressWidth.W)) - address := Cat((finishCounter + (lane.U % 3.U)), 0.U(config.wordSizeWidth.W)) + address := Cat( + (finishCounter + (lane.U % 3.U)), + 0.U(config.wordSizeWidth.W) + ) val (tl, edge) = node.out(0) val (legal, bits) = edge.Put( fromSource = sourceIdCounter, @@ -1657,11 +1777,13 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig) tl.e.valid := false.B } - val dataSum = outer.laneNodes.map { node => - val tl = node.out(0)._1 - val data = Mux(tl.d.valid, tl.d.bits.data, 0.U) - data - }.reduce (_ +& _) + val dataSum = outer.laneNodes + .map { node => + val tl = node.out(0)._1 + val data = Mux(tl.d.valid, tl.d.bits.data, 0.U) + data + } + .reduce(_ +& _) // this doesn't make much sense, but it prevents the entire uncoalescer from // being optimized away finishCounter := finishCounter + dataSum @@ -1680,8 +1802,10 @@ class DummyCoalescer(implicit p: Parameters) extends LazyModule { // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink // edges globally, by way of Diplomacy communicating the TL slave // parameters to the upstream nodes. - new TLRAM(address = AddressSet(0x0000, 0xffffff), - beatBytes = (1 << config.dataBusWidth)) + new TLRAM( + address = AddressSet(0x0000, 0xffffff), + beatBytes = (1 << config.dataBusWidth) + ) ) ) @@ -1704,7 +1828,8 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) } // tracedriver --> coalescer --> tracelogger --> tlram -class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { +class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) + extends LazyModule { val numLanes = p(SIMTCoreKey).get.nLanes val config = defaultConfig.copy(numLanes = numLanes) @@ -1713,14 +1838,18 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz new MemTraceLogger(numLanes, filename, loggerName = "coreside") ) val coal = LazyModule(new CoalescingUnit(config)) - val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")) + val memSideLogger = LazyModule( + new MemTraceLogger(numLanes + 1, filename, loggerName = "memside") + ) val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink // edges globally, by way of Diplomacy communicating the TL slave // parameters to the upstream nodes. - new TLRAM(address = AddressSet(0x0000, 0xffffff), - beatBytes = (1 << config.dataBusWidth)) + new TLRAM( + address = AddressSet(0x0000, 0xffffff), + beatBytes = (1 << config.dataBusWidth) + ) ) ) @@ -1751,8 +1880,9 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz } } -class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters) - extends UnitTest(timeout) { +class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit + p: Parameters +) extends UnitTest(timeout) { val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module) dut.io.start := io.start io.finished := dut.io.finished @@ -1770,8 +1900,10 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink // edges globally, by way of Diplomacy communicating the TL slave // parameters to the upstream nodes. - new TLRAM(address = AddressSet(0x0000, 0xffffff), - beatBytes = (1 << defaultConfig.dataBusWidth)) + new TLRAM( + address = AddressSet(0x0000, 0xffffff), + beatBytes = (1 << defaultConfig.dataBusWidth) + ) ) ) @@ -1785,13 +1917,13 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { } } -class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { +class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) + extends UnitTest(timeout) { val dut = Module(LazyModule(new TLRAMCoalescer).module) dut.io.start := io.start io.finished := dut.io.finished } - //////////// //////////// //////////// @@ -1941,11 +2073,3 @@ class CoalescerXbarImpl(outer: CoalescerXbar, } - - - - - - - -