// See LICENSE.SiFive for license details.

package freechips.rocketchip.tilelink

import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config.{Parameters, Field}
import freechips.rocketchip.diplomacy._
// import freechips.rocketchip.devices.tilelink.TLTestRAM
import freechips.rocketchip.util.MultiPortQueue
import freechips.rocketchip.unittest._

// TODO: find better place for these
// Payload types stored (as Options) under the config keys declared below.
case class SIMTCoreParams(nLanes: Int = 4)
case class MemtraceCoreParams(
    tracefilename: String = "undefined",
    traceHasSource: Boolean = false
)

// Config keys; every one of them defaults to None.
case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/)
case object MemtraceCoreKey extends Field[Option[MemtraceCoreParams]](None /*default*/)
case object CoalescerKey extends Field[Option[CoalescerConfig]](None /*default*/)

// Enum describing the sizes tracked by the in-flight table, together with
// conversions between a log2 size (as UInt) and the enum value.
trait InFlightTableSizeEnum extends ChiselEnum {
  val INVALID: Type
  val FOUR: Type

  def logSizeToEnum(x: UInt): Type
  def enumToLogSize(x: Type): UInt
}

// Default encoding: INVALID = 0, FOUR = 1.
object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
  val INVALID = Value(0.U)
  val FOUR = Value(1.U)

  // log2 size 2 maps to FOUR; every other value maps to INVALID.
  def logSizeToEnum(x: UInt): Type = {
    MuxCase(INVALID, Seq(
      (x === 2.U) -> FOUR
    ))
  }

  // FOUR maps back to log2 size 2; every other value maps to 0.
  def enumToLogSize(x: Type): UInt = {
    MuxCase(0.U, Seq(
      (x === FOUR) -> 2.U
    ))
  }
}

// Mapping to reference model param names
//  numLanes: Int, <-> config.NUM_LANES
//  numPerLaneReqs: Int, <-> config.DEPTH
//  sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
//  sizeWidth: Int, <-> config.sizeEnum.width
//  coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
//  numInflightCoalRequests: Int <-> config.NUM_NEW_IDS

case class CoalescerConfig(
    enable: Boolean,        // globally enable or disable coalescing
    numLanes: Int,          // number of lanes (or threads) in a warp
    queueDepth: Int,        // request window per lane
    waitTimeout: Int,       // max cycles to wait before forced fifo dequeue, per lane
    addressWidth: Int,      // assume <= 32
    dataBusWidth: Int,      // memory-side downstream TileLink data bus size, given as
                            // log2(bytes): CoalescingUnitImp requires the coalesced
                            // edge's dataBits to equal (1 << dataBusWidth) * 8 and
                            // maxCoalLogSize <= dataBusWidth.
                            // this has to be at least larger than word size for
                            // the coalescer to perform well
    // watermark = 2,       // minimum buffer occupancy to start coalescing
    wordSizeInBytes: Int,   // 32-bit system
    numOldSrcIds: Int,      // num of outstanding requests per lane, from processor
    numNewSrcIds: Int,      // num of outstanding coalesced requests
    respQueueDepth: Int,    // depth of the response fifo queues
    coalLogSizes: Seq[Int], // list of coalescer sizes to try in the MonoCoalescers
                            // each size is log(byteSize)
    sizeEnum: InFlightTableSizeEnum,
    numCoalReqs: Int,       // total number of coalesced requests we can generate in one cycle
    numArbiterOutputPorts: Int, // total of output ports the arbiter will arbitrate into.
                                // this has to match downstream cache's configuration
    bankStrideInBytes: Int  // cache line strides across the different banks
) {
  // maximum coalesced size (log2 of bytes).
  // Checked lazily so that merely constructing a config never throws; an empty
  // size list is reported with a clear message instead of an opaque
  // "empty.max" failure.
  def maxCoalLogSize: Int = {
    require(coalLogSizes.nonEmpty, "coalLogSizes must contain at least one coalescing size")
    coalLogSizes.max
  }

  // log2 of the word size in bytes; the word size must be a power of two.
  def wordSizeWidth: Int = {
    val w = log2Ceil(wordSizeInBytes)
    require(
      wordSizeInBytes == 1 << w,
      s"wordSizeInBytes (${wordSizeInBytes}) is not power of two"
    )
    w
  }
}

object defaultConfig extends CoalescerConfig(
  enable = true,
  numLanes = 4,
  queueDepth = 1,
  waitTimeout = 8,
  addressWidth = 24,
  dataBusWidth = 4, // log2(bytes): 2^4 = 16 bytes, 128 bit bus
  // watermark = 2,
  wordSizeInBytes = 4,
  // when attaching to SoC, 16 source IDs are not enough due to longer latency
  numOldSrcIds = 8,
  numNewSrcIds = 8,
  respQueueDepth = 4,
  coalLogSizes = Seq(4),
  sizeEnum = DefaultInFlightTableSizeEnum,
  numCoalReqs = 1,
  numArbiterOutputPorts = 4,
  bankStrideInBytes = 64 // Current L2 is strided by 512 bits
)

// Diplomatic wrapper of the coalescer. Per-lane traffic enters through
// `cpuNode`; coalesced requests are generated by `coalescerNode`; both are
// merged into `aggregateNode`.
class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends LazyModule {
  // Nexus node that captures the incoming TL requests, rewrites coalescable requests,
  // and arbitrates between non-coalesced and coalesced requests to a fix number of outputs
  // before sending it out to memory. This node is what's visible to upstream and downstream nodes.

  // WIP:
  // val node = TLNexusNode(
  //   clientFn = c => c.head,
  //   managerFn = m => m.head // assuming arbiter generated ids are distinct between edges
  // )
  // node.in.map(_._2).foreach(edge => require(edge.manager.beatBytes == config.wordSizeInBytes,
  //   s"input edges into coalescer node does not have beatBytes = ${config.wordSizeInBytes}"))
  // node.out.map(_._2).foreach(edge => require(edge.manager.beatBytes == config.maxCoalLogSize,
  //   s"output edges into coalescer node does not have beatBytes = ${config.maxCoalLogSize}"))

  val aggregateNode = TLIdentityNode()
  val cpuNode = TLIdentityNode()

  // Number of maximum in-flight coalesced requests. The upper bound of this
  // value would be the sourceId range of a single lane.
  val numInflightCoalRequests = config.numNewSrcIds

  // Master node that actually generates coalesced requests.
  protected val coalParam = Seq(
    TLMasterParameters.v1(
      name = "CoalescerNode",
      sourceId = IdRange(0, numInflightCoalRequests)
    )
  )
  val coalescerNode = TLClientNode(
    Seq(TLMasterPortParameters.v1(coalParam))
  )

  // merge coalescerNode and cpuNode
  // (the order of these two bindings is kept as-is: coalescer edge first)
  aggregateNode :=* coalescerNode
  aggregateNode :=* TLWidthWidget(config.wordSizeInBytes) :=* cpuNode

  lazy val module = new CoalescingUnitImp(this, config)
}

// Protocol-agnostic bundles that represent a request and a response to the
// coalescer.
// Generic memory request. `op` selects between a read (0) and a write (1);
// `mask` and `data` are only meaningful for writes.
class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int)
    extends Bundle {
  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
  val op = UInt(1.W) // 0=READ 1=WRITE
  val address = UInt(addressWidth.W)
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
  val mask = UInt((dataWidth / 8).W) // write only
  val data = UInt(dataWidth.W) // write only

  // Converts this request into a TileLink A-channel bundle for `edgeOut`.
  // Both a Put and a Get are built and `op` selects between them.
  def toTLA(edgeOut: TLEdgeOut): TLBundleA = {
    // The write carries the request's byte mask. Without it the mask computed
    // by the coalescer (all-zero for words no lane touched) is lost and the
    // whole `size` region would be written, including don't-care data.
    val (plegal, pbits) = edgeOut.Put(
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size,
      data = this.data,
      mask = this.mask
    )
    val (glegal, gbits) = edgeOut.Get(
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size
    )
    val legal = Mux(this.op.asBool, plegal, glegal)
    val bits = Mux(this.op.asBool, pbits, gbits)
    // FIXME: this needs to check valid bit as well
    // assert(legal, "unhandled illegal TL req gen")
    bits
  }
}

// Per-lane (word-sized) request as issued by the core side.
case class NonCoalescedRequest(config: CoalescerConfig)
    extends Request(
      sourceWidth = log2Ceil(config.numOldSrcIds),
      sizeWidth = config.wordSizeWidth,
      addressWidth = config.addressWidth,
      dataWidth = config.wordSizeInBytes * 8
    )

// Request produced by the coalescer; data is as wide as the largest
// coalescing size.
case class CoalescedRequest(config: CoalescerConfig)
    extends Request(
      sourceWidth = log2Ceil(config.numNewSrcIds),
      sizeWidth = log2Ceil(config.maxCoalLogSize + 1),
      addressWidth = config.addressWidth,
      dataWidth = (8 * (1 << config.maxCoalLogSize))
    )

// Generic memory response. `data` is only meaningful for reads.
class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int) extends Bundle {
  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")
  val op = UInt(1.W) // 0=READ 1=WRITE
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
  val data = UInt(dataWidth.W) // read only
  val error = Bool()

  // Converts this response into a TileLink D-channel bundle for `edgeIn`:
  // an AccessAck without data for writes, with data for reads.
  def toTLD(edgeIn: TLEdgeIn): TLBundleD = {
    val apBits = edgeIn.AccessAck(
      toSource = this.source,
      lgSize = this.size
    )
    val agBits = edgeIn.AccessAck(
      toSource = this.source,
      lgSize = this.size,
      data = this.data
    )
    Mux(this.op.asBool, apBits, agBits)
  }

  // Drives every field of this bundle from a TileLink D-channel bundle.
  def fromTLD(bundle: TLBundleD): Unit = {
    this.source := bundle.source
    this.op := TLUtils.DOpcodeIsStore(bundle.opcode)
    this.size := bundle.size
    this.data := bundle.data
    this.error := bundle.denied
  }
}

case class NonCoalescedResponse(config: CoalescerConfig)
    extends Response(
      sourceWidth = log2Ceil(config.numOldSrcIds),
      sizeWidth = config.wordSizeWidth,
      dataWidth = config.wordSizeInBytes * 8
    )

case class CoalescedResponse(config: CoalescerConfig)
    extends Response(
      sourceWidth = log2Ceil(config.numNewSrcIds),
      // `size` has to hold every value in 0..maxCoalLogSize, hence the +1
      // (same width as CoalescedRequest.size). log2Ceil(maxCoalLogSize) alone
      // is one bit short whenever maxCoalLogSize is a power of two.
      sizeWidth = log2Ceil(config.maxCoalLogSize + 1),
      dataWidth = (8 * (1 << config.maxCoalLogSize))
    )

// Hands out source IDs in increasing (wrapping) order and tracks which IDs
// are in use.
// If `ignoreInUse`, just keep giving out new IDs without checking if it is in
// use.
class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
  val io = IO(new Bundle {
    val gen = Input(Bool())
    val reclaim = Input(Valid(UInt(sourceWidth.W)))
    val id = Output(Valid(UInt(sourceWidth.W)))
  })
  // Next ID to hand out; advances by one on every cycle `gen` is asserted.
  val head = RegInit(UInt(sourceWidth.W), 0.U)
  head := Mux(io.gen, head + 1.U, head)

  val numSourceId = 1 << sourceWidth
  // true: in use, false: available
  val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
  when(reset.asBool) {
    (0 until numSourceId).foreach { i => occupancyTable(i).valid := false.B }
  }

  io.id.valid := (if (ignoreInUse) true.B else !occupancyTable(head).valid)
  io.id.bits := head
  when(io.gen && io.id.valid /* fire */ ) {
    occupancyTable(io.id.bits).valid := true.B // mark in use
  }
  when(io.reclaim.valid) {
    occupancyTable(io.reclaim.bits).valid := false.B // mark freed
  }
}

// Per-lane shift-register queues that form the coalescing window. The full
// queue contents (`elts`) and their valid bits (`mask`) are exposed so the
// coalescer can inspect every entry, and individual entries can be
// invalidated through `invalidate` once they have been coalesced.
class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module {
  val io = IO(new Bundle {
    val queue = new Bundle {
      val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
      val deq = Vec(config.numLanes, EnqIO(gen.cloneType))
    }
    val invalidate = Input(Valid(Vec(config.numLanes, UInt(entries.W))))
    val coalescable = Input(Vec(config.numLanes, Bool()))
    val mask = Output(Vec(config.numLanes, UInt(entries.W)))
    val elts = Output(Vec(config.numLanes, Vec(entries, gen)))
  })

  // val eltPrototype = Wire(Valid(gen))
  // eltPrototype.bits := DontCare
  // eltPrototype.valid := false.B

  val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
  val writePtr = RegInit(
    VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
  )
  val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))

  // Clears the valid bit of every entry in every lane.
  private def resetElts = {
    elts.foreach { laneQ =>
      laneQ.foreach { entry =>
        entry.valid := false.B
        entry.bits := DontCare
      }
    }
  }
  when(reset.asBool) {
    resetElts
  }

  val controlSignals = Wire(Vec(config.numLanes, new Bundle {
    val shift = Bool()
    val full = Bool()
    val empty = Bool()
  }))

  // io.coalescable will first turn on for all coalescable chunks, and turn off
  // incrementally as time goes on. Therefore, when io.coalescable is all
  // turned off, that means we have processed all coalescable chunks at the
  // current cycle.
  //
  // shift hint is when the heads have no more coalescable left this or next cycle
  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0)))
    .map { case (c, inv) => c && !(io.invalidate.valid && inv) }
    .reduce(_ || _)
  val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
  // valid && !fire means we enable enqueueing to a full queue, provided the
  // arbiter is taking away all remaining valid queue heads in the next cycle so
  // that we make space for the entire next warp.
  val syncedDeqValidNextCycle =
    io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)

  for (i <- 0 until config.numLanes) {
    val enq = io.queue.enq(i)
    val deq = io.queue.deq(i)
    val ctrl = controlSignals(i)

    ctrl.full := writePtr(i) === entries.U
    ctrl.empty := writePtr(i) === 0.U
    // shift when no outstanding dequeue, no more coalescable chunks, and not empty
    ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty

    // dequeue is valid when:
    // head entry is valid, has not been processed by downstream, and is not coalescable
    deq.bits := elts.map(_.head.bits)(i)
    deq.valid := elts.map(_.head.valid)(i) && !deqDone(i) && !io.coalescable(i)

    // can take new entries if not full, or if full but shifting
    enq.ready := (!ctrl.full) || ctrl.shift

    when(ctrl.shift) {
      // shift, invalidate tail, invalidate coalesced requests
      elts(i).zipWithIndex.foreach { case (elt, j) =>
        if (j == entries - 1) { // tail
          elt.valid := false.B
        } else {
          elt.bits := elts(i)(j + 1).bits
          elt.valid := elts(i)(
            j + 1
          ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
        }
      }
      // reset dequeue mask when new entries are shifted in
      deqDone(i) := false.B
      // enqueue
      when(enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
        // the new entry lands one slot lower because of the concurrent shift
        elts(i)(writePtr(i) - 1.U).bits := enq.bits
        elts(i)(writePtr(i) - 1.U).valid := enq.valid
      }.otherwise {
        writePtr(i) := writePtr(i) - 1.U
      }
    }.otherwise {
      // invalidate coalesced requests
      when(io.invalidate.valid) {
        (elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) =>
          elt.valid := elt.valid && !inv
        }
      }
      // enqueue
      when(enq.ready && syncedEnqValid) {
        elts(i)(writePtr(i)).bits := enq.bits
        elts(i)(writePtr(i)).valid := enq.valid
        writePtr(i) := writePtr(i) + 1.U
      }
      deqDone(i) := deqDone(i) || deq.fire
    }
  }

  // When doing spatial-only coalescing, queues should never drift from each
  // other, i.e. the queue heads should always contain mem requests from the
  // same instruction.
  val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
    writePtr.map(_ === writePtr.head).reduce(_ && _)
  assert(queueInSync, "shift queue lanes are not in sync")

  io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt)
  io.elts := elts.map(x => VecInit(x.map(_.bits)))
}

// Coalescer for one fixed size (`coalLogSize`, log2 of bytes): compares each
// lane's queue head against every window entry and reports the matches of
// the elected leader lane.
// Software model: coalescer.py
class MonoCoalescer(
    config: CoalescerConfig,
    coalLogSize: Int,
    queueT: CoalShiftQueue[NonCoalescedRequest]
) extends Module {
  val io = IO(new Bundle {
    val window = Input(queueT.io.cloneType)
    val results = Output(new Bundle {
      val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
      val baseAddr = Output(UInt(config.addressWidth.W))
      val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
      // number of entries matched with this leader lane's head.
      // maximum is numLanes * queueDepth
      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
      val coverageHits =
        Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
      val canCoalesce = Output(Vec(config.numLanes, Bool()))
    })
  })

  io := DontCare

  // Combinational logic to drive output from window contents.
  // The leader lanes only compare their heads against all entries of the
  // follower lanes.
  val leaders = io.window.elts.map(_.head)
  val leadersValid = io.window.mask.map(_.asBools.head)

  def printQueueHeads = {
    leaders.zipWithIndex.foreach { case (head, i) =>
      printf(
        s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
        leadersValid(i),
        head.source,
        head.address
      )
    }
  }
  // when (leadersValid.reduce(_ || _)) {
  //   printQueueHeads
  // }

  val size = coalLogSize
  // NOTE: be careful with Scala integer overflow when addressWidth >= 32
  // Mask that clears the low `size` address bits, leaving the block address.
  val addrMask = (((1L << config.addressWidth) - 1) - ((1 << size) - 1)).U

  // Two requests match when both are valid, have the same op and fall into
  // the same `size`-aligned block.
  def canMatch(req0: Request, req0v: Bool, req1: Request, req1v: Bool): Bool = {
    (req0.op === req1.op) &&
    (req0v && req1v) &&
    ((req0.address & this.addrMask) === (req1.address & this.addrMask))
  }

  // Gives a 2-D table of Bools representing match at every queue entry,
  // for each lane (so 3-D in total).
  // dimensions: (leader lane, follower lane, follower entry)
  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
    (io.window.elts zip io.window.mask).map { case (followers, followerValids) =>
      // compare leader's head against follower's every queue entry
      (followers zip followerValids.asBools).map { case (follower, followerValid) =>
        canMatch(follower, followerValid, leader, leaderValid)
        // FIXME: disabling halving optimization because it does not give the
        // correct per-lane coalescable indication to the shift queue
        // // match leader to only followers at lanes >= leader idx
        // // this halves the number of comparators
        // if (followerIndex < leaderIndex) false.B
        // else canMatch(follower, followerValid, leader, leaderValid)
      }
    }
  }

  val matchCounts = matchTablePerLane.map(table =>
    table
      .map(PopCount(_)) // sum up each column
      .reduce(_ +& _)
  )
  // A count of 1 is only the leader matching itself, hence "> 1".
  val canCoalesce = matchCounts.map(_ > 1.U)

  // Elect the leader that has the most match counts.
  // TODO: potentially expensive: magnitude comparator
  def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
    matchCounts.zipWithIndex
      .map { case (c, i) => (c, i.U) }
      .reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
        (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
      }
      ._2
  }
  // Elect leader by choosing the smallest-index lane that has a valid
  // match, i.e. using priority encoder.
  def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = {
    PriorityEncoder(matchCounts.map(_ > 1.U))
  }
  val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)

  val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
  // matchTable for the chosen lane, but each column converted to bitflags,
  // i.e. Vec[UInt]
  val chosenMatches = VecInit(matchTablePerLane.map { table =>
    VecInit(table.map(VecInit(_).asUInt))
  })(chosenLeaderIdx)
  val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)

  // coverage calculation
  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth)
  // 2-D table flattened to 1-D
  val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
  val valids = chosenMatches.flatMap(_.asBools)
  // indicates for each word in the coalesced chunk whether it is accessed by
  // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
  // words in the coalesced data coming back will be accessed by some request
  // and we've reached 100% bandwidth utilization.
  val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { target =>
    (offsets zip valids)
      .map { case (offset, valid) => valid && (offset === target.U) }
      .reduce(_ || _)
  }

  // debug prints
  when(leadersValid.reduce(_ || _)) {
    matchCounts.zipWithIndex.foreach { case (count, i) =>
      printf(s"lane[${i}] matchCount = %d\n", count);
    }
    printf("chosenLeader = lane %d\n", chosenLeaderIdx)
    printf("chosenLeader matches = [ ")
    chosenMatches.foreach { m => printf("%d ", m) }
    printf("]\n")
    printf("chosenMatchCount = %d\n", chosenMatchCount)

    printf("hits = [ ")
    hits.foreach { m => printf("%d ", m) }
    printf("]\n")
  }

  io.results.leaderIdx := chosenLeaderIdx
  io.results.baseAddr := chosenLeader.address & addrMask
  io.results.matchOH := chosenMatches
  io.results.matchCount := chosenMatchCount
  io.results.coverageHits := PopCount(hits)
  io.results.canCoalesce := canCoalesce
}

// Combinational logic that generates a coalesced request given a request
// window, and a selection of possible coalesced sizes. May utilize multiple
// MonoCoalescers and apply size-choosing policy to determine the final
// coalesced request out of all possible combinations.
//
// Software model: coalescer.py
class MultiCoalescer(
    config: CoalescerConfig,
    queueT: CoalShiftQueue[NonCoalescedRequest],
    coalReqT: Request
) extends Module {
  val invalidateT = Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))
  val io = IO(new Bundle {
    // coalescing window, connected to the contents of the request queues
    val window = Input(queueT.io.cloneType)
    // generated coalesced request
    val coalReq = DecoupledIO(coalReqT.cloneType)
    // invalidate signals going into each request queue's head. Lanes with
    // high invalidate bits are what became coalesced into the new request.
    val invalidate = Output(invalidateT)
    // whether a lane is coalescable. This is used to output non-coalescable
    // lanes to the arbiter so they can be flushed to downstream.
    val coalescable = Output(Vec(config.numLanes, Bool()))
  })

  // One MonoCoalescer per configured size, all looking at the same window.
  val coalescers = config.coalLogSizes.map(size =>
    Module(new MonoCoalescer(config, size, queueT))
  )
  coalescers.foreach(_.io.window := io.window)

  // Scales a per-size value up to the maximum coalescing size.
  def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
    (valPerSize zip config.coalLogSizes).map { case (hits, size) =>
      (hits << (config.maxCoalLogSize - size).U).asUInt
    }
  }

  // Index of the largest element of `x`.
  def argMax(x: Seq[UInt]): UInt = {
    x.zipWithIndex.map { case (a, b) => (a, b.U) }.reduce[(UInt, UInt)] {
      case ((a, i), (b, j)) =>
        (Mux(a > b, a, b), Mux(a > b, i, j)) // > instead of >= here; want to use largest size
    }._2
  }

  // normalize to maximum coalescing size so that we can do fair comparisons
  // between coalescing results of different sizes
  val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount))
  val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits))

  val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
  val chosenValid = Wire(Bool())
  // minimum 25% coverage
  val minCoverage =
    1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))

  // NOTE(review): the threshold below is applied to the *normalized* match
  // count. For a size smaller than maxCoalLogSize a raw count of 1 (leader
  // matching only itself) is shifted to >= 2 and would pass; this looks
  // harmless with a single configured size -- confirm before adding more.
  //
  // when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
  //   chosenSizeIdx := argMax(normalizedHits)
  //   chosenValid := true.B
  //   printf("coalescing success by coverage policy\n")
  // }.else
  when(normalizedMatches.map(_ > 1.U).reduce(_ || _)) {
    chosenSizeIdx := argMax(normalizedMatches)
    chosenValid := true.B
    printf("coalescing success by matches policy\n")
  }.otherwise {
    chosenSizeIdx := DontCare
    chosenValid := false.B
  }

  def debugPolicyPrint() = {
    printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount)
    printf("normalizedMatches[0]=%d\n", normalizedMatches(0))
    printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits)
    printf("normalizedHits[0]=%d\n", normalizedHits(0))
    printf("minCoverage=%d\n", minCoverage.U)
  }

  // create coalesced request
  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx)
  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx)

  // flatten requests and matches
  val flatReqs = io.window.elts.flatten
  val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)

  // check for word alignment in addresses
  // Every entry that is merged into the coalesced request must be
  // word-aligned, so the per-entry checks are ANDed together (an OR would
  // pass as soon as a single entry is fine). The check is gated by
  // chosenValid because the match flags are don't-care otherwise.
  val matchedEntriesAligned = flatReqs.zip(flatMatches).map { case (req, m) =>
    (!m) || (req.address(config.wordSizeWidth - 1, 0) === 0.U)
  }
  assert(
    (!chosenValid) || matchedEntriesAligned.reduce(_ && _),
    "one or more addresses used for coalescing is not word-aligned"
  )

  // note: this is word-level coalescing. if finer granularity is needed, need to modify code
  val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt
  val maxWords = 1 << (config.maxCoalLogSize - config.wordSizeWidth)
  val addrMask = Wire(UInt(config.maxCoalLogSize.W))
  addrMask := (1.U << chosenSize).asUInt - 1.U

  val data = Wire(Vec(maxWords, UInt((config.wordSizeInBytes * 8).W)))
  val mask = Wire(Vec(maxWords, UInt(config.wordSizeInBytes.W)))

  // For every word slot of the coalesced request, pick the data/mask of a
  // matched request whose word offset equals the slot index.
  for (i <- 0 until maxWords) {
    val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
      // note: ANDing against addrMask is to conform to active byte lanes requirements
      // if aligning to LSB suffices, we should add the bitwise AND back
      m && ((req.address(
        config.maxCoalLogSize - 1,
        config.wordSizeWidth
      ) /* & addrMask*/ ) === i.U)
    }
    // TODO: SW uses priority encoder, not sure about behavior of MuxCase
    data(i) := MuxCase(
      DontCare,
      flatReqs.zip(sel).map { case (req, s) => s -> req.data }
    )
    mask(i) := MuxCase(
      0.U,
      flatReqs.zip(sel).map { case (req, s) => s -> req.mask }
    )
  }

  val coalesceValid = chosenValid

  // setting source is deferred, because in order to do proper source ID
  // generation we also have to look at the responses coming back, which
  // is easier to do at the toplevel.
  io.coalReq.bits.source := DontCare
  io.coalReq.bits.mask := mask.asUInt
  io.coalReq.bits.data := data.asUInt
  io.coalReq.bits.size := chosenSize
  io.coalReq.bits.address := chosenBundle.baseAddr
  io.coalReq.bits.op := io.window.elts(chosenBundle.leaderIdx).head.op
  io.coalReq.valid := coalesceValid

  io.invalidate.bits := chosenBundle.matchOH
  io.invalidate.valid := io.coalReq.fire // invalidate only when fire

  // A lane is coalescable if any of the per-size coalescers says so.
  io.coalescable := coalescers
    .map(_.io.results.canCoalesce.asUInt)
    .reduce(_ | _)
    .asBools

  dontTouch(io.invalidate) // debug

  // Overrides the outputs so that nothing is ever coalesced.
  def disable = {
    io.coalReq.valid := false.B
    io.invalidate.valid := false.B
    io.coalescable.foreach { _ := false.B }
  }
  if (!config.enable) disable
}

// Attaches a source ID to coalesced requests and reclaims IDs as responses
// come back. A request is only passed on while an ID is available.
class CoalescerSourceGen(
    config: CoalescerConfig,
    coalReqT: CoalescedRequest,
    respT: TLBundleD
) extends Module {
  val io = IO(new Bundle {
    val inReq = Flipped(Decoupled(coalReqT.cloneType))
    val outReq = Decoupled(coalReqT.cloneType)
    val inResp = Flipped(Decoupled(respT.cloneType))
  })
  val sourceGen = Module(
    new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds), ignoreInUse = false)
  )
  sourceGen.io.gen := io.inReq.fire // use up a source ID only when request is created
  sourceGen.io.reclaim.valid := io.inResp.valid
  sourceGen.io.reclaim.bits := io.inResp.bits.source
  io.inResp.ready := true.B // should be always ready to reclaim old ID
  // TODO: make sourceGen.io.reclaim Decoupled?

  io.outReq <> io.inReq
  // overwrite bits affected by sourcegen backpressure
  io.outReq.valid := io.inReq.valid && sourceGen.io.id.valid
  // Backpressure has to reach the producer as well: without this the input
  // could fire (and use up an ID) in a cycle where outReq is not valid, and
  // the request would be lost.
  io.inReq.ready := io.outReq.ready && sourceGen.io.id.valid
  io.outReq.bits.source := sourceGen.io.id.bits
}

class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
    extends LazyModuleImp(outer) {
  require(
    outer.cpuNode.in.length == config.numLanes,
    s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
      s"config.numLanes (${config.numLanes})"
  )
  require(
    outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
    s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
      s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})"
  )
  require(
    outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
    s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
      s"mismatch with config.addressWidth (${config.addressWidth})"
  )
  require(
    config.maxCoalLogSize <= config.dataBusWidth,
    "multi-beat coalesced reads/writes are currently not supported"
  )

  val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
  val nonCoalReqT = new NonCoalescedRequest(config)
  val reqQueues = Module(
    new CoalShiftQueue(nonCoalReqT, config.queueDepth, config)
  )

  val coalReqT = new CoalescedRequest(config)
  val coalescer = Module(new MultiCoalescer(config, reqQueues, coalReqT))
  coalescer.io.window := reqQueues.io
  reqQueues.io.coalescable := coalescer.io.coalescable
  reqQueues.io.invalidate := coalescer.io.invalidate

  val uncoalescer = Module(new Uncoalescer(config, nonCoalReqT, coalReqT))

  // ===========================================================================
  // Request flow
  // ===========================================================================
  //
  // Override IdentityNode implementation so that we can instantiate
  // queues between input and output edges to buffer requests and responses.
  // See IdentityNode definition in `diplomacy/Nodes.scala`.
// (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach { case (((tlIn, _), (tlOut, edgeOut)), lane) => // Request queue val req = Wire(nonCoalReqT) req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) req.source := tlIn.a.bits.source req.address := tlIn.a.bits.address req.data := tlIn.a.bits.data req.size := tlIn.a.bits.size // FIXME: req.data is still containing TL-aligned data. This is fine if // we're simply passing through this data out the other end, but not if // the outgoing TL edge (tlOut) has different data width from the incoming // edge (tlIn). Possible TODO to only store the relevant portion of the // data, at the cost of re-aligning at the outgoing end. req.mask := tlIn.a.bits.mask val enq = reqQueues.io.queue.enq(lane) val deq = reqQueues.io.queue.deq(lane) enq.valid := tlIn.a.valid enq.bits := req // Respect arbiter and uncoalescer backpressure deq.ready := tlOut.a.ready && uncoalescer.io.coalReq.ready // Stall upstream core or memtrace driver when shiftqueue is not ready tlIn.a.ready := enq.ready tlOut.a.valid := deq.valid tlOut.a.bits := deq.bits.toTLA(edgeOut) // debug // when (tlIn.a.valid) { // TLPrintf(s"tlIn(${lane}).a", // tlIn.a.bits.address, // tlIn.a.bits.size, // tlIn.a.bits.mask, // TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode), // tlIn.a.bits.data, // 0.U // ) // } // when (tlOut.a.valid) { // TLPrintf(s"tlOut(${lane}).a", // tlOut.a.bits.address, // tlOut.a.bits.size, // tlOut.a.bits.mask, // TLUtils.AOpcodeIsStore(tlOut.a.bits.opcode), // tlOut.a.bits.data, // 0.U // ) // } } val (tlCoal, edgeCoal) = outer.coalescerNode.out.head // The request coming out of MultiCoalescer still needs to go through source // ID generation. // We pull the sourcegen part out of MultiCoalescer to a separate Module to // reduce IO bloat in the coalescer and top-level clutter. 
// // The overall flow looks like: // ┌────────────────┐ ┌─────────────────────┐ ┌────────────────────┐ // │ CoalShiftQueue ├─┤ Mono/MultiCoalescer ├─┤ CoalescerSourceGen ├── TileLink req // └────────────────┘ └─────────────────────┘ └────────────────────┘ // ^ // ┌────────────┐ ┌─────────────┐ │ // │ RespQueues ├─┤ Uncoalescer ├──┴────── TileLink resp // └────────────┘ └─────────────┘ // val coalSourceGen = Module(new CoalescerSourceGen(config, coalReqT, tlCoal.d.bits)) coalSourceGen.io.inReq <> coalescer.io.coalReq coalSourceGen.io.inResp <> tlCoal.d // This is the final coalesced request. val coalReq = coalSourceGen.io.outReq dontTouch(coalReq) tlCoal.a.valid := coalReq.valid tlCoal.a.bits := coalReq.bits.toTLA(edgeCoal) coalescer.io.coalReq.ready := tlCoal.a.ready tlCoal.b.ready := true.B tlCoal.c.valid := false.B // tlCoal.d.ready should be connected to uncoalescer's ready, done below. tlCoal.e.valid := false.B require( tlCoal.params.sourceBits == log2Ceil(config.numNewSrcIds), s"tlCoal param `sourceBits` (${tlCoal.params.sourceBits}) mismatches coalescer constant" + s" (${log2Ceil(config.numNewSrcIds)})" ) require( tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8, s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant" + s" (${(1 << config.dataBusWidth) * 8})" ) // =========================================================================== // Response flow // =========================================================================== // // Connect uncoalescer output and noncoalesced response ports to the response // queues. // The maximum number of requests from a single lane that can go into a // coalesced request. 
val numPerLaneReqs = config.queueDepth // FIXME: no need to contain maxCoalLogSize data val respQueueEntryT = new Response( oldSourceWidth, log2Ceil(config.maxCoalLogSize), (1 << config.maxCoalLogSize) * 8 ) val respQueues = Seq.tabulate(config.numLanes) { _ => Module( new MultiPortQueue( respQueueEntryT, // enq_lanes = 1 + M, where 1 is the response for the original per-lane // requests that didn't get coalesced, and M is the maximum number of // single-lane requests that can go into a coalesced request. // (`numPerLaneReqs`). // TODO: potentially expensive, because this generates more FFs. // Rather than enqueueing all responses in a single cycle, consider // enqueueing one by one (at the cost of possibly stalling downstream). 1 + numPerLaneReqs, // deq_lanes = 1 because we're serializing all responses to 1 port that // goes back to the core. 1, // lanes. Has to be at least max(enq_lanes, deq_lanes) 1 + numPerLaneReqs, // Depth of each lane queue. // XXX queue depth is set to an arbitrarily high value that doesn't // make queue block up in the middle of the simulation. Ideally there // should be a more logical way to set this, or we should handle // response queue blocking. config.respQueueDepth ) ) } val respQueueNoncoalPort = 0 val respQueueUncoalPortOffset = 1 (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach { case (((tlIn, edgeIn), (tlOut, _)), lane) => // Response queue // // This queue will serialize non-coalesced responses along with // coalesced responses and serve them back to the core side. val respQueue = respQueues(lane) val resp = Wire(respQueueEntryT) resp.fromTLD(tlOut.d.bits) // Queue up responses that didn't get coalesced originally ("noncoalesced" responses). // Coalesced (but uncoalesced back) responses will also be enqueued into the same queue. 
assert( respQueue.io.enq(respQueueNoncoalPort).ready, "respQueue: enq port for noncoalesced response is blocked" ) respQueue.io.enq(respQueueNoncoalPort).valid := tlOut.d.valid respQueue.io.enq(respQueueNoncoalPort).bits := resp // TODO: deq.ready should respect upstream ready respQueue.io.deq(respQueueNoncoalPort).ready := true.B tlIn.d.valid := respQueue.io.deq(respQueueNoncoalPort).valid tlIn.d.bits := respQueue.io.deq(respQueueNoncoalPort).bits.toTLD(edgeIn) // Debug only val inflightCounter = RegInit(UInt(32.W), 0.U) when(tlOut.a.valid) { // don't inc/dec on simultaneous req/resp when(!tlOut.d.valid) { inflightCounter := inflightCounter + 1.U } }.elsewhen(tlOut.d.valid) { inflightCounter := inflightCounter - 1.U } dontTouch(inflightCounter) dontTouch(tlIn.a) dontTouch(tlIn.d) dontTouch(tlOut.a) dontTouch(tlOut.d) } // Uncoalescer input // // connect coalesced request to be recorded in the uncoalescer table uncoalescer.io.coalReq <> coalReq uncoalescer.io.invalidate := coalescer.io.invalidate uncoalescer.io.windowElts := reqQueues.io.elts // coalesced response to be used to look up the uncoalescer table uncoalescer.io.coalResp.valid := tlCoal.d.valid uncoalescer.io.coalResp.bits.fromTLD(tlCoal.d.bits) // Uncoalescer output // // Connect uncoalescer results back into response queue (respQueues zip uncoalescer.io.respQueueIO).foreach { case (q, uncoalEnqs) => require(q.io.enq.length == config.queueDepth + respQueueUncoalPortOffset, s"wrong number of enq ports for MultiPort response queue") // slice the ports reserved for uncoalesced response val qUncoalEnqs = q.io.enq.slice(respQueueUncoalPortOffset, q.io.enq.length) (qUncoalEnqs zip uncoalEnqs).foreach { case (enq, uncoalEnq) => { enq <> uncoalEnq } } } // uncoalescer backpressure tlCoal.d.ready := uncoalescer.io.coalResp.ready // Debug dontTouch(coalescer.io.coalReq) val coalRespData = tlCoal.d.bits.data dontTouch(coalRespData) dontTouch(tlCoal.a) dontTouch(tlCoal.d) } class Uncoalescer( config: 
CoalescerConfig, nonCoalReqT: NonCoalescedRequest, coalReqT: CoalescedRequest, ) extends Module { val inflightTable = Module(new InflightCoalReqTable(config)) val io = IO(new Bundle { // generated coalesced request, connected to the output of the coalescer. val coalReq = Flipped(DecoupledIO(coalReqT.cloneType)) // invalidate signal coming out of coalescer. val invalidate = Input(Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))) // coalescing window, connected to the contents of the request queues. // Uncoalescer looks at the queue entries that got coalesced into `coalReq` // in order to record which lanes this coalReq originally came from. // We only care about window.elts because the coalescer would have made // sure it only looked at the valid entries. // TODO: duplicate type construction val windowElts = Input(Vec(config.numLanes, Vec(config.queueDepth, nonCoalReqT))) val coalResp = Flipped(Decoupled(new CoalescedResponse(config))) val respQueueIO = Vec(config.numLanes, Vec(config.queueDepth, Decoupled(new NonCoalescedResponse(config))) ) }) // If inflight table is full, we cannot accept new requests to record them. // This might happen when we sent out many requests and exhausted all source // IDs, but they haven't come back yet. io.coalReq.ready := inflightTable.io.enq.ready // Construct a new entry for the inflight table using generated coalesced request def generateInflightTableEntry: InflightCoalReqTableEntry = { val newEntry = Wire(inflightTable.entryT) newEntry.source := io.coalReq.bits.source // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the // coalescer to every (numLanes * queueDepth) entry in the inflight table. 
(newEntry.lanes zip io.invalidate.bits).zipWithIndex .foreach { case ((laneEntry, laneInv), lane) => (laneEntry.reqs zip laneInv.asBools).zipWithIndex .foreach { case ((reqEntry, inv), i) => val req = io.windowElts(lane)(i) when((io.invalidate.valid && inv)) { printf( s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source ) } reqEntry.valid := (io.invalidate.valid && inv) reqEntry.source := req.source reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth) reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size) // TODO: load/store op } } assert( !((io.coalReq.valid === true.B) && (io.coalResp.valid === true.B) && (newEntry.source === io.coalResp.bits.source)), "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled" ) dontTouch(newEntry) newEntry } inflightTable.io.enq.valid := io.coalReq.valid inflightTable.io.enq.bits := generateInflightTableEntry // Look up the table with incoming coalesced responses inflightTable.io.lookup.ready := io.coalResp.valid inflightTable.io.lookupSourceId := io.coalResp.bits.source io.coalResp.ready := true.B // FIXME, see sw model implementation // Un-coalescing logic // def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = { assert(logSize === 2.U, "currently only supporting 4-byte accesses. TODO") // sizeInBits should be simulation-only construct val sizeInBits = ((1.U << logSize) << 3.U).asUInt assert( (dataWidth > 0).B && (dataWidth.U % sizeInBits === 0.U), s"coalesced data width ($dataWidth) not evenly divisible by core req size ($sizeInBits)" ) val numChunks = dataWidth / 32 val chunks = Wire(Vec(numChunks, UInt(32.W))) val offsets = (0 until numChunks) (chunks zip offsets).foreach { case (c, o) => // FIXME: whether to take the offset from MSB or LSB depends on // endianness. 
Right now we're assuming little endian c := data(32 * (o + 1) - 1, 32 * o) // If taking from MSB: // c := (data >> (dataWidth - (o + 1) * 32)) & sizeMask } chunks(offset) // MUX } // Un-coalesce responses back to individual lanes // Connect uncoalesced results back into each lane's response queue val foundRow = inflightTable.io.lookup.bits (foundRow.lanes zip io.respQueueIO).zipWithIndex.foreach { case ((foundLane, ioEnqs), lane) => foundLane.reqs.zipWithIndex.foreach { case (foundReq, depth) => val ioEnq = ioEnqs(depth) // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream // cache. This should ideally not happen though. assert( ioEnq.ready, s"respQueue: enq port for ${depth}-th uncoalesced response is blocked for lane ${lane}" ) // TODO: spatial-only coalescing: only looking at 0th srcId entry ioEnq.valid := false.B ioEnq.bits := DontCare // debug // when (resp.valid) { // printf(s"${i}-th uncoalesced response came back from lane ${lane}\n") // } // dontTouch(q.io.enq(respQueueCoalPortOffset)) when(inflightTable.io.lookup.valid && foundReq.valid) { ioEnq.valid := foundReq.valid ioEnq.bits.source := foundReq.source val logSize = foundRow.sizeEnumT.enumToLogSize(foundReq.sizeEnum) ioEnq.bits.size := logSize ioEnq.bits.data := getCoalescedDataChunk( io.coalResp.bits.data, io.coalResp.bits.data.getWidth, foundReq.offset, logSize ) } } } } // InflightCoalReqTable is a table structure that records // for each unanswered coalesced request which lane the request originated // from, what their original TileLink sourceId were, etc. We use this info to // split the coalesced response back to individual per-lane responses with the // right metadata. 
class InflightCoalReqTable(config: CoalescerConfig) extends Module {
  // Width of the word offset of a request inside a max-sized coalesced block.
  val offsetBits = config.maxCoalLogSize - config.wordSizeWidth // assumes word offset
  require(
    offsetBits >= 0,
    s"maxCoalLogSize (${config.maxCoalLogSize}) must not be smaller than " +
      s"log2(wordSizeInBytes) (${config.wordSizeWidth})"
  )

  val entries = config.numNewSrcIds
  // Width of the original, per-lane source IDs stored in each row.
  val sourceWidth = log2Ceil(config.numOldSrcIds)
  // Width of the coalesced source ID that indexes the table. The table has
  // `numNewSrcIds` rows, so the key must be sized from that number rather than
  // from `numOldSrcIds`; otherwise keys are truncated whenever
  // numNewSrcIds > numOldSrcIds.
  val coalSourceWidth = log2Ceil(config.numNewSrcIds)

  val entryT = new InflightCoalReqTableEntry(
    config.numLanes,
    config.queueDepth,
    sourceWidth,
    // Stored offsets are word offsets inside the coalesced block, which need
    // exactly `offsetBits` bits (the value printed below).
    offsetBits,
    config.sizeEnum,
    Some(coalSourceWidth)
  )

  println(s"=========== table sourceWidth: ${sourceWidth}")
  println(s"=========== table offsetBits: ${offsetBits}")
  println(s"=========== table sizeEnumBits: ${entryT.sizeEnumT.getWidth}")

  val io = IO(new Bundle {
    val enq = Flipped(Decoupled(entryT))
    // TODO: return actual stuff
    val lookup = Decoupled(entryT)
    // TODO: put this inside decoupledIO
    val lookupSourceId = Input(UInt(coalSourceWidth.W))
  })

  val table = Mem(
    entries,
    new Bundle {
      val valid = Bool()
      val bits = entryT.cloneType
    }
  )

  // Clear every row while reset is asserted.
  when(reset.asBool) {
    (0 until entries).foreach { i =>
      table(i).valid := false.B
      table(i).bits.lanes.foreach { l =>
        l.reqs.foreach { r =>
          r.valid := false.B
          r.source := 0.U
          r.offset := 0.U
          r.sizeEnum := config.sizeEnum.INVALID
        }
      }
    }
  }

  // The table is full only when every row is occupied.
  val full = Wire(Bool())
  full := (0 until entries).map(table(_).valid).reduce(_ && _)
  dontTouch(full)

  // Enqueue logic
  io.enq.ready := !full
  val enqFire = io.enq.ready && io.enq.valid
  when(enqFire) {
    // TODO: handle enqueueing and looking up the same entry in the same cycle?
    val entryToWrite = table(io.enq.bits.source)
    assert(
      !entryToWrite.valid,
      "tried to enqueue to an already occupied entry"
    )
    entryToWrite.valid := true.B
    entryToWrite.bits := io.enq.bits
  }

  // Lookup logic
  io.lookup.valid := table(io.lookupSourceId).valid
  io.lookup.bits := table(io.lookupSourceId).bits
  // Dequeue as soon as lookup succeeds
  when(io.lookup.fire) {
    table(io.lookupSourceId).valid := false.B
  }

  dontTouch(io.lookup)
}

/** One row of the [[InflightCoalReqTable]].
  *
  * @param numLanes        number of lanes recorded per row
  * @param numPerLaneReqs  maximum number of requests from a single lane that
  *                        can get coalesced into a single request
  * @param sourceWidth     width of the original per-lane source IDs
  * @param offsetBits      width of the per-request offset field
  * @param sizeEnumT       enum type used to encode the per-request size
  * @param coalSourceWidth width of the coalesced source ID (`source`); when
  *                        omitted it falls back to `sourceWidth`, which is the
  *                        previous behaviour
  */
class InflightCoalReqTableEntry(
    val numLanes: Int,
    // Maximum number of requests from a single lane that can get coalesced into a single request
    val numPerLaneReqs: Int,
    val sourceWidth: Int,
    val offsetBits: Int,
    val sizeEnumT: InFlightTableSizeEnum,
    val coalSourceWidth: Option[Int] = None
) extends Bundle {
  class PerCoreReq extends Bundle {
    val valid = Bool() // FIXME: delete this
    // original (per-lane) source ID of the request
    val source = UInt(sourceWidth.W)
    val offset = UInt(offsetBits.W)
    val sizeEnum = sizeEnumT()
  }
  class PerLane extends Bundle {
    val reqs = Vec(numPerLaneReqs, new PerCoreReq)
  }
  // sourceId of the coalesced response that just came back.  This will be the
  // key that queries the table.
  val source = UInt(coalSourceWidth.getOrElse(sourceWidth).W)
  val lanes = Vec(numLanes, new PerLane)
}

object TLUtils {
  /** True when the A-channel `opcode` is PutFullData, false for Get.
    *
    * Any other opcode trips the hardware assertion. The assertion is emitted
    * in whatever `when` scope the caller is in; it is not qualified by a valid
    * signal here.
    */
  def AOpcodeIsStore(opcode: UInt): Bool = {
    // 0: PutFullData, 1: PutPartialData, 4: Get
    assert(
      opcode === TLMessages.PutFullData || opcode === TLMessages.Get,
      "unhandled TL A opcode found"
    )
    opcode === TLMessages.PutFullData
  }

  /** True when the D-channel `opcode` is AccessAck (store response), false for
    * AccessAckData. Any other opcode trips the hardware assertion.
    */
  def DOpcodeIsStore(opcode: UInt): Bool = {
    assert(
      opcode === TLMessages.AccessAck || opcode === TLMessages.AccessAckData,
      "unhandled TL D opcode found"
    )
    opcode === TLMessages.AccessAck
  }
}

// `traceHasSource` is true if the input trace file has an additional source
// ID column.  This is useful for using the output trace file generated by
// MemTraceLogger as the driver.
class MemTraceDriver(
    config: CoalescerConfig,
    filename: String,
    traceHasSource: Boolean = false
)(implicit p: Parameters)
    extends LazyModule {
  // Create N client nodes together
  val laneNodes = Seq.tabulate(config.numLanes) { i =>
    val clientParam = Seq(
      TLMasterParameters.v1(
        name = "MemTraceDriver" + i.toString,
        sourceId = IdRange(0, config.numOldSrcIds)
        // visibility = Seq(AddressSet(0x0000, 0xffffff))
      )
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(clientParam)))
  }

  // Combine N outgoing client node into 1 identity node for diplomatic
  // connection.
  val node = TLIdentityNode()
  laneNodes.foreach { l => node := l }

  lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource)
}

// Fields shared by every trace-line-shaped bundle (per-lane or flattened).
trait HasTraceLine {
  val valid: UInt
  val source: UInt
  val address: UInt
  val is_store: UInt
  val size: UInt
  val data: UInt
}

// Used for both request and response.  Response had address set to 0
// NOTE: these widths have to agree with what's hardcoded in Verilog.
class TraceLine extends Bundle with HasTraceLine {
  val valid = Bool()
  val source = UInt(32.W)
  val address = UInt(64.W) // FIXME: in Verilog this is the same as data width
  val is_store = Bool()
  val size = UInt(8.W) // this is log2(bytesize) as in TL A bundle
  val data = UInt(64.W)
}

/** Module implementation of [[MemTraceDriver]].
  *
  * Reads one trace line per lane from the `SimMemTrace` black box, buffers
  * the lines in per-lane queues and converts them into TileLink Get/Put
  * requests on the lane client nodes.
  */
class MemTraceDriverImp(
    outer: MemTraceDriver,
    config: CoalescerConfig,
    filename: String,
    traceHasSource: Boolean
) extends LazyModuleImp(outer)
    with UnitTestModule {
  // Current cycle mark to read from trace
  val traceReadCycle = RegInit(1.U(64.W))

  // A decoupling queue to handle backpressure from downstream.  We let the
  // downstream take requests from the queue individually for each lane,
  // but do synchronized enqueue whenever all lane queue is ready to prevent
  // drifts between the lane.
  val reqQueues = Seq.fill(config.numLanes)(Module(new Queue(new TraceLine, 2)))
  // Are we safe to read the next warp?
  val reqQueueAllReady = reqQueues.map(_.io.enq.ready).reduce(_ && _)

  val sim = Module(new SimMemTrace(filename, config.numLanes, traceHasSource))
  sim.io.clock := clock
  sim.io.reset := reset.asBool
  // 'sim.io.trace_ready.ready' is a ready signal going into the DPI sim,
  // indicating this Chisel module is ready to read the next line.
  sim.io.trace_read.ready := reqQueueAllReady
  sim.io.trace_read.cycle := traceReadCycle

  // Read output from Verilog BlackBox
  // Split output of SimMemTrace, which is flattened across all lanes, back to each lane's.
  val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
  val addrW = laneReqs(0).address.getWidth
  val sizeW = laneReqs(0).size.getWidth
  val dataW = laneReqs(0).data.getWidth
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.trace_read.valid(i)
    req.source := 0.U // driver trace doesn't contain source id
    req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
    req.is_store := sim.io.trace_read.is_store(i)
    req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
    req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
  }

  // Not all fire because trace cycle has to advance even when there is no valid
  // line in the trace.
  when(reqQueueAllReady) {
    traceReadCycle := traceReadCycle + 1.U
  }

  // Enqueue traces to the request queue
  (reqQueues zip laneReqs).foreach { case (reqQ, req) =>
    // Synchronized enqueue
    reqQ.io.enq.valid := reqQueueAllReady && req.valid
    reqQ.io.enq.bits := req // FIXME duplicate valid
  }

  // Issue here is that Vortex mem range is not within Chipyard Mem range
  // In default setting, all mem-req for program data must be within
  // 0X80000000 -> 0X90000000
  def hashToValidPhyAddr(addr: UInt): UInt = {
    Cat(8.U(4.W), addr(27, 0))
  }

  val sourceGens = Seq.fill(config.numLanes)(
    Module(
      new RoundRobinSourceGenerator(
        log2Ceil(config.numOldSrcIds),
        ignoreInUse = false
      )
    )
  )
  // Advance source ID for all lanes in synchrony
  val syncedSourceGenValid = sourceGens.map(_.io.id.valid).reduce(_ && _)

  // log2 of the word size; shared by the sub-word test, the address alignment
  // and the padded request size below.
  val wordLogSize = log2Ceil(config.wordSizeInBytes)

  // Take requests off of the queue and generate TL requests
  (outer.laneNodes zip reqQueues).zipWithIndex.foreach { case ((node, reqQ), lane) =>
    val (tlOut, edge) = node.out(0)

    val req = reqQ.io.deq.bits
    // backpressure from downstream propagates into the queue
    reqQ.io.deq.ready := tlOut.a.ready && syncedSourceGenValid

    // Core only makes accesses of granularity larger than a word, so we want
    // the trace driver to act so as well.
    // That means if req.size is smaller than word size, we need to pad data
    // with zeros to generate a word-size request, and set mask accordingly.
    val offsetInWord = req.address % config.wordSizeInBytes.U
    val subword = req.size < wordLogSize.U

    // `mask` is currently unused
    val mask = Wire(UInt(config.wordSizeInBytes.W))
    val wordData = Wire(UInt((config.wordSizeInBytes * 8 * 2).W))
    val sizeInBytes = Wire(UInt((sizeW + 1).W))
    sizeInBytes := (1.U) << req.size
    // The full-word default needs an explicit width: a bare `~0.U` is a single
    // bit and would only enable byte lane 0.
    mask := Mux(
      subword,
      (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord,
      ~0.U(config.wordSizeInBytes.W)
    )
    wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
    val wordAlignedAddress =
      req.address & ~((1 << wordLogSize) - 1).U(addrW.W)
    // Sub-word requests are widened to exactly one word.
    val wordAlignedSize = Mux(subword, wordLogSize.U, req.size)

    val sourceGen = sourceGens(lane)
    sourceGen.io.gen := tlOut.a.fire
    // assert(sourceGen.io.id.valid)
    sourceGen.io.reclaim.valid := tlOut.d.valid
    sourceGen.io.reclaim.bits := tlOut.d.bits.source

    val (plegal, pbits) = edge.Put(
      fromSource = sourceGen.io.id.bits,
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize, // trace line already holds log2(size)
      // data should be aligned to beatBytes
      data =
        (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
    )
    val (glegal, gbits) = edge.Get(
      fromSource = sourceGen.io.id.bits,
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize
    )
    val legal = Mux(req.is_store, plegal, glegal)
    val bits = Mux(req.is_store, pbits, gbits)

    tlOut.a.valid := reqQ.io.deq.valid && syncedSourceGenValid
    when(tlOut.a.valid) {
      assert(legal, "illegal TL req gen")
    }
    tlOut.a.bits := bits
    tlOut.b.ready := true.B
    tlOut.c.valid := false.B
    tlOut.d.ready := true.B
    tlOut.e.valid := false.B

    // debug
    dontTouch(reqQ.io.enq)
    dontTouch(reqQ.io.deq)
    when(tlOut.a.valid) {
      TLPrintf(
        "MemTraceDriver",
        tlOut.a.bits.source,
        tlOut.a.bits.address,
        tlOut.a.bits.size,
        tlOut.a.bits.mask,
        req.is_store,
        tlOut.a.bits.data,
        req.data
      )
    }
    dontTouch(tlOut.a)
    dontTouch(tlOut.d)
  }

  // Give some slack time after trace EOF to the downstream system to make sure
  // we receive all (hopefully) outstanding responses back.
  val finishCounter = RegInit(200.U(64.W))
  when(sim.io.trace_read.finished) {
    finishCounter := finishCounter - 1.U
  }
  io.finished := (finishCounter === 0.U)
  when(io.finished) {
    assert(
      false.B,
      "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)"
    )
  }
}

// Black-box wrapper around the Verilog/DPI trace reader. All per-lane fields
// are flattened into single wide ports.
class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
    extends BlackBox(
      Map(
        "FILENAME" -> filename,
        "NUM_LANES" -> numLanes,
        "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)
      )
    )
    with HasBlackBoxResource {
  val traceLineT = new TraceLine
  val addrW = traceLineT.address.getWidth
  val sizeW = traceLineT.size.getWidth
  val dataW = traceLineT.data.getWidth

  val io = IO(new Bundle {
    val clock = Input(Clock())
    val reset = Input(Bool())

    // These names have to match declarations in the Verilog code, eg.
    // trace_read_address.
    val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source
      val ready = Input(Bool())
      val valid = Output(UInt(numLanes.W))
      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
      // single wide 1D array.
      // TODO: assumes 64-bit address.
      val cycle = Input(UInt(64.W))
      val address = Output(UInt((addrW * numLanes).W))
      val is_store = Output(UInt(numLanes.W))
      val size = Output(UInt((sizeW * numLanes).W))
      val data = Output(UInt((dataW * numLanes).W))
      val finished = Output(Bool())
    }
  })

  addResource("/vsrc/SimMemTrace.v")
  addResource("/csrc/SimMemTrace.cc")
  addResource("/csrc/SimMemTrace.h")
}

/** Pass-through TileLink node that snoops A and D traffic on every edge,
  * writes it to trace files through `SimMemTraceLogger`, and keeps running
  * request/response counters.
  */
class MemTraceLogger(
    numLanes: Int,
    // base filename for the generated trace files. full filename will be
    // suffixed depending on `reqEnable`/`respEnable`/`loggerName`.
    filename: String,
    reqEnable: Boolean = true,
    respEnable: Boolean = true,
    // filename suffix that is unique to this logger module.
    // This will be appended to the filename of the generated trace.
    // NOTE(review): the file name is built as s"${filename}.${loggerName}.req",
    // so the default value's leading dot yields "..logger" in the name.
    loggerName: String = ".logger"
)(implicit
    p: Parameters
) extends LazyModule {
  val node = TLIdentityNode()

  // val beatBytes = 8 // FIXME: hardcoded
  // val node = TLManagerNode(Seq.tabulate(numLanes) { _ =>
  //   TLSlavePortParameters.v1(
  //     Seq(
  //       TLSlaveParameters.v1(
  //         address = List(AddressSet(0x0000, 0xffffff)), // FIXME: hardcoded
  //         supportsGet = TransferSizes(1, beatBytes),
  //         supportsPutPartial = TransferSizes(1, beatBytes),
  //         supportsPutFull = TransferSizes(1, beatBytes)
  //       )
  //     ),
  //     beatBytes = beatBytes
  //   )
  // })

  // Copied from freechips.rocketchip.trailingZeros which only supports Scala
  // integers
  def trailingZeros(x: UInt): UInt = {
    Mux(x === 0.U, x.widthOption.get.U, Log2(x & -x))
  }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) {
    val io = IO(new Bundle {
      val numReqs = Output(UInt(64.W))
      val numResps = Output(UInt(64.W))
      val reqBytes = Output(UInt(64.W))
      val respBytes = Output(UInt(64.W))
    })

    val numReqs = RegInit(0.U(64.W))
    val numResps = RegInit(0.U(64.W))
    val reqBytes = RegInit(0.U(64.W))
    val respBytes = RegInit(0.U(64.W))
    io.numReqs := numReqs
    io.numResps := numResps
    io.reqBytes := reqBytes
    io.respBytes := respBytes

    // One black box per enabled direction.
    val simReq =
      if (reqEnable)
        Some(Module(new SimMemTraceLogger(false, s"${filename}.${loggerName}.req", numLanes)))
      else None
    val simResp =
      if (respEnable)
        Some(Module(new SimMemTraceLogger(true, s"${filename}.${loggerName}.resp", numLanes)))
      else None
    simReq.foreach { sim =>
      sim.io.clock := clock
      sim.io.reset := reset.asBool
    }
    simResp.foreach { sim =>
      sim.io.clock := clock
      sim.io.reset := reset.asBool
    }

    val laneReqs = Wire(Vec(numLanes, new TraceLine))
    val laneResps = Wire(Vec(numLanes, new TraceLine))

    // Elaboration-time check on Scala values.
    require(
      numLanes == node.in.length,
      "`numLanes` does not match the number of TL edges connected to the MemTraceLogger"
    )

    // snoop on the TileLink edges to log traffic
    ((node.in zip node.out) zip (laneReqs zip laneResps)).foreach {
      case (((tlIn, _), (tlOut, _)), (req, resp)) =>
        tlOut.a <> tlIn.a
        tlIn.d <> tlOut.d

        // requests on TL A channel
        //
        // Only log trace when fired, e.g. both upstream and downstream is ready
        // and transaction happened.
        req.valid := tlIn.a.fire
        req.size := tlIn.a.bits.size
        req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
        req.source := tlIn.a.bits.source
        // TL always carries the exact unaligned address that the client
        // originally requested, so no postprocessing required
        req.address := tlIn.a.bits.address

        when(req.valid) {
          TLPrintf(
            s"MemTraceLogger (${loggerName}:downstream)",
            tlIn.a.bits.source,
            tlIn.a.bits.address,
            tlIn.a.bits.size,
            tlIn.a.bits.mask,
            req.is_store,
            tlIn.a.bits.data,
            req.data
          )
        }

        // TL data
        //
        // When tlIn.a.bits.size is smaller than the data bus width, need to
        // figure out which byte lanes we actually accessed so that
        // we can write that to the memory trace.
        // See Section 4.5 Byte Lanes in spec 1.8.1

        // This assert only holds true for PutFullData and not PutPartialData,
        // where HIGH bits in the mask may not be contiguous.
        when(tlIn.a.valid) {
          assert(
            PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
            "mask HIGH popcount do not match the TL size. " +
              "Partial masks are not allowed for PutFull"
          )
        }
        val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
        val dataW = tlIn.params.dataBits
        // low (8 << size) bits set: keeps only the bytes that were accessed
        val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
        req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
        // when (req.valid) {
        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
        // }

        // responses on TL D channel
        //
        // Only log trace when fired, e.g. both upstream and downstream is ready
        // and transaction happened.
        resp.valid := tlOut.d.fire
        resp.size := tlOut.d.bits.size
        resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode)
        resp.source := tlOut.d.bits.source
        // NOTE: TL D channel doesn't carry address nor mask, so there's no easy
        // way to figure out which bytes the master actually use.  Since we
        // don't care too much about addresses in the trace anyway, just store
        // the entire bits.
        resp.address := 0.U
        resp.data := tlOut.d.bits.data
    }

    // stats
    val numReqsThisCycle =
      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
    val numRespsThisCycle =
      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
    val reqBytesThisCycle =
      laneReqs
        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
        .reduce { (b0, b1) => b0 + b1 }
    val respBytesThisCycle =
      laneResps
        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
        .reduce { (b0, b1) => b0 + b1 }
    numReqs := numReqs + numReqsThisCycle
    numResps := numResps + numRespsThisCycle
    reqBytes := reqBytes + reqBytesThisCycle
    respBytes := respBytes + respBytesThisCycle

    // Flatten per-lane signals to the Verilog blackbox input.
    //
    // Chisel doesn't allow partial assignment to a bitfield range of a wide
    // signal, so each field is gathered into a Vec (lane 0 in the low bits)
    // and converted to a UInt in one go.
    def flattenTrace(
        simIO: Bundle with HasTraceLine,
        perLane: Vec[TraceLine]
    ): Unit = {
      simIO.valid := VecInit(perLane.map(_.valid)).asUInt
      simIO.source := VecInit(perLane.map(_.source)).asUInt
      simIO.address := VecInit(perLane.map(_.address)).asUInt
      simIO.is_store := VecInit(perLane.map(_.is_store)).asUInt
      simIO.size := VecInit(perLane.map(_.size)).asUInt
      simIO.data := VecInit(perLane.map(_.data)).asUInt
    }

    simReq.foreach { sim =>
      flattenTrace(sim.io.trace_log, laneReqs)
      assert(
        sim.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
      )
    }
    simResp.foreach { sim =>
      flattenTrace(sim.io.trace_log, laneResps)
      assert(
        sim.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
      )
    }
  }
}

// MemTraceLogger is bidirectional, and `isResponse` is how the DPI module tells
// itself whether it's logging the request stream or the response stream.  This
// is necessary because we have to generate slightly different trace format
// depending on this, e.g. response trace will not contain an address column.
class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int)
    extends BlackBox(
      Map(
        "IS_RESPONSE" -> (if (isResponse) 1 else 0),
        "FILENAME" -> filename,
        "NUM_LANES" -> numLanes
      )
    )
    with HasBlackBoxResource {
  val traceLineT = new TraceLine
  val sourceW = traceLineT.source.getWidth
  val addrW = traceLineT.address.getWidth
  val sizeW = traceLineT.size.getWidth
  val dataW = traceLineT.data.getWidth

  val io = IO(new Bundle {
    val clock = Input(Clock())
    val reset = Input(Bool())

    val trace_log = new Bundle with HasTraceLine {
      val valid = Input(UInt(numLanes.W))
      val source = Input(UInt((sourceW * numLanes).W))
      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
      // single wide 1D array.
      // TODO: assumes 64-bit address.
      val address = Input(UInt((addrW * numLanes).W))
      val is_store = Input(UInt(numLanes.W))
      val size = Input(UInt((sizeW * numLanes).W))
      val data = Input(UInt((dataW * numLanes).W))
      val ready = Output(Bool())
    }
  })

  addResource("/vsrc/SimMemTraceLogger.v")
  addResource("/csrc/SimMemTraceLogger.cc")
  addResource("/csrc/SimMemTrace.h")
}

class TLPrintf {}

object TLPrintf {
  /** Emits one simulation log line describing a TileLink transaction.
    * `tlData` and `reqData` are only printed when `is_store` is set.
    */
  def apply(
      printer: String,
      source: UInt,
      address: UInt,
      size: UInt,
      mask: UInt,
      is_store: Bool,
      tlData: UInt,
      reqData: UInt
  ) = {
    printf(
      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
      source,
      address,
      size,
      mask,
      is_store
    )
    when(is_store) {
      printf(", tlData=%x, reqData=%x", tlData, reqData)
    }
    printf("\n")
  }
}

// Synthesizable unit tests

class DummyDriver(config: CoalescerConfig)(implicit p: Parameters) extends LazyModule {
  val laneNodes = Seq.tabulate(config.numLanes) { i =>
    val clientParam = Seq(
      TLMasterParameters.v1(
        name = "dummy-core-node-" + i.toString,
        sourceId = IdRange(0, config.numOldSrcIds)
        // visibility = Seq(AddressSet(0x0000, 0xffffff))
      )
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(clientParam)))
  }

  // Combine N outgoing client node into 1 identity node for diplomatic
  // connection.
  val node = TLIdentityNode()
  laneNodes.foreach { l => node := l }

  lazy val module = new DummyDriverImp(this, config)
}

class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
    extends LazyModuleImp(outer)
    with UnitTestModule {
  val sourceIdCounter = RegInit(0.U(log2Ceil(config.numOldSrcIds).W))
  sourceIdCounter := sourceIdCounter + 1.U

  val finishCounter = RegInit(10000.U(64.W))
  // NOTE(review): this decrement is overridden by the
  // `finishCounter := finishCounter + dataSum` connection at the end of the
  // module (last connect wins), so the counter does not actually count down.
  // Left unchanged because the intended behaviour is unclear -- confirm.
  finishCounter := finishCounter - 1.U
  io.finished := (finishCounter === 0.U)

  outer.laneNodes.zipWithIndex.foreach { case (node, lane) =>
    require(node.out.length == 1, "each dummy lane node must have exactly one outgoing edge")

    // generate dummy traffic to coalescer to prevent it from being optimized
    // out during synthesis
    val address = Wire(UInt(config.addressWidth.W))
    address := Cat(
      (finishCounter + (lane.U % 3.U)),
      0.U(config.wordSizeWidth.W)
    )
    val (tl, edge) = node.out(0)
    val (legal, bits) = edge.Put(
      fromSource = sourceIdCounter,
      toAddress = address,
      lgSize = 2.U,
      data = finishCounter + (lane.U % 3.U)
    )
    assert(legal, "illegal TL req gen")
    tl.a.valid := true.B
    tl.a.bits := bits
    tl.b.ready := true.B
    tl.c.valid := false.B
    tl.d.ready := true.B
    tl.e.valid := false.B
  }

  val dataSum = outer.laneNodes
    .map { node =>
      val tl = node.out(0)._1
      val data = Mux(tl.d.valid, tl.d.bits.data, 0.U)
      data
    }
    .reduce(_ +& _)
  // this doesn't make much sense, but it prevents the entire uncoalescer from
  // being optimized away
  finishCounter := finishCounter + dataSum
}

// A dummy harness around the coalescer for use in VLSI flow.
// Should not instantiate any memtrace modules.
/** Harness wiring `DummyDriver -> CoalescingUnit -> TLRAMs`.
  *
  * The lane count is read from `SIMTCoreKey`; every other setting comes from
  * `defaultConfig`. One TLRAM is created per lane plus one extra for the
  * coalesced edge.
  */
class DummyCoalescer(implicit p: Parameters) extends LazyModule {
  // Fail with an actionable message instead of a bare Option.get error.
  val numLanes = p(SIMTCoreKey)
    .getOrElse(
      throw new NoSuchElementException(
        "SIMTCoreKey is not set in Parameters; DummyCoalescer needs it to determine numLanes"
      )
    )
    .nLanes
  println(s"============ numLanes: ${numLanes}")
  val config = defaultConfig.copy(numLanes = numLanes)

  val driver = LazyModule(new DummyDriver(config))
  val rams = Seq.fill(config.numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffff),
        beatBytes = (1 << config.dataBusWidth)
      )
    )
  )

  val coal = LazyModule(new CoalescingUnit(config))

  coal.cpuNode :=* driver.node
  rams.foreach(_.node := coal.aggregateNode)

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    io.finished := driver.module.io.finished
  }
}

/** Unit-test wrapper that instantiates [[DummyCoalescer]] and forwards the
  * start/finished handshake.
  */
class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new DummyCoalescer).module)
  dut.io.start := io.start
  io.finished := dut.io.finished
}

// tracedriver --> coalescer --> tracelogger --> tlram
/** Trace-driven harness with a logger on each side of the coalescer.
  *
  * When the driver reports completion, the core-side logger's request and
  * response counters (count and bytes) are printed and asserted to be equal.
  *
  * @param filename trace file name passed to the driver and to both loggers
  */
class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
    extends LazyModule {
  // Fail with an actionable message instead of a bare Option.get error.
  val numLanes = p(SIMTCoreKey)
    .getOrElse(
      throw new NoSuchElementException(
        "SIMTCoreKey is not set in Parameters; TLRAMCoalescerLogger needs it to determine numLanes"
      )
    )
    .nLanes
  val config = defaultConfig.copy(numLanes = numLanes)

  val driver = LazyModule(new MemTraceDriver(config, filename))
  val coreSideLogger = LazyModule(
    new MemTraceLogger(numLanes, filename, loggerName = "coreside")
  )
  val coal = LazyModule(new CoalescingUnit(config))
  val memSideLogger = LazyModule(
    new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")
  )
  val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffff),
        beatBytes = (1 << config.dataBusWidth)
      )
    )
  )

  memSideLogger.node :=* coal.aggregateNode
  coal.cpuNode :=* coreSideLogger.node :=* driver.node
  rams.foreach { r => r.node := memSideLogger.node }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    driver.module.io.start := io.start
    io.finished := driver.module.io.finished

    // Check traffic balance only once the driver has finished.
    when(io.finished) {
      printf(
        "numReqs=%d, numResps=%d, reqBytes=%d, respBytes=%d\n",
        coreSideLogger.module.io.numReqs,
        coreSideLogger.module.io.numResps,
        coreSideLogger.module.io.reqBytes,
        coreSideLogger.module.io.respBytes
      )
      assert(
        (coreSideLogger.module.io.numReqs === coreSideLogger.module.io.numResps) &&
          (coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes),
        "FAIL: requests and responses traffic to the coalescer do not match"
      )
      printf("SUCCESS: coalescer response traffic matched requests!\n")
    }
  }
}

/** Unit-test wrapper that instantiates [[TLRAMCoalescerLogger]] and forwards
  * the start/finished handshake.
  */
class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit
    p: Parameters
) extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
  dut.io.start := io.start
  io.finished := dut.io.finished
}

// tracedriver --> coalescer --> tlram
/** Trace-driven harness without loggers, built entirely from `defaultConfig`
  * and a fixed trace file name.
  */
class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
  // Taken from defaultConfig so that the number of RAMs always matches the
  // coalescer and driver built from the same config below.
  // TODO: use parameters for numLanes
  val numLanes = defaultConfig.numLanes
  val filename = "vecadd.core1.thread4.trace"
  val coal = LazyModule(new CoalescingUnit(defaultConfig))
  val driver = LazyModule(new MemTraceDriver(defaultConfig, filename))
  val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffff),
        beatBytes = (1 << defaultConfig.dataBusWidth)
      )
    )
  )

  coal.cpuNode :=* driver.node
  rams.foreach { r => r.node := coal.aggregateNode }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    driver.module.io.start := io.start
    io.finished := driver.module.io.finished
  }
}

/** Unit-test wrapper that instantiates [[TLRAMCoalescer]] and forwards the
  * start/finished handshake.
  */
class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescer).module)
  dut.io.start := io.start
  io.finished := dut.io.finished
}

////////////
////////////
////////////
//////////// Code for CoalescerXbar
////////////
////////////

// Lazy Module is needed to instantiate outgoing node
/** Crossbar that merges per-lane non-coalesced requests and coalesced
  * requests into a single diplomatic `node`.
  *
  * Structure, as wired below:
  *   - one client node per lane -> TLWidthWidget -> round-robin `nonCoalXbar`
  *   - `config.numCoalReqs` client nodes -> round-robin `coalXbar`
  *   - both crossbars -> `outputXbar` (TLArbiter.lowestIndexFirst) -> `node`
  */
class CoalescerXbar(config: CoalescerConfig)(implicit p: Parameters)
    extends LazyModule {
  // Let SIMT's word size be 32, and read/write granularity be 256
  // 32 client nodes of edge size 32 for non-coalesced reqs
  // And attaching them widgets
  val nonCoalNarrowNodes = Seq.tabulate(config.numLanes) { i =>
    val nonCoalNarrowParam = Seq(
      TLMasterParameters.v1(
        name = "NonCoalNarrowNode" + i.toString,
        sourceId = IdRange(0, config.numOldSrcIds)
      )
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(nonCoalNarrowParam)))
  }
  val nonCoalWidgets = Seq.tabulate(config.numLanes) { _ =>
    TLWidthWidget(config.wordSizeInBytes)
  }
  (nonCoalWidgets zip nonCoalNarrowNodes).foreach { case (wgt, node) =>
    wgt := node
  }

  // Creating a round robin cross tilelink xbar for the un-coalesced
  // and connect them to the widgets
  val nonCoalXbar = LazyModule(new TLXbar(TLArbiter.roundRobin))
  nonCoalWidgets.foreach { nonCoalXbar.node := _ }

  // K client nodes of edge size 256 for the coalesced reqs
  val coalReqNodes = Seq.tabulate(config.numCoalReqs) { i =>
    val coalParam = Seq(
      TLMasterParameters.v1(
        name = "CoalReqNode" + i.toString,
        sourceId = IdRange(0, config.numNewSrcIds)
      )
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(coalParam)))
  }
  // Create a RR Xbar for the coalesced request
  val coalXbar = LazyModule(new TLXbar(TLArbiter.roundRobin))
  coalReqNodes.foreach { coalXbar.node := _ }

  // Create a Priority XBar between Coalesced and Uncoalesced Request.
  // coalXbar is attached first; presumably that gives coalesced requests the
  // lower index and hence priority under lowestIndexFirst -- TODO confirm.
  val outputXbar = LazyModule(new TLXbar(TLArbiter.lowestIndexFirst))
  outputXbar.node :=* coalXbar.node
  outputXbar.node :=* nonCoalXbar.node

  // express output crossbar as an identity node for simpler downstream connection
  val node = TLIdentityNode()
  node :=* outputXbar.node

  // Bundle prototypes handed to the module implementation for its IO.
  val nonCoalEntryT = new NonCoalescedRequest(config)
  val coalEntryT = new CoalescedRequest(config)
  val respNonCoalEntryT = new NonCoalescedResponse(config)
  val respCoalBundleT = new CoalescedResponse(config)

  lazy val module = new CoalescerXbarImpl(
    this,
    config,
    nonCoalEntryT,
    coalEntryT,
    respNonCoalEntryT,
    respCoalBundleT
  )
}

/** Module implementation of [[CoalescerXbar]].
  *
  * Requests arrive on decoupled ports, pass through a one-entry queue each,
  * and are driven onto the A channel of the matching client node. D-channel
  * responses of the non-coalesced nodes are returned per lane; responses of
  * the coalesced nodes are merged by a round-robin arbiter into the single
  * `coalResp` port.
  *
  * @param outer             the enclosing lazy module that owns the nodes
  * @param config            coalescer configuration (lane and request counts)
  * @param nonCoalEntryT     bundle prototype for per-lane requests
  * @param coalEntryT        bundle prototype for coalesced requests
  * @param respNonCoalEntryT bundle prototype for per-lane responses
  * @param respCoalBundleT   bundle prototype for the coalesced response
  */
class CoalescerXbarImpl(
    outer: CoalescerXbar,
    config: CoalescerConfig,
    nonCoalEntryT: Request,
    coalEntryT: Request,
    respNonCoalEntryT: Response,
    respCoalBundleT: CoalescedResponse
) extends LazyModuleImp(outer) {

  val io = IO(new Bundle {
    val nonCoalReqs = Vec(config.numLanes, Flipped(Decoupled(nonCoalEntryT)))
    val coalReqs = Vec(config.numCoalReqs, Flipped(Decoupled(coalEntryT)))
    val nonCoalResps = Vec(config.numLanes, Decoupled(respNonCoalEntryT))
    val coalResp = Decoupled(respCoalBundleT)
  })

  // Create Queues to receive data from upstream
  // Stage 1: Create Queue for nonCoalReqs and CoalReqs
  val nonCoalReqsQueues = Seq.tabulate(config.numLanes) { _ =>
    Module(new Queue(nonCoalEntryT.cloneType, entries = 1, pipe = true, flow = false))
  }
  val coalReqsQueues = Seq.tabulate(config.numCoalReqs) { _ =>
    Module(new Queue(coalEntryT.cloneType, entries = 1, pipe = true, flow = false))
  }
  // Stage 1a: connect two Queue groups to the input
  // (non-coalesced ports first, then coalesced, on both sides of the zip)
  ((io.nonCoalReqs ++ io.coalReqs) zip (nonCoalReqsQueues ++ coalReqsQueues)).foreach {
    case (req, q) => q.io.enq <> req
  }
  // Stage 2: connect output of the queue to the respective Node
  ((nonCoalReqsQueues ++ coalReqsQueues) zip
    (outer.nonCoalNarrowNodes ++ outer.coalReqNodes)).foreach { case (q, node) =>
    val (tlOut, edgeOut) = node.out(0)
    q.io.deq.ready := tlOut.a.ready
    tlOut.a.valid := q.io.deq.valid
    tlOut.a.bits := q.io.deq.bits.toTLA(edgeOut)
  }
  // The XBar will take care of the rest

  //
  // Inward data handling
  //

  // For the uncoalesced data response
  (outer.nonCoalNarrowNodes zip io.nonCoalResps).foreach { case (node, resp) =>
    val (tlOut, edgeOut) = node.out(0)
    val nonCoalResp = Wire(respNonCoalEntryT)
    nonCoalResp.fromTLD(tlOut.d.bits)
    tlOut.d.ready := resp.ready
    resp.valid := tlOut.d.valid
    resp.bits := nonCoalResp
  }

  // For the coalesced data response
  // Have an RR arbiter that holds the response data
  // The arbiter payload type is cloned from the D channel of the output
  // identity node's inward edge.
  val coalRespRRArbiter = Module(
    new RRArbiter(
      outer.node.in(0)._1.d.bits.cloneType,
      config.numCoalReqs
    )
  )
  outer.coalReqNodes.zipWithIndex.foreach { case (node, idx) =>
    val (tlOut, edgeOut) = node.out(0)
    coalRespRRArbiter.io.in(idx) <> tlOut.d
  }
  // Connect output of arbiter to coalesced response output
  io.coalResp.valid := coalRespRRArbiter.io.out.valid
  coalRespRRArbiter.io.out.ready := io.coalResp.ready
  val coalRespBundle = Wire(respCoalBundleT)
  coalRespBundle.fromTLD(coalRespRRArbiter.io.out.bits)
  io.coalResp.bits := coalRespBundle
}