// Source file: radiance/src/main/scala/tilelink/Coalescing.scala
// Snapshot: 2023-05-10 00:26:25 -07:00 (2195 lines, 82 KiB, Scala)

// See LICENSE.SiFive for license details.
package freechips.rocketchip.tilelink
import chisel3._
import chisel3.util._
import chisel3.experimental.ChiselEnum
import org.chipsalliance.cde.config.{Parameters, Field}
import freechips.rocketchip.diplomacy._
// import freechips.rocketchip.devices.tilelink.TLTestRAM
import freechips.rocketchip.util.MultiPortQueue
import freechips.rocketchip.unittest._
// TODO: find better place for these
/** Parameters describing a SIMT core.
  *
  * @param nLanes lane count of the core; defaults to 4
  */
case class SIMTCoreParams(
  nLanes: Int = 4
)

/** Parameters describing a memory-trace-driven core.
  *
  * @param tracefilename  name of the trace file; defaults to the placeholder "undefined"
  * @param traceHasSource defaults to false; presumably tells whether trace
  *                       records carry a source ID -- TODO confirm against the trace reader
  */
case class MemtraceCoreParams(
  tracefilename: String = "undefined",
  traceHasSource: Boolean = false
)
// Config key under which an optional SIMTCoreParams is stored; the value
// passed as the key's default is None.
case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/)
// Config key under which an optional MemtraceCoreParams is stored; the value
// passed as the key's default is None.
case object MemtraceCoreKey extends Field[Option[MemtraceCoreParams]](None /*default*/)
// Contract for the enum that encodes a request's access size (named for the
// in-flight table; CoalescingUnitImp fills an entry's `sizeEnum` from it).
// Implementations supply the enum values plus conversions to and from the
// log2(bytes) size encoding.
trait InFlightTableSizeEnum extends ChiselEnum {
// Value standing for "no valid size".
val INVALID: Type
// Value standing for a 4-byte access (per the default implementation, log size 2).
val FOUR: Type
// Converts a log2(bytes) size into the corresponding enum value.
def logSizeToEnum(x: UInt): Type
// Converts an enum value back into a log2(bytes) size.
def enumToLogSize(x: Type): UInt
}
/** Default size encoding: only 4-byte accesses (log size 2) are representable;
  * every other size maps to `INVALID`.
  */
object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
  val INVALID = Value(0.U)
  val FOUR = Value(1.U)

  /** Maps a log2(bytes) size to the enum: 2 becomes `FOUR`, anything else `INVALID`. */
  def logSizeToEnum(x: UInt): Type = {
    val isWordSized = x === 2.U
    Mux(isWordSized, FOUR, INVALID)
  }

  /** Maps the enum back to a log2(bytes) size: `FOUR` becomes 2, anything else 0. */
  def enumToLogSize(x: Type): UInt = {
    val isFour = x === FOUR
    Mux(isFour, 2.U, 0.U)
  }
}
// Static (elaboration-time) configuration shared by the coalescer modules in
// this file. Sizes whose name ends in "Width"/"LogSize(s)" for data are log2
// byte counts; see the per-field notes below.
case class CoalescerConfig(
enable: Boolean, // globally enable or disable coalescing
numLanes: Int, // number of lanes (or threads) in a warp
queueDepth: Int, // request window per lane (number of entries in each lane's shift queue)
waitTimeout: Int, // max cycles to wait before forced fifo dequeue, per lane
addressWidth: Int, // address width in bits; assume <= 32
dataBusWidth: Int, // log2 of the memory-side downstream TileLink data bus width in bytes
// (e.g. 3 means 2^3 = 8 bytes, a 64-bit bus).
// this has to be at least larger than the word size for
// the coalescer to perform well
// watermark = 2, // minimum buffer occupancy to start coalescing
wordSizeInBytes: Int, // word size in bytes (4 for a 32-bit system)
wordWidth: Int, // log2(wordSizeInBytes)
numOldSrcIds: Int, // num of outstanding requests per lane, from processor
numNewSrcIds: Int, // num of outstanding coalesced requests
respQueueDepth: Int, // depth of the response fifo queues
coalLogSizes: Seq[Int], // list of coalescer sizes to try in the MonoCoalescers
// each size is log2(byteSize)
sizeEnum: InFlightTableSizeEnum, // enum used to encode per-request sizes in the inflight table
numCoalReqs: Int, // total number of coalesced requests we can generate in one cycle
numArbiterOutputPorts: Int, // total number of output ports the arbiter will arbitrate into.
// this has to match downstream cache's configuration
bankStrideInBytes: Int // cache line strides across the different banks
) {
// maximum coalesced size, as log2(bytes); throws if coalLogSizes is empty
def maxCoalLogSize: Int = coalLogSizes.max
}
// Default coalescer configuration: 4 lanes, a single-entry window per lane,
// 4-byte words, and a single coalescing size of 2^3 = 8 bytes that matches
// the 8-byte data bus.
object defaultConfig extends CoalescerConfig(
enable = true,
numLanes = 4,
queueDepth = 1,
waitTimeout = 8,
addressWidth = 24,
dataBusWidth = 3, // 2^3=8 bytes, 64 bit bus
// watermark = 2,
wordSizeInBytes = 4,
wordWidth = 2, // log2(wordSizeInBytes)
// when attaching to SoC, 16 source IDs are not enough due to longer latency
numOldSrcIds = 16,
numNewSrcIds = 4,
respQueueDepth = 4,
coalLogSizes = Seq(3), // only try 8-byte coalescing
sizeEnum = DefaultInFlightTableSizeEnum,
numCoalReqs = 1,
numArbiterOutputPorts = 4,
bankStrideInBytes = 64 // Current L2 is strided by 512 bits
)
// Diplomacy (LazyModule) wrapper of the coalescing unit. It declares the
// TileLink nodes and their bindings; the hardware itself is built by
// CoalescingUnitImp.
class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends LazyModule {
// Nexus node that captures the incoming TL requests, rewrites coalescable requests,
// and arbitrates between non-coalesced and coalesced requests to a fixed number of outputs
// before sending it out to memory. This node is what's visible to upstream and downstream nodes.
// WIP:
// val node = TLNexusNode(
// clientFn = c => c.head,
// managerFn = m => m.head // assuming arbiter generated ids are distinct between edges
// )
// node.in.map(_._2).foreach(edge => require(edge.manager.beatBytes == config.wordSizeInBytes,
// s"input edges into coalescer node does not have beatBytes = ${config.wordSizeInBytes}"))
// node.out.map(_._2).foreach(edge => require(edge.manager.beatBytes == config.maxCoalLogSize,
// s"output edges into coalescer node does not have beatBytes = ${config.maxCoalLogSize}"))
// Downstream-facing node: both the coalesced and the per-lane edges are bound into it.
val aggregateNode = TLIdentityNode()
// Upstream-facing node: one edge per lane (CoalescingUnitImp requires
// exactly config.numLanes incoming edges).
val cpuNode = TLIdentityNode()
// Number of maximum in-flight coalesced requests. The upper bound of this
// value would be the sourceId range of a single lane.
val numInflightCoalRequests = config.numNewSrcIds
// Master node that actually generates coalesced requests.
protected val coalParam = Seq(
TLMasterParameters.v1(
name = "CoalescerNode",
sourceId = IdRange(0, numInflightCoalRequests)
)
)
val coalescerNode = TLClientNode(
Seq(TLMasterPortParameters.v1(coalParam))
)
// merge coalescerNode and cpuNode
// The coalescer edge is bound first, then the per-lane edges behind a
// TLWidthWidget constructed with the word size in bytes.
aggregateNode :=* coalescerNode
aggregateNode :=* TLWidthWidget(config.wordSizeInBytes) :=* cpuNode
lazy val module = new CoalescingUnitImp(this, config)
}
/** A memory request as stored in the request queues; the same bundle type is
  * used both for per-lane requests and for the generated coalesced request.
  *
  * @param sourceWidth  width in bits of the `source` field
  * @param sizeWidth    width in bits of the `size` field (which holds log2(bytes))
  * @param addressWidth width in bits of the `address` field
  * @param maxSize      log2 of the largest access in bytes; `mask` is
  *                     `1 << maxSize` bits wide and `data` eight times that
  */
class ReqQueueEntry(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, maxSize: Int) extends Bundle {
  val op = UInt(1.W) // 0=READ 1=WRITE
  val address = UInt(addressWidth.W)
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
  val mask = UInt((1 << maxSize).W) // write only
  val data = UInt((8 * (1 << maxSize)).W) // write only

  /** Converts this entry into a TileLink A-channel message on `edgeOut`.
    *
    * Writes (`op` = 1) become a Put that carries this entry's byte `mask`;
    * reads become a Get. Passing the mask matters for coalesced writes: only
    * the words touched by some lane have their mask bits set, so a Put
    * without the mask would overwrite the untouched words of the chunk with
    * unspecified data. The masked `Put` overload emits PutPartialData, so the
    * downstream manager must support partial puts at this address.
    *
    * Asserts (in simulation) that the generated message is legal on the edge.
    *
    * @param edgeOut outgoing TileLink edge used to build the message
    * @return the A-channel bundle for this request
    */
  def toTLA(edgeOut: TLEdgeOut): TLBundleA = {
    val (plegal, pbits) = edgeOut.Put(
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size,
      data = this.data,
      mask = this.mask
    )
    val (glegal, gbits) = edgeOut.Get(
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size
    )
    // Select between the write and the read form based on the op bit.
    val legal = Mux(this.op.asBool, plegal, glegal)
    val bits = Mux(this.op.asBool, pbits, gbits)
    assert(legal, "unhandled illegal TL req gen")
    bits
  }
}
/** A memory response as stored in the per-lane response queues.
  *
  * @param sourceWidth width in bits of the `source` field
  * @param sizeWidth   width in bits of the `size` field (log2(bytes))
  * @param maxSize     log2 of the largest response in bytes; `data` is
  *                    `8 * (1 << maxSize)` bits wide
  */
class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, maxSize: Int) extends Bundle {
  val op = UInt(1.W) // 0=READ 1=WRITE
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
  val data = UInt((8 * (1 << maxSize)).W) // read only
  val error = Bool()

  /** Builds the TileLink D-channel message for this entry on `edgeIn`:
    * a data-less AccessAck for writes, an AccessAck carrying `data` for reads.
    * The `error` field is not forwarded into the message.
    */
  def toTLD(edgeIn: TLEdgeIn): TLBundleD = {
    val isWrite = this.op.asBool
    // Acknowledgement without payload, used for writes.
    val writeAck = edgeIn.AccessAck(toSource = this.source, lgSize = this.size)
    // Acknowledgement with payload, used for reads.
    val readAck = edgeIn.AccessAck(toSource = this.source, lgSize = this.size, data = this.data)
    Mux(isWrite, writeAck, readAck)
  }

  /** Drives every field of this entry from a TileLink D-channel bundle;
    * `error` mirrors the bundle's `denied` flag.
    */
  def fromTLD(bundle: TLBundleD): Unit = {
    this.op := TLUtils.DOpcodeIsStore(bundle.opcode)
    this.source := bundle.source
    this.size := bundle.size
    this.error := bundle.denied
    this.data := bundle.data
  }
}
// If `ignoreInUse`, just keep giving out new IDs without checking if it is in
// use.
/** Hands out source IDs in round-robin order.
  *
  * `io.id.bits` always shows the current head ID; the head advances by one on
  * every cycle in which `io.gen` is asserted. When `ignoreInUse` is false,
  * `io.id.valid` is low while the head ID is still marked in use, and IDs are
  * released again through `io.reclaim`.
  *
  * @param sourceWidth width in bits of the generated IDs (2^sourceWidth IDs total)
  * @param ignoreInUse when true, IDs are always reported valid regardless of
  *                    the occupancy table
  */
class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true) extends Module {
  val io = IO(new Bundle {
    val gen = Input(Bool())
    val reclaim = Input(Valid(UInt(sourceWidth.W)))
    val id = Output(Valid(UInt(sourceWidth.W)))
  })

  // Next ID to hand out; wraps around naturally at 2^sourceWidth.
  val head = RegInit(UInt(sourceWidth.W), 0.U)
  when (io.gen) {
    head := head + 1.U
  }

  val numSourceId = 1 << sourceWidth
  // One entry per ID. valid = true: in use, valid = false: available.
  val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
  when (reset.asBool) {
    for (i <- 0 until numSourceId) {
      occupancyTable(i).valid := false.B
    }
  }

  // Elaboration-time choice: either always valid, or gated by the table.
  private val headIsFree =
    if (ignoreInUse) true.B
    else !occupancyTable(head).valid
  io.id.valid := headIsFree
  io.id.bits := head

  // An ID is consumed only when it is both requested and valid (fire).
  val idFires = io.gen && io.id.valid
  when (idFires) {
    occupancyTable(io.id.bits).valid := true.B // mark in use
  }
  when (io.reclaim.valid) {
    occupancyTable(io.reclaim.bits).valid := false.B // mark freed
  }
}
// Per-lane shift queues that hold the coalescing window.
//
// Every lane has `entries` slots; slot 0 is the head. All lanes enqueue and
// shift together (an assertion below checks that they stay in sync). The full
// contents and valid bits are exposed through `io.elts` / `io.mask` so that
// the coalescer can inspect the window, and the coalescer can knock out
// entries it has merged through `io.invalidate`.
//
// gen: type of a queue entry
// entries: number of slots per lane
// config: coalescer configuration; only numLanes is used here
class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) extends Module {
val io = IO(new Bundle {
val queue = new Bundle {
// Note the directions: `enq` is the side requests are pushed INTO the queue
// (this module drives ready), `deq` is the side heads are offered OUT of it
// (this module drives valid/bits).
val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
val deq = Vec(config.numLanes, EnqIO(gen.cloneType))
}
// Per lane, a bit per slot: when valid, the marked slots are invalidated.
val invalidate = Input(Valid(Vec(config.numLanes, UInt(entries.W))))
// Per lane: whether that lane's head can currently be coalesced.
val coalescable = Input(Vec(config.numLanes, Bool()))
// Per lane: valid bit of every slot, packed with slot 0 at bit 0.
val mask = Output(Vec(config.numLanes, UInt(entries.W)))
// Per lane: payload of every slot.
val elts = Output(Vec(config.numLanes, Vec(entries, gen)))
})
// val eltPrototype = Wire(Valid(gen))
// eltPrototype.bits := DontCare
// eltPrototype.valid := false.B
// Storage. Reg has no reset value here; the valid bits are cleared by the
// explicit `when (reset.asBool)` block below instead.
val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
// Per lane: index of the next free slot (0 = empty, `entries` = full).
val writePtr = RegInit(VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W))))
// Per lane: set once the current head has been handed to `deq`, so that it
// is not offered again before the next shift.
val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))
private def resetElts = {
elts.foreach { laneQ =>
laneQ.foreach { entry =>
entry.valid := false.B
entry.bits := DontCare
}
}
}
when (reset.asBool) {
resetElts
}
val controlSignals = Wire(Vec(config.numLanes, new Bundle {
val shift = Bool()
val full = Bool()
val empty = Bool()
}))
// shift hint is when the heads have no more coalescable left this or next cycle
// i.e. no lane has a head that is coalescable and is not being invalidated
// right now (bit 0 of that lane's invalidate vector).
val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
c && !(io.invalidate.valid && inv)
}.reduce(_ || _)
// Any lane presenting a request makes every lane enqueue (possibly an
// invalid entry) so that the lanes stay aligned.
val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
// valid && !fire means we enable enqueueing to a full queue, provided the
// arbiter is taking away all remaining valid queue heads in the next cycle so
// that we make space for the entire next warp.
val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)
for (i <- 0 until config.numLanes) {
val enq = io.queue.enq(i)
val deq = io.queue.deq(i)
val ctrl = controlSignals(i)
ctrl.full := writePtr(i) === entries.U
ctrl.empty := writePtr(i) === 0.U
// shift when no outstanding dequeue, no more coalescable chunks, and not empty
ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty
// dequeue is valid when:
// head entry is valid, has not been processed by downstream, and is not coalescable
deq.bits := elts.map(_.head.bits)(i)
deq.valid := elts.map(_.head.valid)(i) && !deqDone(i) && !io.coalescable(i)
// can take new entries if not full, or if full but shifting
enq.ready := (!ctrl.full) || ctrl.shift
when (ctrl.shift) {
// shift, invalidate tail, invalidate coalesced requests
elts(i).zipWithIndex.foreach { case (elt, j) =>
if (j == entries - 1) { // tail
elt.valid := false.B
} else {
elt.bits := elts(i)(j + 1).bits
elt.valid := elts(i)(j + 1).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
}
}
// reset dequeue mask when new entries are shifted in
deqDone(i) := false.B
// enqueue
// While shifting, the new entry lands one slot below the write pointer and
// the pointer stays put; this later connection overrides the shift
// assignment made to that slot above. Without an enqueue the pointer
// simply moves down by one.
when (enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
elts(i)(writePtr(i) - 1.U).bits := enq.bits
elts(i)(writePtr(i) - 1.U).valid := enq.valid
}.otherwise {
writePtr(i) := writePtr(i) - 1.U
}
}.otherwise {
// invalidate coalesced requests
when (io.invalidate.valid) {
(elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) =>
elt.valid := elt.valid && !inv
}
}
// enqueue
when (enq.ready && syncedEnqValid) {
elts(i)(writePtr(i)).bits := enq.bits
elts(i)(writePtr(i)).valid := enq.valid
writePtr(i) := writePtr(i) + 1.U
}
// remember that the head has been consumed until the next shift
deqDone(i) := deqDone(i) || deq.fire
}
}
// When doing spatial-only coalescing, queues should never drift from each
// other, i.e. the queue heads should always contain mem requests from the
// same instruction.
val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
writePtr.map(_ === writePtr.head).reduce(_ && _)
assert(queueInSync, "shift queue lanes are not in sync")
// Expose the window: packed valid bits and raw payloads of every slot.
io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt)
io.elts := elts.map(x => VecInit(x.map(_.bits)))
}
// Software model: coalescer.py
/** Combinational coalescing search for one fixed coalesced size.
  *
  * Each lane's queue head acts as a candidate "leader" and is compared
  * against every entry of every lane. Two entries match when both are valid,
  * have the same `op`, and fall into the same `2^coalLogSize`-byte aligned
  * chunk. The lowest-indexed lane whose head matches more than one entry is
  * chosen as the leader, and its match table, match count and word coverage
  * are reported on `io.results`.
  *
  * @param coalLogSize log2 of the coalesced chunk size in bytes
  * @param windowT     the request shift queue whose IO type describes the window
  * @param config      coalescer configuration
  */
class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
                    config: CoalescerConfig) extends Module {
  val io = IO(new Bundle {
    val window = Input(windowT.io.cloneType)
    val results = Output(new Bundle {
      val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
      val baseAddr = Output(UInt(config.addressWidth.W))
      val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
      // number of entries matched with this leader lane's head.
      // maximum is numLanes * queueDepth
      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
      val coverageHits = Output(UInt((config.maxCoalLogSize - config.wordWidth + 1).W))
      val canCoalesce = Output(Vec(config.numLanes, Bool()))
    })
  })

  io := DontCare

  // Combinational logic to drive output from window contents.
  // The leader lanes only compare their heads against all entries of the
  // follower lanes.
  val leaders = io.window.elts.map(_.head)
  val leadersValid = io.window.mask.map(_.asBools.head)

  def printQueueHeads = {
    leaders.zipWithIndex.foreach{ case (head, i) =>
      printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
        leadersValid(i), head.source, head.address)
    }
  }
  // when (leadersValid.reduce(_ || _)) {
  //   printQueueHeads
  // }

  val size = coalLogSize
  // Mask that keeps the address bits above the chunk offset, i.e.
  // 2^addressWidth - 2^size. Computed with BigInt: in Int arithmetic
  // `1 << 32` wraps around to 1, which made the mask negative (and `.U` fail
  // at elaboration) for a 32-bit address width.
  val addrMask = ((BigInt(1) << config.addressWidth) - (BigInt(1) << size)).U

  // Two requests match when both are valid, perform the same operation, and
  // target the same size-aligned chunk.
  def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = {
    (req0.op === req1.op) &&
    (req0v && req1v) &&
    ((req0.address & this.addrMask) === (req1.address & this.addrMask))
  }

  // Gives a 2-D table of Bools representing match at every queue entry,
  // for each lane (so 3-D in total).
  // dimensions: (leader lane, follower lane, follower entry)
  val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) =>
    (io.window.elts zip io.window.mask).map { case (followers, followerValids) =>
      // compare leader's head against follower's every queue entry
      (followers zip followerValids.asBools).map { case (follower, followerValid) =>
        canMatch(follower, followerValid, leader, leaderValid)
        // FIXME: disabling halving optimization because it does not give the
        // correct per-lane coalescable indication to the shift queue
        // // match leader to only followers at lanes >= leader idx
        // // this halves the number of comparators
        // if (followerIndex < leaderIndex) false.B
        // else canMatch(follower, followerValid, leader, leaderValid)
      }
    }
  }

  val matchCounts = matchTablePerLane.map(table =>
    table.map(PopCount(_)) // sum up each column
      .reduce(_ +& _))
  // A valid head always matches itself, so a count above 1 means that at
  // least one other entry can be merged with it.
  val canCoalesce = matchCounts.map(_ > 1.U)

  // Elect the leader that has the most match counts.
  // TODO: potentially expensive: magnitude comparator
  def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
    matchCounts.zipWithIndex.map {
      case (c, i) => (c, i.U)
    }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
      (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
    }._2
  }
  // Elect leader by choosing the smallest-index lane that has a valid
  // match, i.e. using priority encoder.
  def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = {
    PriorityEncoder(matchCounts.map(_ > 1.U))
  }

  val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)
  val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
  // matchTable for the chosen lane, but converted to a Vec[UInt]
  val chosenMatches = VecInit(matchTablePerLane.map{ table =>
    VecInit(table.map(VecInit(_).asUInt))
  })(chosenLeaderIdx)
  val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)

  // coverage calculation
  // Word index of an address inside the coalesced chunk.
  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth)
  // 2-D table flattened to 1-D
  val offsets = io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
  val valids = chosenMatches.flatMap(_.asBools)
  // indicates for each word in the coalesced chunk whether it is accessed by
  // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
  // words in the coalesced data coming back will be accessed by some request
  // and we've reached 100% bandwidth utilization.
  val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target =>
    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
  }

  // debug prints
  when (leadersValid.reduce(_ || _)) {
    matchCounts.zipWithIndex.foreach { case (count, i) =>
      printf(s"lane[${i}] matchCount = %d\n", count);
    }
    printf("chosenLeader = lane %d\n", chosenLeaderIdx)
    printf("chosenLeader matches = [ ")
    chosenMatches.foreach { m => printf("%d ", m) }
    printf("]\n")
    printf("chosenMatchCount = %d\n", chosenMatchCount)
    printf("hits = [ ")
    hits.foreach { m => printf("%d ", m) }
    printf("]\n")
  }

  io.results.leaderIdx := chosenLeaderIdx
  io.results.baseAddr := chosenLeader.address & addrMask
  io.results.matchOH := chosenMatches
  io.results.matchCount := chosenMatchCount
  io.results.coverageHits := PopCount(hits)
  io.results.canCoalesce := canCoalesce
}
// Combinational logic that generates a coalesced request given a request
// window, and a selection of possible coalesced sizes. May utilize multiple
// MonoCoalescers and apply size-choosing policy to determine the final
// coalesced request out of all possible combinations.
//
// Software model: coalescer.py
/** Produces the coalesced request for the current window.
  *
  * One [[MonoCoalescer]] is instantiated per entry of `config.coalLogSizes`.
  * Their results are normalized to the largest size and a size is chosen:
  * first by word coverage (if any size exceeds the minimum coverage), then by
  * match count (if any size matches more than one entry). The chosen result
  * is turned into a single request on `io.coalReq`, and the entries merged
  * into it are reported on `io.invalidate` in the cycle the request fires.
  *
  * @param windowT  the request shift queue whose IO type describes the window
  * @param coalReqT bundle type of the generated coalesced request
  * @param config   coalescer configuration; when `config.enable` is false all
  *                 outputs are forced inactive
  */
class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry,
                     config: CoalescerConfig) extends Module {
  val io = IO(new Bundle {
    // coalescing window, connected to the contents of the request queues
    val window = Input(windowT.io.cloneType)
    // generated coalesced request
    val coalReq = DecoupledIO(coalReqT.cloneType)
    // invalidate signals going into each request queue's head
    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
    // whether a lane is coalescable
    val coalescable = Output(Vec(config.numLanes, Bool()))
  })

  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
  coalescers.foreach(_.io.window := io.window)

  // Scales a per-size value up as if it had been measured at the largest
  // coalescing size.
  def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
    (valPerSize zip config.coalLogSizes).map { case (hits, size) =>
      (hits << (config.maxCoalLogSize - size).U).asUInt
    }
  }

  // Index of the largest element; on ties the later element wins.
  def argMax(x: Seq[UInt]): UInt = {
    x.zipWithIndex.map {
      case (a, b) => (a, b.U)
    }.reduce[(UInt, UInt)] { case ((a, i), (b, j)) =>
      (Mux(a > b, a, b), Mux(a > b, i, j)) // > instead of >= here; want to use largest size
    }._2
  }

  // normalize to maximum coalescing size so that we can do fair comparisons
  // between coalescing results of different sizes
  val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount))
  val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits))

  val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
  val chosenValid = Wire(Bool())
  // minimum 25% coverage
  val minCoverage = 1.max(1 << ((config.maxCoalLogSize - config.wordWidth) - 2))

  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
    chosenSizeIdx := argMax(normalizedHits)
    chosenValid := true.B
    printf("coalescing success by coverage policy\n")
  }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) {
    chosenSizeIdx := argMax(normalizedMatches)
    chosenValid := true.B
    printf("coalescing success by matches policy\n")
  }.otherwise {
    chosenSizeIdx := DontCare
    chosenValid := false.B
  }

  def debugPolicyPrint() = {
    printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount)
    printf("normalizedMatches[0]=%d\n", normalizedMatches(0))
    printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits)
    printf("normalizedHits[0]=%d\n", normalizedHits(0))
    printf("minCoverage=%d\n", minCoverage.U)
  }

  // create coalesced request
  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx)
  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx)

  // flatten requests and matches
  val flatReqs = io.window.elts.flatten
  val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)

  // check for word alignment in addresses
  // Every valid entry must be word-aligned, so the per-entry checks are
  // AND-ed together (an OR would pass as soon as a single entry is fine).
  assert(io.window.elts.flatMap(_.map(req => req.address(config.wordWidth - 1, 0) === 0.U)).zip(
    io.window.mask.flatMap(_.asBools)).map { case (aligned, valid) => (!valid) || aligned }.reduce(_ && _),
    "one or more addresses used for coalescing is not word-aligned")

  // note: this is word-level coalescing. if finer granularity is needed, need to modify code
  val numWords = (1.U << (chosenSize - config.wordWidth.U)).asUInt
  val maxWords = 1 << (config.maxCoalLogSize - config.wordWidth)
  val addrMask = Wire(UInt(config.maxCoalLogSize.W))
  addrMask := (1.U << chosenSize).asUInt - 1.U

  val data = Wire(Vec(maxWords, UInt((config.wordSizeInBytes * 8).W)))
  val mask = Wire(Vec(maxWords, UInt(config.wordSizeInBytes.W)))

  // For every word slot of the coalesced chunk, pick the data and mask of a
  // matched request that targets that slot.
  for (i <- 0 until maxWords) {
    val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
      // note: ANDing against addrMask is to conform to active byte lanes requirements
      // if aligning to LSB suffices, we should add the bitwise AND back
      m && ((req.address(config.maxCoalLogSize - 1, config.wordWidth)/* & addrMask*/) === i.U)
    }
    // TODO: SW uses priority encoder, not sure about behavior of MuxCase
    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) =>
      s -> req.data
    })
    mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) =>
      s -> req.mask
    })
  }

  val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds)))
  sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created
  sourceGen.io.reclaim.valid := false.B // not used
  sourceGen.io.reclaim.bits := DontCare // not used

  val coalesceValid = chosenValid && sourceGen.io.id.valid

  io.coalReq.bits.source := sourceGen.io.id.bits
  io.coalReq.bits.mask := mask.asUInt
  io.coalReq.bits.data := data.asUInt
  io.coalReq.bits.size := chosenSize
  io.coalReq.bits.address := chosenBundle.baseAddr
  io.coalReq.bits.op := io.window.elts(chosenBundle.leaderIdx).head.op
  io.coalReq.valid := coalesceValid

  io.invalidate.bits := chosenBundle.matchOH
  io.invalidate.valid := io.coalReq.fire // invalidate only when fire

  // A lane is coalescable if any of the per-size coalescers says so.
  io.coalescable := coalescers.map(_.io.results.canCoalesce.asUInt).reduce(_ | _).asBools

  dontTouch(io.invalidate) // debug

  // Overrides the outputs driven above (later connections win) so that no
  // coalesced request is ever produced.
  def disable = {
    io.coalReq.valid := false.B
    io.invalidate.valid := false.B
    io.coalescable.foreach { _ := false.B }
  }
  if (!config.enable) disable
}
// Hardware implementation of CoalescingUnit.
//
// Request side: every lane's incoming A-channel request is pushed into the
// shared CoalShiftQueue; the MultiCoalescer inspects the queue contents and
// emits coalesced requests on the coalescer node, while queue heads that are
// not coalescable leave through the lane's own outgoing edge.
// Response side: every lane has a MultiPortQueue that collects both the
// lane's own (non-coalesced) responses and the responses the Uncoalescer
// splits out of coalesced responses, and serves them back to the core.
class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) {
// Elaboration-time sanity checks of the diplomatic parameters against config.
require(outer.cpuNode.in.length == config.numLanes,
s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
s"config.numLanes (${config.numLanes})")
require(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})")
require(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
s"mismatch with config.addressWidth (${config.addressWidth})")
val sourceWidth = outer.cpuNode.in.head._1.params.sourceBits
// note we are using word size. assuming all coalescer inputs are word sized
// NOTE(review): the last argument is ReqQueueEntry's `maxSize`, which that
// class uses as a shift amount (`1 << maxSize`); here it receives
// wordSizeInBytes rather than a log2 size -- confirm whether wordWidth was
// intended.
val reqQueueEntryT = new ReqQueueEntry(sourceWidth, config.wordWidth, config.addressWidth, config.wordSizeInBytes)
val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config))
val coalReqT = new ReqQueueEntry(log2Ceil(config.numNewSrcIds), log2Ceil(config.maxCoalLogSize),
config.addressWidth, config.maxCoalLogSize)
val coalescer = Module(new MultiCoalescer(reqQueues, coalReqT, config))
// The coalescer sees the whole queue IO as its window and feeds back which
// lanes are coalescable and which entries to invalidate.
coalescer.io.window := reqQueues.io
reqQueues.io.coalescable := coalescer.io.coalescable
reqQueues.io.invalidate := coalescer.io.invalidate
// ===========================================================================
// Request flow
// ===========================================================================
//
// Override IdentityNode implementation so that we can instantiate
// queues between input and output edges to buffer requests and responses.
// See IdentityNode definition in `diplomacy/Nodes.scala`.
//
(outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
case (((tlIn, _), (tlOut, edgeOut)), lane) =>
// Request queue
val req = Wire(reqQueueEntryT)
req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
req.source := tlIn.a.bits.source
req.address := tlIn.a.bits.address
req.data := tlIn.a.bits.data
req.size := tlIn.a.bits.size
// FIXME: req.data is still containing TL-aligned data. This is fine if
// we're simply passing through this data out the other end, but not if
// the outgoing TL edge (tlOut) has different data width from the incoming
// edge (tlIn). Possible TODO to only store the relevant portion of the
// data, at the cost of re-aligning at the outgoing end.
req.mask := tlIn.a.bits.mask
val enq = reqQueues.io.queue.enq(lane)
val deq = reqQueues.io.queue.deq(lane)
enq.valid := tlIn.a.valid
enq.bits := req
// TODO: deq.ready should respect downstream arbiter
deq.ready := true.B
// Stall upstream core or memtrace driver when shiftqueue is not ready
tlIn.a.ready := enq.ready
tlOut.a.valid := deq.valid
tlOut.a.bits := deq.bits.toTLA(edgeOut)
// debug
// when (tlIn.a.valid) {
// TLPrintf(s"tlIn(${lane}).a",
// tlIn.a.bits.address,
// tlIn.a.bits.size,
// tlIn.a.bits.mask,
// TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode),
// tlIn.a.bits.data,
// 0.U
// )
// }
// when (tlOut.a.valid) {
// TLPrintf(s"tlOut(${lane}).a",
// tlOut.a.bits.address,
// tlOut.a.bits.size,
// tlOut.a.bits.mask,
// TLUtils.AOpcodeIsStore(tlOut.a.bits.opcode),
// tlOut.a.bits.data,
// 0.U
// )
// }
}
// Drive the coalescer node's A channel from the generated coalesced request.
// The B channel is always accepted; the C and E channels are never used.
val (tlCoal, edgeCoal) = outer.coalescerNode.out.head
tlCoal.a.valid := coalescer.io.coalReq.valid
tlCoal.a.bits := coalescer.io.coalReq.bits.toTLA(edgeCoal)
coalescer.io.coalReq.ready := tlCoal.a.ready
tlCoal.b.ready := true.B
tlCoal.c.valid := false.B
// tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
tlCoal.e.valid := false.B
// ===========================================================================
// Response flow
// ===========================================================================
//
// Connect uncoalescer output and noncoalesced response ports to the response
// queues.
// The maximum number of requests from a single lane that can go into a
// coalesced request. Upper bound is min(DEPTH, 2**sourceWidth).
val numPerLaneReqs = config.queueDepth
val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize), config.maxCoalLogSize)
val respQueues = Seq.tabulate(config.numLanes) { _ =>
Module(
new MultiPortQueue(
respQueueEntryT,
// enq_lanes = 1 + M, where 1 is the response for the original per-lane
// requests that didn't get coalesced, and M is the maximum number of
// single-lane requests that can go into a coalesced request.
// (`numPerLaneReqs`).
// TODO: potentially expensive, because this generates more FFs.
// Rather than enqueueing all responses in a single cycle, consider
// enqueueing one by one (at the cost of possibly stalling downstream).
1 + numPerLaneReqs,
// deq_lanes = 1 because we're serializing all responses to 1 port that
// goes back to the core.
1,
// lanes. Has to be at least max(enq_lanes, deq_lanes)
1 + numPerLaneReqs,
// Depth of each lane queue.
// XXX queue depth is set to an arbitrarily high value that doesn't
// make queue block up in the middle of the simulation. Ideally there
// should be a more logical way to set this, or we should handle
// response queue blocking.
config.respQueueDepth
)
)
}
// Enqueue port 0 of each response queue takes the lane's non-coalesced
// responses; ports 1.. take the uncoalesced pieces of coalesced responses.
val respQueueNoncoalPort = 0
val respQueueUncoalPortOffset = 1
(outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
case (((tlIn, edgeIn), (tlOut, _)), lane) =>
// Response queue
//
// This queue will serialize non-coalesced responses along with
// coalesced responses and serve them back to the core side.
val respQueue = respQueues(lane)
val resp = Wire(respQueueEntryT)
resp.fromTLD(tlOut.d.bits)
// Queue up responses that didn't get coalesced originally ("noncoalesced" responses).
// Coalesced (but uncoalesced back) responses will also be enqueued into the same queue.
assert(
respQueue.io.enq(respQueueNoncoalPort).ready,
"respQueue: enq port for noncoalesced response is blocked"
)
respQueue.io.enq(respQueueNoncoalPort).valid := tlOut.d.valid
respQueue.io.enq(respQueueNoncoalPort).bits := resp
// TODO: deq.ready should respect upstream ready
// (the same index 0 is used here to address the queue's single dequeue port)
respQueue.io.deq(respQueueNoncoalPort).ready := true.B
tlIn.d.valid := respQueue.io.deq(respQueueNoncoalPort).valid
tlIn.d.bits := respQueue.io.deq(respQueueNoncoalPort).bits.toTLD(edgeIn)
// Debug only
// Counts on `valid` (not on a completed handshake) of the lane's outgoing
// A and D channels.
val inflightCounter = RegInit(UInt(32.W), 0.U)
when(tlOut.a.valid) {
// don't inc/dec on simultaneous req/resp
when(!tlOut.d.valid) {
inflightCounter := inflightCounter + 1.U
}
}.elsewhen(tlOut.d.valid) {
inflightCounter := inflightCounter - 1.U
}
dontTouch(inflightCounter)
dontTouch(tlIn.a)
dontTouch(tlIn.d)
dontTouch(tlOut.a)
dontTouch(tlOut.d)
}
// Construct new entry for the inflight table
// FIXME: don't instantiate inflight table entry type here. It leaks the table's impl
// detail to the coalescer
// richard: I think a good idea is to pass Valid[ReqQueueEntry] generated by
// the coalescer directly into the uncoalescer, so that we can offload the
// logic to generate the Inflight Entry into the uncoalescer, where it should be.
// this also reduces top level clutter.
val uncoalescer = Module(new Uncoalescer(config))
val newEntry = Wire(uncoalescer.inflightTable.entryT)
newEntry.source := coalescer.io.coalReq.bits.source
// Elaboration-time checks: a coalesced request must fit into a single beat
// of the downstream bus, and the negotiated bus width must match config.
assert (config.maxCoalLogSize <= config.dataBusWidth,
"multi-beat coalesced reads/writes are currently not supported")
assert (
tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
+ s" (${(1 << config.dataBusWidth) * 8})"
)
val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
// Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the
// coalescer to every (numLanes * queueDepth) entry in the inflight table.
(newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex
.foreach { case ((laneEntry, laneInv), lane) =>
(laneEntry.reqs zip laneInv.asBools).zipWithIndex
.foreach { case ((reqEntry, inv), i) =>
val req = reqQueues.io.elts(lane)(i)
when ((coalescer.io.invalidate.valid && inv)) {
printf(s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n", req.source)
}
// An inflight-table slot is valid exactly for the queue entries that were
// merged into (and therefore invalidated by) this coalesced request.
reqEntry.valid := (coalescer.io.invalidate.valid && inv)
reqEntry.source := req.source
// word index of the original request inside the largest coalesced chunk
reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordWidth)
reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
// TODO: load/store op
}
}
dontTouch(newEntry)
// NOTE(review): this uses coalReq.valid, whereas the per-entry valid bits of
// newEntry above are gated by invalidate.valid (= coalReq.fire). Presumably
// the two coincide because the downstream is always ready -- confirm.
uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid
uncoalescer.io.newEntry := newEntry
// Cleanup: custom <>?
uncoalescer.io.coalResp.valid := tlCoal.d.valid
uncoalescer.io.coalResp.bits.source := tlCoal.d.bits.source
uncoalescer.io.coalResp.bits.data := tlCoal.d.bits.data
tlCoal.d.ready := uncoalescer.io.coalResp.ready
// Connect uncoalescer results back into each lane's response queue
(respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) =>
perLaneResps.zipWithIndex.foreach { case (resp, i) =>
// TODO: rather than crashing, deassert tlOut.d.ready to stall downstream
// cache. This should ideally not happen though.
assert(
q.io.enq(respQueueUncoalPortOffset + i).ready,
s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
)
q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
// debug
// when (resp.valid) {
// printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
// }
// dontTouch(q.io.enq(respQueueCoalPortOffset))
}
}
// Debug
dontTouch(coalescer.io.coalReq)
val coalRespData = tlCoal.d.bits.data
dontTouch(coalRespData)
dontTouch(tlCoal.a)
dontTouch(tlCoal.d)
}
// Protocol-agnostic bundle that represents a coalesced response.
//
// Having this makes it easier to:
// * do unit tests -- no need to deal with TileLink in the chiseltest code
// * adapt coalescer to custom protocols like a custom L1 cache interface.
//
// FIXME: overlaps with RespQueueEntry. Trait-ify
/** Response to a coalesced request, independent of the bus protocol.
  *
  * `source` is wide enough for `config.numNewSrcIds` IDs and `data` holds one
  * chunk of the largest coalesced size (`2^maxCoalLogSize` bytes).
  */
class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle {
  val source = UInt(log2Ceil(config.numNewSrcIds).W)
  val data = UInt((8 << config.maxCoalLogSize).W)

  /** Drives both fields of this bundle from a TileLink D-channel bundle. */
  def fromTLD(bundle: TLBundleD): Unit = {
    val connections = Seq(
      this.source -> bundle.source,
      this.data -> bundle.data
    )
    connections.foreach { case (sink, driver) => sink := driver }
  }
}
/** Splits coalesced responses back into individual per-lane responses.
  *
  * Every coalesced request that is issued gets recorded in an
  * [[InflightCoalReqTable]].  When a coalesced response arrives, the table is
  * looked up with the response's source ID, and for every original request
  * marked valid in the entry that was found, the corresponding 32-bit chunk of
  * the response data is presented on `io.uncoalResps`.
  */
class Uncoalescer(config: CoalescerConfig) extends Module {
  // notes to hansung:
  //  val numLanes: Int, <-> config.NUM_LANES
  //  val numPerLaneReqs: Int, <-> config.DEPTH
  //  val sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
  //  val sizeWidth: Int, <-> config.sizeEnum.width
  //  val coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
  //  val numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
  val inflightTable = Module(new InflightCoalReqTable(config))

  val io = IO(new Bundle {
    // High in the cycle `newEntry` should be recorded in the inflight table.
    val coalReqValid = Input(Bool())
    // FIXME: receive ReqQueueEntry and construct newEntry inside uncoalescer
    val newEntry = Input(inflightTable.entryT.cloneType)
    // Incoming coalesced response; always accepted (ready is tied high below).
    val coalResp = Flipped(Decoupled(new CoalescedResponseBundle(config)))
    // Un-coalesced responses, indexed as [lane][request slot within the lane].
    val uncoalResps = Output(
      Vec(
        config.numLanes,
        Vec(
          config.queueDepth,
          ValidIO(
            new RespQueueEntry(log2Ceil(config.numOldSrcIds), config.wordWidth, config.wordSizeInBytes)
          )
        )
      )
    )
  })

  // Populate inflight table
  inflightTable.io.enq.valid := io.coalReqValid
  inflightTable.io.enq.bits := io.newEntry

  // Look up the table with incoming coalesced responses
  inflightTable.io.lookup.ready := io.coalResp.valid
  inflightTable.io.lookupSourceId := io.coalResp.bits.source
  io.coalResp.ready := true.B // FIXME, see sw model implementation

  assert(
    !(io.coalReqValid && io.coalResp.valid &&
      (io.newEntry.source === io.coalResp.bits.source)),
    "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
  )

  // Un-coalescing logic
  //
  /** Selects one 32-bit chunk out of the coalesced response data.
    *
    * @param data      coalesced response payload
    * @param dataWidth width of `data` in bits (elaboration-time constant)
    * @param offset    index of the 32-bit chunk to return; chunk 0 is the
    *                  least-significant 32 bits
    * @param logSize   log2 of the original access size in bytes; only 2
    *                  (4-byte accesses) is currently supported
    * @return the selected 32-bit chunk
    */
  def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = {
    assert(logSize === 2.U, "currently only supporting 4-byte accesses. TODO")
    // sizeInBits should be simulation-only construct
    val sizeInBits = ((1.U << logSize) << 3.U).asUInt
    // The values are passed as printf-style arguments instead of being
    // interpolated with s"...": interpolating `sizeInBits` would embed the
    // Scala toString of the hardware node, not its simulated value.
    assert(
      (dataWidth > 0).B && (dataWidth.U % sizeInBits === 0.U),
      "coalesced data width (%d) not evenly divisible by core req size (%d)",
      dataWidth.U,
      sizeInBits
    )

    val numChunks = dataWidth / 32
    val chunks = Wire(Vec(numChunks, UInt(32.W)))
    val offsets = (0 until numChunks)
    (chunks zip offsets).foreach { case (c, o) =>
      // FIXME: whether to take the offset from MSB or LSB depends on
      // endianness.  Right now we're assuming little endian
      c := data(32 * (o + 1) - 1, 32 * o)
      // If taking from MSB:
      // c := (data >> (dataWidth - (o + 1) * 32)) & sizeMask
    }
    chunks(offset) // MUX
  }

  // Un-coalesce responses back to individual lanes
  val found = inflightTable.io.lookup.bits
  (found.lanes zip io.uncoalResps).foreach { case (perLane, ioPerLane) =>
    perLane.reqs.zipWithIndex.foreach { case (oldReq, depth) =>
      val ioOldReq = ioPerLane(depth)

      // TODO: spatial-only coalescing: only looking at 0th srcId entry
      // Default: nothing to report for this slot.
      ioOldReq.valid := false.B
      ioOldReq.bits := DontCare

      // Only slots recorded as valid in the entry that was found get a
      // response.  The asserts inside getCoalescedDataChunk are elaborated
      // within this `when` scope as well.
      when(inflightTable.io.lookup.valid && oldReq.valid) {
        ioOldReq.valid := oldReq.valid
        ioOldReq.bits.source := oldReq.source
        val logSize = found.sizeEnumT.enumToLogSize(oldReq.sizeEnum)
        ioOldReq.bits.size := logSize
        ioOldReq.bits.data :=
          getCoalescedDataChunk(
            io.coalResp.bits.data,
            io.coalResp.bits.data.getWidth,
            oldReq.offset,
            logSize
          )
      }
    }
  }
}
/** Table of coalesced requests that have not been answered yet.
  *
  * For each such request it records which lanes the coalesced request was
  * built from, what their original TileLink source IDs were, etc.  This
  * information is used to split the coalesced response back into individual
  * per-lane responses with the right metadata.
  *
  * An entry is written at index `io.enq.bits.source` when `io.enq` fires, and
  * is invalidated as soon as a lookup of that index fires.
  */
class InflightCoalReqTable(config: CoalescerConfig) extends Module {
  val offsetBits = config.maxCoalLogSize - config.wordWidth // assumes word offset
  // NOTE(review): the entry type below is built with `config.maxCoalLogSize` as
  // its offset width, not with the `offsetBits` value computed above (which,
  // within this class, is only printed).  Confirm which one is intended.
  val entryT = new InflightCoalReqTableEntry(
    numLanes = config.numLanes,
    numPerLaneReqs = config.queueDepth,
    sourceWidth = log2Ceil(config.numOldSrcIds),
    offsetBits = config.maxCoalLogSize,
    sizeEnumT = config.sizeEnum
  )

  val entries = config.numNewSrcIds
  val sourceWidth = log2Ceil(config.numOldSrcIds)

  println(s"=========== table sourceWidth: ${sourceWidth}")
  println(s"=========== table offsetBits: ${offsetBits}")
  println(s"=========== table sizeEnumBits: ${entryT.sizeEnumT.getWidth}")

  val io = IO(new Bundle {
    val enq = Flipped(Decoupled(entryT))
    // TODO: return actual stuff
    val lookup = Decoupled(entryT)
    // TODO: put this inside decoupledIO
    // NOTE(review): this index is log2Ceil(numOldSrcIds) bits wide while the
    // table holds numNewSrcIds entries -- confirm the two always agree.
    val lookupSourceId = Input(UInt(sourceWidth.W))
  })

  // One slot per coalesced source ID, each with its own valid bit.
  val table = Mem(
    entries,
    new Bundle {
      val valid = Bool()
      val bits = entryT.cloneType
    }
  )

  // Clear every slot while reset is asserted.
  when(reset.asBool) {
    for (i <- 0 until entries) {
      table(i).valid := false.B
      for {
        lane <- table(i).bits.lanes
        req <- lane.reqs
      } {
        req.valid := false.B
        req.source := 0.U
        req.offset := 0.U
        req.sizeEnum := config.sizeEnum.INVALID
      }
    }
  }

  // The table counts as full only when every single slot is occupied.
  val full = Wire(Bool())
  full := Seq.tabulate(entries)(i => table(i).valid).reduce(_ && _)
  assert(!full, "inflight table is full and blocking coalescer")
  dontTouch(full)

  // Enqueue logic
  io.enq.ready := !full
  val enqFire = io.enq.ready && io.enq.valid
  when(enqFire) {
    // TODO: handle enqueueing and looking up the same entry in the same cycle?
    val entryToWrite = table(io.enq.bits.source)
    assert(
      !entryToWrite.valid,
      "tried to enqueue to an already occupied entry"
    )
    entryToWrite.valid := true.B
    entryToWrite.bits := io.enq.bits
  }

  // Lookup logic: the indexed slot is always presented; `valid` tells whether
  // it is actually occupied.
  io.lookup.valid := table(io.lookupSourceId).valid
  io.lookup.bits := table(io.lookupSourceId).bits
  val lookupFire = io.lookup.ready && io.lookup.valid
  // Dequeue as soon as lookup succeeds
  when(lookupFire) {
    table(io.lookupSourceId).valid := false.B
  }

  dontTouch(io.lookup)
}
/** One entry of the in-flight coalesced request table.
  *
  * @param numLanes       number of lanes tracked per entry
  * @param numPerLaneReqs maximum number of requests from a single lane that
  *                       can get coalesced into a single request
  * @param sourceWidth    bit width of the source ID fields
  * @param offsetBits     bit width of the per-request offset field
  * @param sizeEnumT      enum type used to encode each request's size
  */
class InflightCoalReqTableEntry(
    val numLanes: Int,
    val numPerLaneReqs: Int,
    val sourceWidth: Int,
    val offsetBits: Int,
    val sizeEnumT: InFlightTableSizeEnum
) extends Bundle {

  /** Metadata of one original (pre-coalescing) request. */
  class PerCoreReq extends Bundle {
    val valid = Bool() // FIXME: delete this
    // FIXME: oldId and newId shares the same width
    val source = UInt(sourceWidth.W)
    val offset = UInt(offsetBits.W)
    val sizeEnum = sizeEnumT()
  }

  /** All request slots that belong to a single lane. */
  class PerLane extends Bundle {
    val reqs = Vec(numPerLaneReqs, new PerCoreReq)
  }

  // sourceId of the coalesced response that just came back.  This will be the
  // key that queries the table.
  val source = UInt(sourceWidth.W)
  val lanes = Vec(numLanes, new PerLane)
}
/** Helpers that classify TileLink opcodes as stores or loads. */
object TLUtils {

  /** Returns true when the A-channel `opcode` is PutFullData.
    *
    * Asserts that the opcode is either PutFullData or Get; any other opcode
    * (e.g. PutPartialData) is treated as unhandled.
    */
  def AOpcodeIsStore(opcode: UInt): Bool = {
    // 0: PutFullData, 1: PutPartialData, 4: Get
    val isPutFull = opcode === TLMessages.PutFullData
    val isGet = opcode === TLMessages.Get
    assert(
      isPutFull || isGet,
      "unhandled TL A opcode found"
    )
    isPutFull
  }

  /** Returns true when the D-channel `opcode` is AccessAck, i.e. the response
    * carries no data.
    *
    * Asserts that the opcode is either AccessAck or AccessAckData.
    */
  def DOpcodeIsStore(opcode: UInt): Bool = {
    val isAck = opcode === TLMessages.AccessAck
    val isAckData = opcode === TLMessages.AccessAckData
    assert(
      isAck || isAckData,
      "unhandled TL D opcode found"
    )
    isAck
  }
}
/** Diplomatic wrapper of the memory-trace driver: one TileLink client node per
  * lane, all funnelled through a single identity node.
  *
  * @param config         coalescer configuration; supplies the number of lanes
  *                       and the per-lane source ID range
  * @param filename       trace file handed to the module implementation
  * @param traceHasSource true if the input trace file has an additional source
  *                       ID column.  This is useful for using the output trace
  *                       file generated by MemTraceLogger as the driver.
  */
class MemTraceDriver(config: CoalescerConfig, filename: String, traceHasSource: Boolean = false)
    (implicit p: Parameters) extends LazyModule {
  // Create N client nodes together, one per lane.
  val laneNodes = Seq.tabulate(config.numLanes) { lane =>
    val laneMaster = TLMasterParameters.v1(
      name = s"MemTraceDriver$lane",
      sourceId = IdRange(0, config.numOldSrcIds)
      // visibility = Seq(AddressSet(0x0000, 0xffffff))
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(Seq(laneMaster))))
  }

  // Combine the N outgoing client nodes into 1 identity node for diplomatic
  // connection.
  val node = TLIdentityNode()
  laneNodes.foreach(node := _)

  lazy val module = new MemTraceDriverImp(this, config, filename, traceHasSource)
}
// Set of fields that make up one line of a memory trace.
//
// Implemented both by the per-lane TraceLine bundle and by the `trace_log`
// port bundle of the SimMemTraceLogger blackbox, where each field is the
// concatenation of all lanes.  Widths are left to the implementor.
trait HasTraceLine {
  // whether this line carries a real request/response
  val valid: UInt
  // TileLink source ID
  val source: UInt
  // memory address of the access
  val address: UInt
  // 1 for stores, 0 for loads
  val is_store: UInt
  // log2 of the access size in bytes
  val size: UInt
  // data carried by the access
  val data: UInt
}
// One trace line of a single lane.
//
// Used for both requests and responses.  Responses have address set to 0.
// NOTE: these widths have to agree with what's hardcoded in Verilog.
class TraceLine extends Bundle with HasTraceLine {
  val valid = Bool()
  val source = UInt(32.W)
  val address = UInt(64.W) // FIXME: in Verilog this is the same as data width
  val is_store = Bool()
  val size = UInt(8.W) // this is log2(bytesize) as in TL A bundle
  val data = UInt(64.W)
}
/** Module implementation of [[MemTraceDriver]].
  *
  * Reads one warp's worth of trace lines per cycle from the SimMemTrace
  * blackbox, buffers them in per-lane queues, and converts each buffered line
  * into a word-granular TileLink Get or Put on that lane's client node.
  * `io.finished` is raised a fixed number of cycles after the blackbox reports
  * that the trace has been fully read.
  */
class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, filename: String,
                        traceHasSource: Boolean)
    extends LazyModuleImp(outer)
    with UnitTestModule {
  // Current cycle mark to read from trace
  val traceReadCycle = RegInit(1.U(64.W))

  // A decoupling queue to handle backpressure from downstream.  We let the
  // downstream take requests from the queue individually for each lane,
  // but do synchronized enqueue whenever all lane queue is ready to prevent
  // drifts between the lane.
  val reqQueues = Seq.fill(config.numLanes)(Module(new Queue(new TraceLine, 2)))
  // Are we safe to read the next warp?
  val reqQueueAllReady = reqQueues.map(_.io.enq.ready).reduce(_ && _)

  val sim = Module(new SimMemTrace(filename, config.numLanes, traceHasSource))
  sim.io.clock := clock
  sim.io.reset := reset.asBool
  // 'sim.io.trace_read.ready' is a ready signal going into the DPI sim,
  // indicating this Chisel module is ready to read the next line.
  sim.io.trace_read.ready := reqQueueAllReady
  sim.io.trace_read.cycle := traceReadCycle

  // Read output from Verilog BlackBox
  // Split output of SimMemTrace, which is flattened across all lanes, back to
  // each lane's.
  val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
  val addrW = laneReqs(0).address.getWidth
  val sizeW = laneReqs(0).size.getWidth
  val dataW = laneReqs(0).data.getWidth
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.trace_read.valid(i)
    req.source := 0.U // driver trace doesn't contain source id
    req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
    req.is_store := sim.io.trace_read.is_store(i)
    req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
    req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
  }

  // Not all fire because trace cycle has to advance even when there is no valid
  // line in the trace.
  when (reqQueueAllReady) {
    traceReadCycle := traceReadCycle + 1.U
  }

  // Enqueue traces to the request queue
  (reqQueues zip laneReqs).foreach { case (reqQ, req) =>
    // Synchronized enqueue
    reqQ.io.enq.valid := reqQueueAllReady && req.valid
    reqQ.io.enq.bits := req // FIXME duplicate valid
  }

  // Issue here is that Vortex mem range is not within Chipyard Mem range
  // In default setting, all mem-req for program data must be within
  // 0X80000000 -> 0X90000000
  /** Forces the top 4 bits of a 32-bit address to 0x8, keeping the low 28 bits
    * of `addr`.
    */
  def hashToValidPhyAddr(addr: UInt): UInt = {
    Cat(8.U(4.W), addr(27, 0))
  }

  // Take requests off of the queue and generate TL requests
  (outer.laneNodes zip reqQueues).foreach { case (node, reqQ) =>
    val (tlOut, edge) = node.out(0)

    val req = reqQ.io.deq.bits
    // backpressure from downstream propagates into the queue
    reqQ.io.deq.ready := tlOut.a.ready

    // Core only makes accesses of granularity larger than a word, so we want
    // the trace driver to act so as well.
    // That means if req.size is smaller than word size, we need to pad data
    // with zeros to generate a word-size request, and set mask accordingly.
    val wordLogSize = log2Ceil(config.wordSizeInBytes)
    val offsetInWord = req.address % config.wordSizeInBytes.U
    val subword = req.size < wordLogSize.U

    // `mask` is currently unused
    val mask = Wire(UInt(config.wordSizeInBytes.W))
    val wordData = Wire(UInt((config.wordSizeInBytes * 8 * 2).W))
    val sizeInBytes = Wire(UInt((sizeW + 1).W))
    sizeInBytes := (1.U) << req.size
    // The full-word mask needs an explicit width: a bare `~0.U` is a single
    // bit wide and would be zero-extended to 0x1 instead of all ones.
    mask := Mux(
      subword,
      (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord,
      ~(0.U(config.wordSizeInBytes.W))
    )
    wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
    val wordAlignedAddress = req.address & ~((1 << wordLogSize) - 1).U(addrW.W)
    // Sub-word accesses are promoted to one full word, whatever the configured
    // word size is.
    val wordAlignedSize = Mux(subword, wordLogSize.U, req.size)

    val sourceGen = Module(new RoundRobinSourceGenerator(log2Ceil(config.numOldSrcIds),
                                                         ignoreInUse = false))
    sourceGen.io.gen := reqQ.io.deq.fire
    // assert(sourceGen.io.id.valid)

    val (plegal, pbits) = edge.Put(
      fromSource = sourceGen.io.id.bits,
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize, // trace line already holds log2(size)
      // data should be aligned to beatBytes
      data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
    )
    val (glegal, gbits) = edge.Get(
      fromSource = sourceGen.io.id.bits,
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize
    )
    val legal = Mux(req.is_store, plegal, glegal)
    val bits = Mux(req.is_store, pbits, gbits)

    // A request is only issued when a free source ID is available.
    tlOut.a.valid := (reqQ.io.deq.valid && sourceGen.io.id.valid)
    when (tlOut.a.valid) {
      assert(legal, "illegal TL req gen")
    }
    tlOut.a.bits := bits
    tlOut.b.ready := true.B
    tlOut.c.valid := false.B
    tlOut.d.ready := true.B
    tlOut.e.valid := false.B

    // Reclaim source id on response
    sourceGen.io.reclaim.valid := tlOut.d.valid
    sourceGen.io.reclaim.bits := tlOut.d.bits.source

    // debug
    when(tlOut.a.valid) {
      TLPrintf(
        "MemTraceDriver",
        tlOut.a.bits.source,
        tlOut.a.bits.address,
        tlOut.a.bits.size,
        tlOut.a.bits.mask,
        req.is_store,
        tlOut.a.bits.data,
        req.data
      )
    }
    dontTouch(tlOut.a)
    dontTouch(tlOut.d)
  }

  // Give some slack time after trace EOF to the downstream system so that we
  // make sure to receive all outstanding responses.
  val finishCounter = RegInit(200.U(64.W))
  // Saturate at zero so that `io.finished` stays asserted instead of pulsing
  // for a single cycle before the counter wraps around.
  when(sim.io.trace_read.finished && (finishCounter =/= 0.U)) {
    finishCounter := finishCounter - 1.U
  }
  io.finished := (finishCounter === 0.U)

  when(io.finished) {
    assert(
      false.B,
      "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)"
    )
  }
}
// Blackbox around the Verilog/DPI trace reader (SimMemTrace.v / SimMemTrace.cc).
//
// filename, numLanes and traceHasSource are forwarded to the Verilog module as
// the FILENAME, NUM_LANES and HAS_SOURCE parameters; HAS_SOURCE is passed as
// 1 or 0.
class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
    extends BlackBox(
      Map("FILENAME" -> filename,
          "NUM_LANES" -> numLanes,
          "HAS_SOURCE" -> (if (traceHasSource) 1 else 0))
    )
    with HasBlackBoxResource {
  // Per-lane field widths are taken from TraceLine so that both sides agree.
  val traceLineT = new TraceLine
  val addrW = traceLineT.address.getWidth
  val sizeW = traceLineT.size.getWidth
  val dataW = traceLineT.data.getWidth

  val io = IO(new Bundle {
    val clock = Input(Clock())
    val reset = Input(Bool())

    // These names have to match declarations in the Verilog code, eg.
    // trace_read_address.
    val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source
      // driven by the Chisel side when it can accept the next trace line
      val ready = Input(Bool())
      // one valid bit per lane
      val valid = Output(UInt(numLanes.W))
      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
      // single wide 1D array.
      // TODO: assumes 64-bit address.
      // trace cycle the Chisel side wants to read
      val cycle = Input(UInt(64.W))
      val address = Output(UInt((addrW * numLanes).W))
      // one is_store bit per lane
      val is_store = Output(UInt(numLanes.W))
      val size = Output(UInt((sizeW * numLanes).W))
      val data = Output(UInt((dataW * numLanes).W))
      // presumably raised once the whole trace has been consumed -- the driver
      // uses it to start its shutdown countdown
      val finished = Output(Bool())
    }
  })

  addResource("/vsrc/SimMemTrace.v")
  addResource("/csrc/SimMemTrace.cc")
  addResource("/csrc/SimMemTrace.h")
}
// Transparent TileLink snooper that logs traffic to trace files.
//
// The logger sits on `numLanes` TileLink edges through an identity node and
// forwards the A and D channels unchanged.  Every A beat and every D beat that
// fires is turned into a TraceLine and handed to a SimMemTraceLogger blackbox
// (one for requests, one for responses, each optional).  It also keeps running
// counts of requests, responses and their byte totals, exposed on the module's
// `io`.
//
// NOTE(review): the trace file names are built as
// s"${filename}.${loggerName}.req" / ".resp", so the default loggerName of
// ".logger" yields a double dot ("<filename>..logger.req").
class MemTraceLogger(
    numLanes: Int,
    // base filename for the generated trace files. full filename will be
    // suffixed depending on `reqEnable`/`respEnable`/`loggerName`.
    filename: String,
    reqEnable: Boolean = true,
    respEnable: Boolean = true,
    // filename suffix that is unique to this logger module.
    loggerName: String = ".logger"
)(implicit
    p: Parameters
) extends LazyModule {
  val node = TLIdentityNode()

  // val beatBytes = 8 // FIXME: hardcoded
  // val node = TLManagerNode(Seq.tabulate(numLanes) { _ =>
  //   TLSlavePortParameters.v1(
  //     Seq(
  //       TLSlaveParameters.v1(
  //         address = List(AddressSet(0x0000, 0xffffff)), // FIXME: hardcoded
  //         supportsGet = TransferSizes(1, beatBytes),
  //         supportsPutPartial = TransferSizes(1, beatBytes),
  //         supportsPutFull = TransferSizes(1, beatBytes)
  //       )
  //     ),
  //     beatBytes = beatBytes
  //   )
  // })

  // Copied from freechips.rocketchip.trailingZeros which only supports Scala
  // integers
  //
  // Returns the number of trailing zero bits of `x`, or the width of `x` when
  // `x` is zero.  `x & -x` isolates the lowest set bit.
  def trailingZeros(x: UInt): UInt = {
    Mux(x === 0.U, x.widthOption.get.U, Log2(x & -x))
  }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) {
    // Running traffic statistics since reset.
    val io = IO(new Bundle {
      val numReqs = Output(UInt(64.W))
      val numResps = Output(UInt(64.W))
      val reqBytes = Output(UInt(64.W))
      val respBytes = Output(UInt(64.W))
    })

    val numReqs = RegInit(0.U(64.W))
    val numResps = RegInit(0.U(64.W))
    val reqBytes = RegInit(0.U(64.W))
    val respBytes = RegInit(0.U(64.W))
    io.numReqs := numReqs
    io.numResps := numResps
    io.reqBytes := reqBytes
    io.respBytes := respBytes

    // The request / response blackboxes only exist when enabled.
    val simReq =
      if (reqEnable)
        Some(Module(new SimMemTraceLogger(false, s"${filename}.${loggerName}.req", numLanes)))
      else None
    val simResp =
      if (respEnable)
        Some(Module(new SimMemTraceLogger(true, s"${filename}.${loggerName}.resp", numLanes)))
      else None
    if (simReq.isDefined) {
      simReq.get.io.clock := clock
      simReq.get.io.reset := reset.asBool
    }
    if (simResp.isDefined) {
      simResp.get.io.clock := clock
      simResp.get.io.reset := reset.asBool
    }

    val laneReqs = Wire(Vec(numLanes, new TraceLine))
    val laneResps = Wire(Vec(numLanes, new TraceLine))

    // Elaboration-time check on Scala values.
    assert(
      numLanes == node.in.length,
      "`numLanes` does not match the number of TL edges connected to the MemTraceLogger"
    )

    // snoop on the TileLink edges to log traffic
    ((node.in zip node.out) zip (laneReqs zip laneResps)).foreach {
      case (((tlIn, _), (tlOut, _)), (req, resp)) =>
        // Pass both channels through untouched.
        tlOut.a <> tlIn.a
        tlIn.d <> tlOut.d

        // requests on TL A channel
        //
        // Only log trace when fired, i.e. both upstream and downstream is ready
        // and transaction happened.
        req.valid := tlIn.a.fire
        req.size := tlIn.a.bits.size
        // NOTE(review): AOpcodeIsStore contains an assertion on the opcode, and
        // this call is not gated by tlIn.a.valid -- confirm the opcode is
        // always one of the handled values even when the channel is idle.
        req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
        req.source := tlIn.a.bits.source
        // TL always carries the exact unaligned address that the client
        // originally requested, so no postprocessing required
        req.address := tlIn.a.bits.address

        when(req.valid) {
          TLPrintf(
            s"MemTraceLogger (${loggerName}:downstream)",
            tlIn.a.bits.source,
            tlIn.a.bits.address,
            tlIn.a.bits.size,
            tlIn.a.bits.mask,
            req.is_store,
            tlIn.a.bits.data,
            req.data
          )
        }

        // TL data
        //
        // When tlIn.a.bits.size is smaller than the data bus width, need to
        // figure out which byte lanes we actually accessed so that
        // we can write that to the memory trace.
        // See Section 4.5 Byte Lanes in spec 1.8.1

        // This assert only holds true for PutFullData and not PutPartialData,
        // where HIGH bits in the mask may not be contiguous.
        when (tlIn.a.valid) {
          assert(
            PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
            "mask HIGH popcount do not match the TL size. " +
              "Partial masks are not allowed for PutFull"
          )
        }
        // The lowest active byte lane gives the byte offset of the accessed
        // data within the bus.
        val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
        val dataW = tlIn.params.dataBits
        // Ones in the low (2^size * 8) bits, i.e. covering the accessed bytes.
        val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
        // Shift the accessed bytes down to bit 0 and drop everything above.
        req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
        // when (req.valid) {
        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
        // }

        // responses on TL D channel
        //
        // Only log trace when fired, i.e. both upstream and downstream is ready
        // and transaction happened.
        resp.valid := tlOut.d.fire
        resp.size := tlOut.d.bits.size
        // NOTE(review): same as for the A channel, the opcode assertion inside
        // DOpcodeIsStore is not gated by tlOut.d.valid.
        resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode)
        resp.source := tlOut.d.bits.source
        // NOTE: TL D channel doesn't carry address nor mask, so there's no easy
        // way to figure out which bytes the master actually use.  Since we
        // don't care too much about addresses in the trace anyway, just store
        // the entire bits.
        resp.address := 0.U
        resp.data := tlOut.d.bits.data
    }

    // stats
    // Per-cycle increments: number of lanes that fired, and the bytes they
    // moved (2^size per fired lane).
    val numReqsThisCycle =
      laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
    val numRespsThisCycle =
      laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 }
    val reqBytesThisCycle =
      laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
        b0 + b1
      }
    val respBytesThisCycle =
      laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) =>
        b0 + b1
      }
    numReqs := numReqs + numReqsThisCycle
    numResps := numResps + numRespsThisCycle
    reqBytes := reqBytes + reqBytesThisCycle
    respBytes := respBytes + respBytesThisCycle

    // Flatten per-lane signals to the Verilog blackbox input.
    //
    // This is a clunky workaround of the fact that Chisel doesn't allow partial
    // assignment to a bitfield range of a wide signal.
    //
    // Each field of `perLane` is gathered into a Vec and connected to the
    // same-named field of `simIO` as one wide UInt.
    def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
      // these will get optimized out
      val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
      val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
      val vecAddress = Wire(Vec(numLanes, chiselTypeOf(perLane(0).address)))
      val vecIsStore = Wire(Vec(numLanes, chiselTypeOf(perLane(0).is_store)))
      val vecSize = Wire(Vec(numLanes, chiselTypeOf(perLane(0).size)))
      val vecData = Wire(Vec(numLanes, chiselTypeOf(perLane(0).data)))
      perLane.zipWithIndex.foreach { case (l, i) =>
        vecValid(i) := l.valid
        vecSource(i) := l.source
        vecAddress(i) := l.address
        vecIsStore(i) := l.is_store
        vecSize(i) := l.size
        vecData(i) := l.data
      }
      simIO.valid := vecValid.asUInt
      simIO.source := vecSource.asUInt
      simIO.address := vecAddress.asUInt
      simIO.is_store := vecIsStore.asUInt
      simIO.size := vecSize.asUInt
      simIO.data := vecData.asUInt
    }

    if (simReq.isDefined) {
      flattenTrace(simReq.get.io.trace_log, laneReqs)
      assert(
        simReq.get.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
      )
    }
    if (simResp.isDefined) {
      flattenTrace(simResp.get.io.trace_log, laneResps)
      assert(
        simResp.get.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
      )
    }
  }
}
// Blackbox around the Verilog/DPI trace writer (SimMemTraceLogger.v /
// SimMemTraceLogger.cc).
//
// MemTraceLogger is bidirectional, and `isResponse` is how the DPI module tells
// itself whether it's logging the request stream or the response stream.  This
// is necessary because we have to generate slightly different trace format
// depending on this, e.g. response trace will not contain an address column.
//
// isResponse, filename and numLanes are forwarded to the Verilog module as the
// IS_RESPONSE (1 or 0), FILENAME and NUM_LANES parameters.
class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int)
    extends BlackBox(
      Map(
        "IS_RESPONSE" -> (if (isResponse) 1 else 0),
        "FILENAME" -> filename,
        "NUM_LANES" -> numLanes
      )
    )
    with HasBlackBoxResource {
  // Per-lane field widths are taken from TraceLine so that both sides agree.
  val traceLineT = new TraceLine
  val sourceW = traceLineT.source.getWidth
  val addrW = traceLineT.address.getWidth
  val sizeW = traceLineT.size.getWidth
  val dataW = traceLineT.data.getWidth

  val io = IO(new Bundle {
    val clock = Input(Clock())
    val reset = Input(Bool())

    // Every field below is the concatenation of all lanes.
    val trace_log = new Bundle with HasTraceLine {
      // one valid bit per lane
      val valid = Input(UInt(numLanes.W))
      val source = Input(UInt((sourceW * numLanes).W))
      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
      // single wide 1D array.
      // TODO: assumes 64-bit address.
      val address = Input(UInt((addrW * numLanes).W))
      // one is_store bit per lane
      val is_store = Input(UInt(numLanes.W))
      val size = Input(UInt((sizeW * numLanes).W))
      val data = Input(UInt((dataW * numLanes).W))
      // driven by the blackbox; MemTraceLogger asserts that it is always high
      val ready = Output(Bool())
    }
  })

  addResource("/vsrc/SimMemTraceLogger.v")
  addResource("/csrc/SimMemTraceLogger.cc")
  addResource("/csrc/SimMemTrace.h")
}
// Empty companion class; the printing helper lives in the object below.
class TLPrintf {}

/** Debug helper that prints one TileLink transaction per line. */
object TLPrintf {

  /** Emits a simulation printf describing a TileLink beat.
    *
    * The source, address, size, mask and store flag are always printed; the
    * two data values are appended only when `is_store` is high.
    *
    * @param printer  tag prepended to the line, identifying who is printing
    * @param tlData   data as seen on the TileLink bus
    * @param reqData  data as seen in the originating request
    */
  def apply(
      printer: String,
      source: UInt,
      address: UInt,
      size: UInt,
      mask: UInt,
      is_store: Bool,
      tlData: UInt,
      reqData: UInt
  ) = {
    // Common header, printed for loads and stores alike.
    printf(
      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
      source,
      address,
      size,
      mask,
      is_store
    )
    // Payload, stores only.
    when(is_store) {
      printf(", tlData=%x, reqData=%x", tlData, reqData)
    }
    // Line terminator.
    printf("\n")
  }
}
// Synthesizable unit tests
/** Diplomatic wrapper of the dummy traffic generator: one TileLink client node
  * per lane, all funnelled through a single identity node.
  */
class DummyDriver(config: CoalescerConfig)(implicit p: Parameters)
    extends LazyModule {
  // One client node per lane, each owning source IDs [0, numOldSrcIds).
  val laneNodes = Seq.tabulate(config.numLanes) { lane =>
    val laneMaster = TLMasterParameters.v1(
      name = s"dummy-core-node-$lane",
      sourceId = IdRange(0, config.numOldSrcIds)
      // visibility = Seq(AddressSet(0x0000, 0xffffff))
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(Seq(laneMaster))))
  }

  // Combine the N outgoing client nodes into 1 identity node for diplomatic
  // connection.
  val node = TLIdentityNode()
  laneNodes.foreach(node := _)

  lazy val module = new DummyDriverImp(this, config)
}
/** Module implementation of [[DummyDriver]].
  *
  * Issues a 4-byte Put on every lane in every cycle, with address and data
  * derived from `finishCounter`, so that the coalescer has live traffic and is
  * not optimized out during synthesis.
  */
class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
    extends LazyModuleImp(outer)
    with UnitTestModule {
  // Free-running source ID, shared by all lanes.
  val sourceIdCounter = RegInit(0.U(log2Ceil(config.numOldSrcIds).W))
  sourceIdCounter := sourceIdCounter + 1.U

  val finishCounter = RegInit(10000.U(64.W))
  // NOTE(review): this connection is superseded by the later connection to
  // finishCounter at the end of this class (last connect wins), so the
  // register is effectively only updated by `finishCounter + dataSum`.
  // Confirm whether a count-down was intended.
  finishCounter := finishCounter - 1.U
  io.finished := (finishCounter === 0.U)

  outer.laneNodes.zipWithIndex.foreach { case (node, lane) =>
    assert(node.out.length == 1)

    // generate dummy traffic to coalescer to prevent it from being optimized
    // out during synthesis
    val laneSeed = finishCounter + (lane.U % 3.U)
    val address = Wire(UInt(config.addressWidth.W))
    address := Cat(laneSeed, 0.U(config.wordWidth.W))

    val (tl, edge) = node.out(0)
    val (legal, bits) = edge.Put(
      fromSource = sourceIdCounter,
      toAddress = address,
      lgSize = 2.U,
      data = laneSeed
    )
    assert(legal, "illegal TL req gen")

    // Always requesting, always accepting responses; B/C/E are unused.
    tl.a.valid := true.B
    tl.a.bits := bits
    tl.b.ready := true.B
    tl.c.valid := false.B
    tl.d.ready := true.B
    tl.e.valid := false.B
  }

  // Sum of the response data of all lanes that have a valid D beat this cycle.
  val dataSum = outer.laneNodes
    .map { node =>
      val (tl, _) = node.out(0)
      Mux(tl.d.valid, tl.d.bits.data, 0.U)
    }
    .reduce(_ +& _)
  // this doesn't make much sense, but it prevents the entire uncoalescer from
  // being optimized away
  finishCounter := finishCounter + dataSum
}
/** A dummy harness around the coalescer for use in VLSI flow:
  * dummy driver --> coalescer --> TLRAMs.
  *
  * Should not instantiate any memtrace modules.
  */
class DummyCoalescer(implicit p: Parameters) extends LazyModule {
  val numLanes = p(SIMTCoreKey).get.nLanes
  println(s"============ numLanes: ${numLanes}")
  val config = defaultConfig.copy(numLanes = numLanes)

  val driver = LazyModule(new DummyDriver(config))
  // One RAM per lane, +1 for the coalesced edge.
  val rams = Seq.fill(config.numLanes + 1)(
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffff),
        beatBytes = (1 << config.dataBusWidth)
      )
    )
  )
  val coal = LazyModule(new CoalescingUnit(config))

  coal.cpuNode :=* driver.node
  rams.foreach { ram => ram.node := coal.aggregateNode }

  lazy val module = new Impl

  /** The harness finishes when the driver does. */
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    io.finished := driver.module.io.finished
  }
}
/** Unit-test wrapper that instantiates [[DummyCoalescer]] and forwards the
  * start/finished handshake to it.
  *
  * @param timeout passed on to the UnitTest base class
  */
class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new DummyCoalescer).module)

  io.finished := dut.io.finished
  dut.io.start := io.start
}
/** Trace-driven test harness with logging on both sides of the coalescer:
  * tracedriver --> (coreside logger) --> coalescer --> (memside logger) --> tlram
  *
  * When the driver reports that it is finished, the core-side request and
  * response statistics are printed and checked against each other.
  *
  * @param filename trace file fed to the driver; also the base name of the
  *                 trace files written by the loggers
  */
class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
  val numLanes = p(SIMTCoreKey).get.nLanes
  val config = defaultConfig.copy(numLanes = numLanes)

  val driver = LazyModule(new MemTraceDriver(config, filename))
  val coreSideLogger = LazyModule(
    new MemTraceLogger(numLanes, filename, loggerName = "coreside")
  )
  val coal = LazyModule(new CoalescingUnit(config))
  val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside"))
  val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(address = AddressSet(0x0000, 0xffffff),
                beatBytes = (1 << config.dataBusWidth))
    )
  )

  memSideLogger.node :=* coal.aggregateNode
  coal.cpuNode :=* coreSideLogger.node :=* driver.node
  rams.foreach { r => r.node := memSideLogger.node }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    driver.module.io.start := io.start
    io.finished := driver.module.io.finished

    when(io.finished) {
      val stats = coreSideLogger.module.io
      printf(
        "numReqs=%d, numResps=%d, reqBytes=%d, respBytes=%d\n",
        stats.numReqs,
        stats.numResps,
        stats.reqBytes,
        stats.respBytes
      )
      // Every request must have been answered, byte for byte.
      val trafficMatches =
        (stats.numReqs === stats.numResps) &&
          (stats.reqBytes === stats.respBytes)
      assert(
        trafficMatches,
        "FAIL: requests and responses traffic to the coalescer do not match"
      )
      // Only claim success when the check above actually holds; printing it
      // unconditionally would emit SUCCESS alongside the FAIL assertion.
      when(trafficMatches) {
        printf("SUCCESS: coalescer response traffic matched requests!\n")
      }
    }
  }
}
/** Unit-test wrapper that instantiates [[TLRAMCoalescerLogger]] and forwards
  * the start/finished handshake to it.
  *
  * @param filename trace file passed on to the harness
  * @param timeout  passed on to the UnitTest base class
  */
class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)

  io.finished := dut.io.finished
  dut.io.start := io.start
}
/** Trace-driven test harness without logging:
  * tracedriver --> coalescer --> tlram
  *
  * Uses `defaultConfig` as is, together with a fixed trace file.
  */
class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
  // TODO: use parameters for numLanes
  val numLanes = 4
  val filename = "vecadd.core1.thread4.trace"

  val coal = LazyModule(new CoalescingUnit(defaultConfig))
  val driver = LazyModule(new MemTraceDriver(defaultConfig, filename))
  // One RAM per lane, +1 for the coalesced edge.
  val rams = Seq.fill(numLanes + 1)(
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffff),
        beatBytes = (1 << defaultConfig.dataBusWidth)
      )
    )
  )

  coal.cpuNode :=* driver.node
  rams.foreach { ram => ram.node := coal.aggregateNode }

  lazy val module = new Impl

  /** Forwards the unit-test handshake to the trace driver. */
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    io.finished := driver.module.io.finished
    driver.module.io.start := io.start
  }
}
/** Unit-test wrapper that instantiates [[TLRAMCoalescer]] and forwards the
  * start/finished handshake to it.
  *
  * @param timeout passed on to the UnitTest base class
  */
class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescer).module)

  io.finished := dut.io.finished
  dut.io.start := io.start
}
////////////
////////////
////////////
//////////// Code for CoalArbiter
////////////
////////////
// Lazy Module is needed to instantiate outgoing node
/** Diplomatic shell of the coalescer arbiter.
  *
  * A LazyModule is needed to instantiate the outgoing TileLink nodes.  It owns
  * `config.numArbiterOutputPorts` client nodes for non-coalesced requests
  * (each behind a width widget), the same number of client nodes for
  * coalesced requests, and the bundle types handed to the module
  * implementation.
  */
class CoalArbiter(config: CoalescerConfig) (implicit p: Parameters) extends LazyModule {
  // Let SIMT's word size be 32, and read/write granularity be 256

  // Size of the source ID space covering every lane's old IDs plus every
  // coalesced request's new IDs.
  val fullSourceIdRange =
    config.numOldSrcIds * config.numLanes + config.numNewSrcIds * config.numCoalReqs

  // K client nodes of edge size 32 for non-coalesced reqs
  val nonCoalNarrowNodes = Seq.tabulate(config.numArbiterOutputPorts) { port =>
    val narrowMaster = TLMasterParameters.v1(
      name = s"NonCoalNarrowNode$port",
      sourceId = IdRange(0, fullSourceIdRange)
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(Seq(narrowMaster))))
  }
  // One identity node for the non-coalesced requests after width adaptation.
  // You can put a widget between identity node and client node (diplomacy)
  val nonCoalNode = TLIdentityNode()
  nonCoalNarrowNodes.foreach { narrowNode =>
    nonCoalNode := TLWidthWidget(config.wordSizeInBytes) := narrowNode
  }

  // K client nodes of edge size 256 for the coalesced reqs
  val coalReqNodes = Seq.tabulate(config.numArbiterOutputPorts) { port =>
    val coalMaster = TLMasterParameters.v1(
      name = s"CoalReqNode$port",
      sourceId = IdRange(0, fullSourceIdRange)
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(Seq(coalMaster))))
  }
  // 1 identity node for the coalesced reqs
  val coalNode = TLIdentityNode()
  coalReqNodes.foreach { coalReqNode =>
    coalNode := coalReqNode
  }

  // Assertion section

  /** True when `n` is positive and has exactly one bit set. */
  def isPowerOfTwo(n: Int): Boolean = {
    n > 0 && Integer.bitCount(n) == 1
  }
  assert(isPowerOfTwo(config.numOldSrcIds), "Number of old source id must be power of 2")
  assert(isPowerOfTwo(config.numNewSrcIds), "Number of new source id must be power of 2")
  // Below is for efficient conversion from global to local bits.
  // Also, we should have more source ids for coalesced requests for better perf
  assert(config.numNewSrcIds >= config.numOldSrcIds, "new source id must be equal or greater than old source id")

  // 1 final output identity node
  val outputNode = TLIdentityNode()

  // Bundle types of the request / response ports of the module implementation.
  val nonCoalEntryT = new ReqQueueEntry(
    log2Ceil(config.numOldSrcIds),
    config.wordWidth,
    config.addressWidth,
    log2Ceil(config.wordSizeInBytes)
  )
  val coalEntryT = new ReqQueueEntry(
    log2Ceil(config.numOldSrcIds),
    log2Ceil(config.maxCoalLogSize),
    config.addressWidth,
    config.maxCoalLogSize // already log 2
  )
  val respNonCoalEntryT = new RespQueueEntry(
    log2Ceil(config.numOldSrcIds),
    config.wordWidth,
    log2Ceil(config.wordSizeInBytes)
  )
  val respCoalBundleT = new CoalescedResponseBundle(config)

  lazy val module = new CoalArbiterImpl(
    this,
    config,
    nonCoalEntryT,
    coalEntryT,
    respNonCoalEntryT,
    respCoalBundleT
  )
}
class CoalArbiterImpl(outer: CoalArbiter,
config: CoalescerConfig,
nonCoalEntryT: ReqQueueEntry,
coalEntryT: ReqQueueEntry,
respNonCoalEntryT: RespQueueEntry,
respCoalBundleT: CoalescedResponseBundle
) extends LazyModuleImp(outer){
val io = IO(new Bundle {
val nonCoalReqs = Vec(config.numLanes, Flipped(Decoupled(nonCoalEntryT)))
val coalReqs = Vec(config.numCoalReqs, Flipped(Decoupled(coalEntryT)))
val nonCoalResps = Vec(config.numLanes, Decoupled(respNonCoalEntryT))
val coalResp = Decoupled(respCoalBundleT)
}
)
//Helper Class & Method Section
//Provide an simple decoupled interface between bundle of 2 different type
class ConverterTunnel[T <: Data, U <: Data](
genA: T,
genB: U,
conversionFn: T => U
) extends Module {
val io = IO(new Bundle {
val in = Flipped(Decoupled(genA.cloneType))
val out = Decoupled(genB.cloneType)
})
io.in.ready := io.out.ready
io.out.valid := io.in.valid
io.out.bits := conversionFn(io.in.bits)
}
// Returns true when the bank-select field of `addr` equals `bankNum`.
//   addr    - request address
//   bankNum - index of the bank (arbiter output port) being tested
// The bank-select field is log2Ceil(numArbiterOutputPorts) bits wide and starts
// at bit (log2Ceil(bankStrideInBytes) - byteOffset).
// Chisel's addr(hi, lo) is inclusive on both ends, so hi must be lo + width - 1;
// extracting one bit more would make some addresses match no bank at all.
// NOTE(review): byteOffset = 3 shifts the field down by three bits relative to
// the byte stride; the reason is not visible here - confirm it is intended.
def canHitBank(addr: UInt, bankNum: UInt) : Bool = {
  val byteOffset = 3
  val bankBase = log2Ceil(config.bankStrideInBytes)
  val bankOffset = log2Ceil(config.numArbiterOutputPorts)
  val lsb = bankBase - byteOffset
  if (bankOffset == 0) {
    // Single bank: there are no select bits, every address maps to it
    true.B
  } else {
    require(lsb >= 0,
      s"bankStrideInBytes (${config.bankStrideInBytes}) is too small for a byte offset of ${byteOffset} bits")
    addr(lsb + bankOffset - 1, lsb) === bankNum
  }
}
// Maps a per-requester (local) source id to a global source id.
// Coalesced requesters occupy the low id range, numNewSrcIds ids each;
// non-coalesced lanes follow after all coalesced ids, numOldSrcIds ids each.
//   isCoalReq - selects the coalesced or the non-coalesced mapping
//   laneIdx   - index of the requester within its group
//   sourceID  - local source id
// Builds multipliers in hardware, so this can be expensive for non-constant inputs.
def toGlobalSourceId(isCoalReq : Bool, laneIdx : UInt, sourceID : UInt) : UInt = {
  // First id after the whole coalesced range
  val nonCoalBase = config.numNewSrcIds.U * config.numCoalReqs.U
  val coalGid = config.numNewSrcIds.U * laneIdx + sourceID
  val nonCoalGid = config.numOldSrcIds.U * laneIdx + sourceID + nonCoalBase
  Mux(isCoalReq, coalGid, nonCoalGid)
}
// Recovers the local source id from a global one.
// Both id counts are powers of two, so the local id is simply the low-order
// bits of the global id: log2Ceil(numNewSrcIds) bits for coalesced requests,
// log2Ceil(numOldSrcIds) bits otherwise.
def toLocalSourceId(isCoalReq : Bool, sourceID : UInt) : UInt = {
  val coalIdBits = log2Ceil(config.numNewSrcIds)
  val nonCoalIdBits = log2Ceil(config.numOldSrcIds)
  Mux(isCoalReq,
    sourceID(coalIdBits - 1, 0),
    sourceID(nonCoalIdBits - 1, 0))
}
// Returns true when the global source id `gid` lies in the id range owned by
// non-coalesced lane `laneIdx`, i.e. in
//   [base + numOldSrcIds * laneIdx, base + numOldSrcIds * (laneIdx + 1))
// where base = numNewSrcIds * numCoalReqs is the first non-coalesced id.
// Width-expanding additions (+&) are used on purpose: the plain `+` keeps the
// operand width, so `laneIdx + 1.U` wraps to 0 for an all-ones lane literal
// (lanes 1, 3, 7, ...) and the bounds themselves could be truncated.
def belongsToLane(laneIdx: UInt, gid: UInt) : Bool = {
  val base = (config.numNewSrcIds * config.numCoalReqs).U
  val lower = base +& (config.numOldSrcIds.U * laneIdx)
  // Exclusive upper bound: one lane's worth of ids above the lower bound
  val upper = lower +& config.numOldSrcIds.U
  (gid >= lower) && (gid < upper)
}
// Returns true when the global source id `gid` belongs to a coalesced request.
// Coalesced ids occupy [0, numNewSrcIds * numCoalReqs); the id equal to that
// bound is already the first non-coalesced id (lane 0, local id 0), so the
// comparison has to be strict.
def isCoalReq(gid : UInt) : Bool = {
  gid < (config.numNewSrcIds * config.numCoalReqs).U
}
//
// Total number of global source ids: every non-coalesced lane contributes
// numOldSrcIds ids and every coalesced requester contributes numNewSrcIds ids.
val fullSourceIdRange = config.numOldSrcIds * config.numLanes + config.numNewSrcIds * config.numCoalReqs
// Non-coalesced request entry whose first constructor argument is sized for the
// full global id range; the remaining arguments match nonCoalEntryT in CoalArbiter.
val nonCoalGiDEntryT = new ReqQueueEntry(
log2Ceil(fullSourceIdRange),
config.wordWidth,
config.addressWidth,
log2Ceil(config.wordSizeInBytes)
)
// Coalesced request entry whose first constructor argument is sized for the
// full global id range; the remaining arguments match coalEntryT in CoalArbiter.
val coalGiDEntryT = new ReqQueueEntry(
log2Ceil(fullSourceIdRange),
log2Ceil(config.maxCoalLogSize),
config.addressWidth,
config.maxCoalLogSize //already log 2
)
// Before a coalesced or non-coalesced request enters an RR arbiter, its source
// id has to be turned into a global source id. That widens the source field,
// so the request is copied into a new bundle of type `reqEntryT`.
// Higher-order function: returns the converter for one requester.
//   laneIndex - index of the requester (lane or coalesced-request slot)
//   reqEntryT - bundle type of the converted request (wide source field)
//   isCoalReq - whether the requester issues coalesced requests
def reqEntry2GidReqFn(laneIndex : UInt, reqEntryT : ReqQueueEntry, isCoalReq : Bool) : ReqQueueEntry => ReqQueueEntry = {
  def func(lid_req : ReqQueueEntry) : ReqQueueEntry = {
    // A bare cloneType is only a type; it must be wrapped in a Wire before it
    // can be connected to (same pattern as the response converters use).
    val gid_req = Wire(reqEntryT.cloneType)
    // Copy every field, then override the source with the global id
    // (the last connection wins).
    gid_req := lid_req
    gid_req.source := toGlobalSourceId(isCoalReq, laneIndex, lid_req.source)
    gid_req
  }
  func
}
// Returns a converter that turns a request entry into a TileLink channel A
// bundle by calling the entry's toTLA with the given outward edge.
def reqEntry2TLAFn(edgeOut: TLEdgeOut) : ReqQueueEntry => TLBundleA =
  (gidReq: ReqQueueEntry) => gidReq.toTLA(edgeOut)
// Returns a converter from a TileLink channel D bundle to a non-coalesced
// response entry. The entry is filled in by its own fromTLD, after which the
// source field is replaced by the local (per-lane) source id.
def tlD2respEntryFn() : TLBundleD => RespQueueEntry = { (bundle: TLBundleD) =>
  val resp = Wire(respNonCoalEntryT)
  resp.fromTLD(bundle)
  // Must come after fromTLD so that this assignment to source is the one that sticks
  resp.source := toLocalSourceId(false.B, bundle.source)
  resp
}
// Returns a converter from a TileLink channel D bundle to a coalesced response
// bundle. The bundle is filled in by its own fromTLD, after which the source
// field is replaced by the local source id of the coalesced request.
def tlD2CoalBundleFn() : TLBundleD => CoalescedResponseBundle = { (bundle: TLBundleD) =>
  val coalbundle = Wire(respCoalBundleT)
  coalbundle.fromTLD(bundle)
  // Must come after fromTLD so that this assignment to source is the one that sticks
  coalbundle.source := toLocalSourceId(true.B, bundle.source)
  coalbundle
}
/////////////////////////////////////////////////////
//HDL Implementation Section
/////////////////////////////////////////////////////
//Stage 1: one single-entry queue (pipe enabled, flow disabled) in front of
// every non-coalesced and every coalesced request input
val nonCoalReqsQueues = Seq.fill(config.numLanes) {
  Module(new Queue(nonCoalEntryT.cloneType, 1, pipe = true, flow = false))
}
val coalReqsQueues = Seq.fill(config.numCoalReqs) {
  Module(new Queue(coalEntryT.cloneType, 1, pipe = true, flow = false))
}
//Stage 1a: feed each queue from its request input
for ((req, q) <- io.nonCoalReqs zip nonCoalReqsQueues) {
  q.io.enq <> req
}
for ((req, q) <- io.coalReqs zip coalReqsQueues) {
  q.io.enq <> req
}
//Stage 1b: connect the output of the non-coalesced queues to the RR arbiters
// (one arbiter per bank, one arbiter input per lane). The coalesced queues are
// wired the same way in a separate loop for readability.
val nonCoalRRArbiters = Seq.tabulate(config.numArbiterOutputPorts){_=>
  Module(new RRArbiter(nonCoalGiDEntryT.cloneType, config.numLanes))
}
nonCoalReqsQueues.zipWithIndex.foreach{ case(q, q_idx) =>
  // Every queue fans out to all banks. For each bank, record whether that bank
  // is the target of the head request AND its arbiter is accepting it.
  val bankAccepts = nonCoalRRArbiters.zipWithIndex.map{ case(arb, arb_idx) =>
    val nonCoal2gidFunc = reqEntry2GidReqFn(q_idx.U, nonCoalGiDEntryT, false.B)
    val nonCoalRRArbTunnel = Module(new ConverterTunnel(
      nonCoalEntryT.cloneType,
      nonCoalGiDEntryT.cloneType,
      nonCoal2gidFunc)
    )
    // Drive only valid/bits here. Bulk-connecting the queue output to every
    // tunnel would drive q.io.deq.ready once per bank, and only the last
    // bank's ready would survive.
    nonCoalRRArbTunnel.io.in.valid := q.io.deq.valid
    nonCoalRRArbTunnel.io.in.bits := q.io.deq.bits
    arb.io.in(q_idx) <> nonCoalRRArbTunnel.io.out
    //OverWrite Valid base on if we can actually hit this bank
    val hitsThisBank = canHitBank(nonCoalRRArbTunnel.io.out.bits.address, arb_idx.U)
    arb.io.in(q_idx).valid := hitsThisBank && nonCoalRRArbTunnel.io.out.valid
    hitsThisBank && nonCoalRRArbTunnel.io.in.ready
  }
  // Dequeue only when the bank the request targets takes it
  q.io.deq.ready := bankAccepts.foldLeft(false.B)(_ || _)
}
// RR arbiters for coalesced requests: one arbiter per bank, one arbiter input
// per coalesced-request queue.
val coalRRArbiters = Seq.tabulate(config.numArbiterOutputPorts){_=>
  Module(new RRArbiter(coalGiDEntryT.cloneType, config.numCoalReqs))
}
coalReqsQueues.zipWithIndex.foreach{ case(q, q_idx) =>
  // Every queue fans out to all banks. For each bank, record whether that bank
  // is the target of the head request AND its arbiter is accepting it.
  val bankAccepts = coalRRArbiters.zipWithIndex.map{ case(arb, arb_idx) =>
    val coal2gidFunc = reqEntry2GidReqFn(q_idx.U, coalGiDEntryT, true.B)
    val coalRRArbTunnel = Module(new ConverterTunnel(
      coalEntryT.cloneType,
      coalGiDEntryT.cloneType,
      coal2gidFunc)
    )
    // Drive only valid/bits here. Bulk-connecting the queue output to every
    // tunnel would drive q.io.deq.ready once per bank, and only the last
    // bank's ready would survive.
    coalRRArbTunnel.io.in.valid := q.io.deq.valid
    coalRRArbTunnel.io.in.bits := q.io.deq.bits
    arb.io.in(q_idx) <> coalRRArbTunnel.io.out
    //OverWrite Valid base on if we can actually hit this bank
    val hitsThisBank = canHitBank(coalRRArbTunnel.io.out.bits.address, arb_idx.U)
    arb.io.in(q_idx).valid := hitsThisBank && coalRRArbTunnel.io.out.valid
    hitsThisBank && coalRRArbTunnel.io.in.ready
  }
  // Dequeue only when the bank the request targets takes it
  q.io.deq.ready := bankAccepts.foldLeft(false.B)(_ || _)
}
//Stage 2: drive channel A of every request node from its RR arbiter.
// The node list and the arbiter list are each concatenated (non-coalesced
// first, coalesced second) and paired up position by position. The request
// entry is turned into a TileLink A bundle with the edge of the node it is
// sent on.
for ((node, arb) <- (outer.nonCoalNarrowNodes ++ outer.coalReqNodes) zip
                    (nonCoalRRArbiters ++ coalRRArbiters)) {
  val (tlOut, edgeOut) = node.out(0)
  val nonCoalTLATunnel = Module(new ConverterTunnel(
    arb.io.out.bits.cloneType,
    tlOut.a.bits.cloneType,
    reqEntry2TLAFn(edgeOut)))
  nonCoalTLATunnel.io.in <> arb.io.out
  tlOut.a <> nonCoalTLATunnel.io.out
}
//Stage 3: pass channel A through the coalesced and non-coalesced nodes and
// feed their outputs into one two-input Arbiter per output port
// (coalesced on input 0, non-coalesced on input 1)
val priorityArbs = Seq.fill(config.numArbiterOutputPorts) {
  Module(new Arbiter(outer.outputNode.out(0)._1.a.bits.cloneType, 2))
}
// Channel A pass-through, inward edge to outward edge, for both nodes
for (((tlOut, _), (tlIn, _)) <- (outer.nonCoalNode.out ++ outer.coalNode.out) zip
                                (outer.nonCoalNode.in ++ outer.coalNode.in)) {
  tlOut.a <> tlIn.a
}
// Pair the i-th non-coalesced and coalesced outward edges with the i-th arbiter
for ((((nonCoalOut, _), (coalOut, _)), arb) <-
       (outer.nonCoalNode.out zip outer.coalNode.out) zip priorityArbs) {
  arb.io.in(1) <> nonCoalOut.a
  arb.io.in(0) <> coalOut.a
}
//Stage 4: the output node passes channel A from its inward to its outward
// edge, and each inward edge is driven by the matching priority arbiter
for ((((tlIn, _), (tlOut, _)), arb) <-
       (outer.outputNode.in zip outer.outputNode.out) zip priorityArbs) {
  tlOut.a <> tlIn.a
  tlIn.a <> arb.io.out
}
////////////////
// Incoming Data Handling
//Stage 1: forward channel D from the output node to the coalesced and the
// non-coalesced node. valid is steered to exactly one of the two depending on
// whether the response's source id is a coalesced id, and ready is taken from
// that same consumer.
((outer.outputNode.in zip outer.outputNode.out) zip
  (outer.nonCoalNode.out zip outer.coalNode.out)).foreach{
  case( ((tlIn, _),(tlOut, _)), ((nonCoalOut, _),(coalOut, _)) ) =>
    tlIn.d <> tlOut.d
    nonCoalOut.d <> tlIn.d
    coalOut.d <> tlIn.d
    val respIsCoal = isCoalReq(tlIn.d.bits.source)
    //rewrite valid signal
    nonCoalOut.d.valid := !respIsCoal && tlIn.d.valid
    coalOut.d.valid := respIsCoal && tlIn.d.valid
    // Both bulk connects above drive tlIn.d.ready and only the last one (the
    // coalesced side) would take effect. Select the ready of the consumer the
    // response is actually addressed to.
    tlIn.d.ready := Mux(respIsCoal, coalOut.d.ready, nonCoalOut.d.ready)
}
//Stage 2: pass channel D through the coalesced and the non-coalesced node,
// from the outward edge back to the inward edge
for (((tlOut, _), (tlIn, _)) <- (outer.nonCoalNode.out ++ outer.coalNode.out) zip
                                (outer.nonCoalNode.in ++ outer.coalNode.in)) {
  tlIn.d <> tlOut.d
}
//Stage 3: connect channel D of the non-coalesced request nodes to the
// per-lane response arbiters
//Stage 3a: every non-coalesced node feeds every per-lane arbiter; only the
// lane that owns the response's global source id sees it as valid
val perLaneRespRRArbs = Seq.tabulate(config.numLanes){_=>
  Module(new RRArbiter(respNonCoalEntryT.cloneType, config.numArbiterOutputPorts))
}
outer.nonCoalNarrowNodes.zipWithIndex.foreach{
  case (node, node_idx) =>
    val (tlOut, edgeOut) = node.out(0)
    // For each lane, record whether the response is addressed to that lane AND
    // the lane's arbiter is accepting it.
    val laneAccepts = perLaneRespRRArbs.zipWithIndex.map{
      case(arb, arb_idx) =>
        val tlD2RespEntryFunc = tlD2respEntryFn()
        val perLaneArbTunnel = Module(new ConverterTunnel(
          tlOut.d.bits.cloneType,
          arb.io.in(0).bits.cloneType,
          tlD2RespEntryFunc
        )
        )
        // Drive only valid/bits here. Bulk-connecting channel D to every
        // tunnel would drive tlOut.d.ready once per lane, and only the last
        // lane's ready would survive.
        perLaneArbTunnel.io.in.valid := tlOut.d.valid
        perLaneArbTunnel.io.in.bits := tlOut.d.bits
        arb.io.in(node_idx) <> perLaneArbTunnel.io.out
        // Lane ownership is decided on the global source id carried by
        // channel D. The converted entry only holds the local id, which can
        // no longer identify the lane.
        val forThisLane = belongsToLane(arb_idx.U, tlOut.d.bits.source)
        //rewrite valid base on if source id actually belongs to this lane
        arb.io.in(node_idx).valid := forThisLane && perLaneArbTunnel.io.out.valid
        forThisLane && perLaneArbTunnel.io.in.ready
    }
    // Accept the response only when the lane it belongs to takes it
    tlOut.d.ready := laneAccepts.foldLeft(false.B)(_ || _)
}
//Stage 3b: channel D of every coalesced request node is converted to a
// coalesced response bundle and fed into one input of a single RR arbiter
val coalBundleRRArbiter = Module(new RRArbiter(respCoalBundleT.cloneType, config.numArbiterOutputPorts))
for ((node, node_idx) <- outer.coalReqNodes.zipWithIndex) {
  val (tlOut, _) = node.out(0)
  val coalBundleArbTunnel = Module(new ConverterTunnel(
    tlOut.d.bits.cloneType,
    coalBundleRRArbiter.io.in(0).bits.cloneType,
    tlD2CoalBundleFn()))
  coalBundleArbTunnel.io.in <> tlOut.d
  coalBundleRRArbiter.io.in(node_idx) <> coalBundleArbTunnel.io.out
}
//Stage 4: connect the response arbiters to the module outputs
// non-coalesced: one per-lane arbiter per response port
for ((arb, resp) <- perLaneRespRRArbs zip io.nonCoalResps) {
  resp <> arb.io.out
}
// coalesced: the single coalesced-response arbiter
io.coalResp <> coalBundleRRArbiter.io.out
}