Do proper source ID generation (sourceGen) for the tlCoal edge that comes out of the coalescer manager node. This also prevents the in-flight table from becoming full. As a consequence, setting the source ID of coalReq moves outside the coalescer: sourceGen also needs to look at the response bits, which is easier to do outside the coalescer, at the top level. FIXME: the coalescer unit test is still broken.
2115 lines
74 KiB
Scala
2115 lines
74 KiB
Scala
// See LICENSE.SiFive for license details.
|
|
|
|
package freechips.rocketchip.tilelink
|
|
|
|
import chisel3._
|
|
import chisel3.util._
|
|
import org.chipsalliance.cde.config.{Parameters, Field}
|
|
import freechips.rocketchip.diplomacy._
|
|
// import freechips.rocketchip.devices.tilelink.TLTestRAM
|
|
import freechips.rocketchip.util.MultiPortQueue
|
|
import freechips.rocketchip.unittest._
|
|
|
|
// TODO: find better place for these
|
|
/** Parameters for the SIMT core.
  *
  * @param nLanes lane count; 4 unless overridden
  */
case class SIMTCoreParams(
  nLanes: Int = 4
)

/** Parameters for the memory-trace driven core.
  *
  * @param tracefilename  name of the trace file; the placeholder "undefined" unless overridden
  * @param traceHasSource false unless overridden
  *                       (NOTE(review): presumably tells whether trace records carry a source ID — confirm)
  */
case class MemtraceCoreParams(
  tracefilename: String = "undefined",
  traceHasSource: Boolean = false
)
|
|
|
|
// Configuration keys. Each key holds an Option that is None unless a config
// fragment overrides it.

/** Key for the optional [[SIMTCoreParams]]. */
case object SIMTCoreKey
    extends Field[Option[SIMTCoreParams]](None /*default*/)

/** Key for the optional [[MemtraceCoreParams]]. */
case object MemtraceCoreKey
    extends Field[Option[MemtraceCoreParams]](None /*default*/)

/** Key for the optional [[CoalescerConfig]]. */
case object CoalescerKey
    extends Field[Option[CoalescerConfig]](None /*default*/)
|
|
|
|
/** Enum interface used to encode request sizes as enum values.
  *
  * Implementations provide the two enum values and the conversions between a
  * log2 size (as a UInt) and the enum encoding.
  */
trait InFlightTableSizeEnum extends ChiselEnum {
  /** Value used when a size has no enum encoding. */
  val INVALID: Type

  /** NOTE(review): presumably encodes a 4-byte access — confirm against implementations. */
  val FOUR: Type

  /** Converts a log2 size to its enum value. */
  def logSizeToEnum(x: UInt): Type

  /** Converts an enum value back to a log2 size. */
  def enumToLogSize(x: Type): UInt
}
|
|
|
|
/** Default size enum: INVALID is encoded as 0 and FOUR as 1. */
object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
  val INVALID = Value(0.U)
  val FOUR = Value(1.U)

  /** Maps log size 2 to FOUR; every other log size maps to INVALID. */
  def logSizeToEnum(x: UInt): Type = {
    val isFour = x === 2.U
    Mux(isFour, FOUR, INVALID)
  }

  /** Maps FOUR to log size 2; every other enum value maps to 0. */
  def enumToLogSize(x: Type): UInt = {
    val isFour = x === FOUR
    Mux(isFour, 2.U, 0.U)
  }
}
|
|
|
|
// Mapping to reference model param names
|
|
// numLanes: Int, <-> config.NUM_LANES
|
|
// numPerLaneReqs: Int, <-> config.DEPTH
|
|
// sourceWidth: Int, <-> log2ceil(config.NUM_OLD_IDS)
|
|
// sizeWidth: Int, <-> config.sizeEnum.width
|
|
// coalDataWidth: Int, <-> (1 << config.MAX_SIZE)
|
|
// numInflightCoalRequests: Int <-> config.NUM_NEW_IDS
|
|
/** Static configuration of the coalescer.
  *
  * @param enable                globally enable or disable coalescing
  * @param numLanes              number of lanes (or threads) in a warp
  * @param queueDepth            request window per lane
  * @param waitTimeout           max cycles to wait before forced fifo dequeue, per lane
  * @param addressWidth          address width in bits; assumed to be <= 32
  * @param dataBusWidth          log2 of the memory-side downstream TileLink data bus size in
  *                              bytes (e.g. 3 means an 8-byte, 64-bit bus). It has to be at
  *                              least larger than the word size for the coalescer to perform well
  * @param wordSizeInBytes       word size in bytes (4 on a 32-bit system); must be a power of two
  * @param numOldSrcIds          number of outstanding requests per lane, from the processor
  * @param numNewSrcIds          number of outstanding coalesced requests
  * @param respQueueDepth        depth of the response fifo queues
  * @param coalLogSizes          coalescing sizes to try in the MonoCoalescers; each entry is
  *                              log2 of the size in bytes
  * @param sizeEnum              enum used to encode sizes
  * @param numCoalReqs           total number of coalesced requests that can be generated in one cycle
  * @param numArbiterOutputPorts total number of output ports the arbiter arbitrates into; this
  *                              has to match the downstream cache's configuration
  * @param bankStrideInBytes     cache line stride across the different banks
  */
case class CoalescerConfig(
  enable: Boolean,
  numLanes: Int,
  queueDepth: Int,
  waitTimeout: Int,
  addressWidth: Int,
  dataBusWidth: Int,
  // watermark = 2,          // minimum buffer occupancy to start coalescing
  wordSizeInBytes: Int,
  numOldSrcIds: Int,
  numNewSrcIds: Int,
  respQueueDepth: Int,
  coalLogSizes: Seq[Int],
  sizeEnum: InFlightTableSizeEnum,
  numCoalReqs: Int,
  numArbiterOutputPorts: Int,
  bankStrideInBytes: Int
) {
  /** Largest entry of `coalLogSizes`, i.e. log2 of the maximum coalesced size. */
  def maxCoalLogSize: Int = coalLogSizes.max

  /** log2 of `wordSizeInBytes`; fails elaboration if the word size is not a power of two. */
  def wordSizeWidth: Int = {
    val logWordSize = log2Ceil(wordSizeInBytes)
    val isPowerOfTwo = wordSizeInBytes == 1 << logWordSize
    require(isPowerOfTwo, s"wordSizeInBytes (${wordSizeInBytes}) is not power of two")
    logWordSize
  }
}
|
|
|
|
|
|
/** Default coalescer configuration: 4 lanes, a single coalescing size of
  * 2^3 = 8 bytes on a 64-bit bus.
  */
object defaultConfig
    extends CoalescerConfig(
      enable = true,
      numLanes = 4,
      queueDepth = 1,
      waitTimeout = 8,
      addressWidth = 32,
      // log2 of the bus size in bytes: 2^3 = 8 bytes, i.e. a 64-bit bus
      dataBusWidth = 3,
      // watermark = 2,
      wordSizeInBytes = 4,
      // when attaching to SoC, 16 source IDs are not enough due to longer latency
      numOldSrcIds = 16,
      numNewSrcIds = 8,
      respQueueDepth = 4,
      coalLogSizes = Seq(3),
      sizeEnum = DefaultInFlightTableSizeEnum,
      numCoalReqs = 1,
      numArbiterOutputPorts = 4,
      // Current L2 is strided by 512 bits
      bankStrideInBytes = 64
    )
|
|
|
|
/** Diplomatic (LazyModule) shell of the coalescing unit.
  *
  * It exposes `cpuNode` for the per-lane traffic and `aggregateNode`, into
  * which both the coalesced-request client (`coalescerNode`) and the per-lane
  * traffic are bound. The hardware itself lives in [[CoalescingUnitImp]].
  */
class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters)
    extends LazyModule {
  // Planned design (WIP): a single nexus node that captures the incoming TL
  // requests, rewrites coalescable requests, and arbitrates between
  // non-coalesced and coalesced requests to a fixed number of outputs before
  // sending them out to memory. That node would be what is visible to upstream
  // and downstream nodes.
  //
  // val node = TLNexusNode(
  //   clientFn = c => c.head,
  //   managerFn = m => m.head // assuming arbiter generated ids are distinct between edges
  // )
  // node.in.map(_._2).foreach(edge => require(edge.manager.beatBytes == config.wordSizeInBytes,
  //   s"input edges into coalescer node does not have beatBytes = ${config.wordSizeInBytes}"))
  // node.out.map(_._2).foreach(edge => require(edge.manager.beatBytes == config.maxCoalLogSize,
  //   s"output edges into coalescer node does not have beatBytes = ${config.maxCoalLogSize}"))

  val aggregateNode = TLIdentityNode()
  val cpuNode = TLIdentityNode()

  // Maximum number of in-flight coalesced requests. Its upper bound would be
  // the sourceId range of a single lane.
  val numInflightCoalRequests = config.numNewSrcIds

  // Client (master) that actually generates the coalesced requests; it owns
  // source IDs [0, numInflightCoalRequests).
  protected val coalParam = Seq(
    TLMasterParameters.v1(
      name = "CoalescerNode",
      sourceId = IdRange(0, numInflightCoalRequests)
    )
  )
  val coalescerNode = TLClientNode(Seq(TLMasterPortParameters.v1(coalParam)))

  // Merge coalescerNode and cpuNode into aggregateNode. The coalescer edge is
  // bound first; the per-lane edges pass through a width widget first.
  aggregateNode :=* coalescerNode
  aggregateNode :=* TLWidthWidget(config.wordSizeInBytes) :=* cpuNode

  lazy val module = new CoalescingUnitImp(this, config)
}
|
|
|
|
// Protocol-agnostic bundles that represent a request and a response to the
|
|
// coalescer.
|
|
|
|
/** Protocol-agnostic memory request.
  *
  * @param sourceWidth  width of the `source` field in bits
  * @param sizeWidth    width of the `size` field in bits
  * @param addressWidth width of the `address` field in bits
  * @param dataWidth    width of the `data` field in bits; must be a multiple of 8
  */
class Request(sourceWidth: Int, sizeWidth: Int, addressWidth: Int, dataWidth: Int)
    extends Bundle {
  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")

  val op = UInt(1.W) // 0=READ 1=WRITE
  val address = UInt(addressWidth.W)
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
  val mask = UInt((dataWidth / 8).W) // write only; one bit per data byte
  val data = UInt(dataWidth.W) // write only

  /** Converts this request to a TileLink A-channel bundle for `edgeOut`.
    *
    * A write (`op` = 1) becomes a Put that carries this request's byte mask, so
    * that bytes not covered by the request are not written. A read (`op` = 0)
    * becomes a Get. An assertion fires when the edge reports the selected
    * message as illegal.
    *
    * @param edgeOut outgoing TileLink edge used to construct the message
    * @return the A-channel bits for this request
    */
  def toTLA(edgeOut: TLEdgeOut): TLBundleA = {
    // Pass the mask explicitly: the mask-less Put overload ignores `this.mask`,
    // which would turn a partially covered (e.g. coalesced) write into a write
    // of the whole `size` region.
    val (plegal, pbits) = edgeOut.Put(
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size,
      data = this.data,
      mask = this.mask
    )
    val (glegal, gbits) = edgeOut.Get(
      fromSource = this.source,
      toAddress = this.address,
      lgSize = this.size
    )
    val isWrite = this.op.asBool
    val legal = Mux(isWrite, plegal, glegal)
    val bits = Mux(isWrite, pbits, gbits)
    assert(legal, "unhandled illegal TL req gen")
    bits
  }
}
|
|
/** Word-sized request as issued by a single lane. The source field is sized
  * for `numOldSrcIds` and the data field holds one word.
  */
case class NonCoalescedRequest(config: CoalescerConfig)
    extends Request(
      sourceWidth = log2Ceil(config.numOldSrcIds),
      sizeWidth = config.wordSizeWidth,
      addressWidth = config.addressWidth,
      dataWidth = config.wordSizeInBytes * 8
    )
|
|
/** Coalesced request. The source field is sized for `numNewSrcIds` and the
  * data field holds 2^maxCoalLogSize bytes.
  *
  * NOTE(review): `size` is log2Ceil(maxCoalLogSize) bits wide, which cannot
  * represent the value maxCoalLogSize itself when that is a power of two
  * (e.g. 2 or 4) — confirm whether log2Ceil(maxCoalLogSize + 1) is intended.
  */
case class CoalescedRequest(config: CoalescerConfig)
    extends Request(
      sourceWidth = log2Ceil(config.numNewSrcIds),
      sizeWidth = log2Ceil(config.maxCoalLogSize),
      addressWidth = config.addressWidth,
      dataWidth = (8 * (1 << config.maxCoalLogSize))
    )
|
|
|
|
/** Protocol-agnostic memory response.
  *
  * @param sourceWidth width of the `source` field in bits
  * @param sizeWidth   width of the `size` field in bits
  * @param dataWidth   width of the `data` field in bits; must be a multiple of 8
  */
class Response(sourceWidth: Int, sizeWidth: Int, dataWidth: Int)
    extends Bundle {
  require(dataWidth % 8 == 0, s"dataWidth (${dataWidth} bits) is not multiple of 8")

  val op = UInt(1.W) // 0=READ 1=WRITE
  val size = UInt(sizeWidth.W)
  val source = UInt(sourceWidth.W)
  val data = UInt(dataWidth.W) // read only
  val error = Bool()

  /** Converts this response to a TileLink D-channel bundle for `edgeIn`.
    *
    * A write response becomes an AccessAck without data; a read response
    * becomes an AccessAck that carries `data`. The `error` field is not
    * forwarded by this conversion.
    */
  def toTLD(edgeIn: TLEdgeIn): TLBundleD = {
    val isWrite = this.op.asBool
    val ackWithoutData = edgeIn.AccessAck(
      toSource = this.source,
      lgSize = this.size
    )
    val ackWithData = edgeIn.AccessAck(
      toSource = this.source,
      lgSize = this.size,
      data = this.data
    )
    Mux(isWrite, ackWithoutData, ackWithData)
  }

  /** Drives every field of this response from a TileLink D-channel bundle.
    * `error` takes the bundle's `denied` bit.
    */
  def fromTLD(bundle: TLBundleD): Unit = {
    this.op := TLUtils.DOpcodeIsStore(bundle.opcode)
    this.source := bundle.source
    this.size := bundle.size
    this.data := bundle.data
    this.error := bundle.denied
  }
}
|
|
/** Word-sized response going back to a single lane. The source field is sized
  * for `numOldSrcIds` and the data field holds one word.
  */
case class NonCoalescedResponse(config: CoalescerConfig)
    extends Response(
      sourceWidth = log2Ceil(config.numOldSrcIds),
      sizeWidth = config.wordSizeWidth,
      dataWidth = config.wordSizeInBytes * 8
    )
|
|
/** Response to a coalesced request. The source field is sized for
  * `numNewSrcIds` and the data field holds 2^maxCoalLogSize bytes.
  */
case class CoalescedResponse(config: CoalescerConfig)
    extends Response(
      sourceWidth = log2Ceil(config.numNewSrcIds),
      sizeWidth = log2Ceil(config.maxCoalLogSize),
      dataWidth = (8 * (1 << config.maxCoalLogSize))
    )
|
|
|
|
/** Hands out source IDs in round-robin order.
  *
  * `io.id` continuously offers the ID at `head`. When `io.gen` is asserted while
  * the offered ID is valid, that ID is marked in use and `head` moves on to the
  * next ID (wrapping around at 2^sourceWidth). `io.reclaim` marks an ID as free
  * again.
  *
  * @param sourceWidth width of the generated IDs in bits; 2^sourceWidth IDs exist
  * @param ignoreInUse if true, keep handing out new IDs without checking whether
  *                    they are in use (`io.id.valid` is then always high). If
  *                    false, `io.id.valid` is low while the ID at `head` is in use.
  */
class RoundRobinSourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
    extends Module {
  val io = IO(new Bundle {
    val gen = Input(Bool())
    val reclaim = Input(Valid(UInt(sourceWidth.W)))
    val id = Output(Valid(UInt(sourceWidth.W)))
  })

  val numSourceId = 1 << sourceWidth

  // Next ID to hand out.
  val head = RegInit(UInt(sourceWidth.W), 0.U)

  // One bit per ID: true = in use, false = available. A register vector with a
  // reset value is used so that every entry is cleared by the module reset.
  val occupancyTable = RegInit(VecInit(Seq.fill(numSourceId)(false.B)))

  io.id.valid := (if (ignoreInUse) true.B else !occupancyTable(head))
  io.id.bits := head

  // An ID is only consumed when one is requested AND a valid one is offered;
  // a request made while no ID is available must not skip over the current ID.
  val fire = io.gen && io.id.valid
  when(fire) {
    occupancyTable(io.id.bits) := true.B // mark in use
    head := head + 1.U
  }
  // Written after the allocation above so that a reclaim of the same ID in the
  // same cycle takes priority (last connect wins).
  when(io.reclaim.valid) {
    occupancyTable(io.reclaim.bits) := false.B // mark freed
  }
}
|
|
|
|
/** Per-lane shift queues that expose their whole contents to the coalescer.
  *
  * Every lane owns a queue of `entries` slots. Slot 0 is the head; entries are
  * written at `writePtr` and move toward the head when the queue shifts. All
  * slots and their valid bits are visible through `io.elts` / `io.mask`, and
  * individual slots can be invalidated through `io.invalidate`.
  *
  * @param gen     type of a queue entry
  * @param entries number of slots per lane
  * @param config  coalescer configuration; `numLanes` gives the number of queues
  */
class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
    extends Module {
  val io = IO(new Bundle {
    val queue = new Bundle {
      val enq = Vec(config.numLanes, DeqIO(gen.cloneType))
      val deq = Vec(config.numLanes, EnqIO(gen.cloneType))
    }
    // per-lane bit flags of the slots to invalidate, qualified by `valid`
    val invalidate = Input(Valid(Vec(config.numLanes, UInt(entries.W))))
    // per-lane flag: the head of this lane can still be coalesced
    val coalescable = Input(Vec(config.numLanes, Bool()))
    // per-lane valid bits of all slots
    val mask = Output(Vec(config.numLanes, UInt(entries.W)))
    // per-lane contents of all slots
    val elts = Output(Vec(config.numLanes, Vec(entries, gen)))
  })

  val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
  val writePtr = RegInit(
    VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
  )
  // set once the head of a lane has been dequeued, cleared on shift
  val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))

  // Marks every slot of every lane as invalid.
  private def resetElts = {
    elts.foreach { laneQ =>
      laneQ.foreach { entry =>
        entry.valid := false.B
        entry.bits := DontCare
      }
    }
  }

  val controlSignals = Wire(Vec(config.numLanes, new Bundle {
    val shift = Bool()
    val full = Bool()
    val empty = Bool()
  }))

  // io.coalescable will first turn on for all coalescable chunks, and turn off
  // incrementally as time goes on. Therefore, when io.coalescable is all
  // turned off, that means we have processed all coalescable chunks at the
  // current cycle.
  //
  // shift hint is when the heads have no more coalescable left this or next cycle
  private val anyHeadStillCoalescable =
    (io.coalescable zip io.invalidate.bits.map(_(0)))
      .map { case (c, inv) =>
        c && !(io.invalidate.valid && inv)
      }
      .reduce(_ || _)
  val shiftHint = !anyHeadStillCoalescable
  val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
  // valid && !fire means we enable enqueueing to a full queue, provided the
  // arbiter is taking away all remaining valid queue heads in the next cycle so
  // that we make space for the entire next warp.
  val syncedDeqValidNextCycle =
    io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _)

  for (i <- 0 until config.numLanes) {
    val enq = io.queue.enq(i)
    val deq = io.queue.deq(i)
    val ctrl = controlSignals(i)

    ctrl.full := writePtr(i) === entries.U
    ctrl.empty := writePtr(i) === 0.U
    // shift when no outstanding dequeue, no more coalescable chunks, and not empty
    ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty

    // dequeue is valid when:
    // head entry is valid, has not been processed by downstream, and is not coalescable
    deq.bits := elts.map(_.head.bits)(i)
    deq.valid := elts.map(_.head.valid)(i) && !deqDone(i) && !io.coalescable(i)

    // can take new entries if not full, or if full but shifting
    enq.ready := (!ctrl.full) || ctrl.shift

    when(ctrl.shift) {
      // shift, invalidate tail, invalidate coalesced requests
      elts(i).zipWithIndex.foreach { case (elt, j) =>
        if (j == entries - 1) { // tail
          elt.valid := false.B
        } else {
          elt.bits := elts(i)(j + 1).bits
          elt.valid := elts(i)(
            j + 1
          ).valid && !(io.invalidate.valid && io.invalidate.bits(i)(j + 1))
        }
      }
      // reset dequeue mask when new entries are shifted in
      deqDone(i) := false.B
      // enqueue: the new entry lands one slot below writePtr because of the
      // shift, so writePtr stays; without an enqueue the occupancy shrinks by one
      when(enq.ready && syncedEnqValid) { // to allow drift, swap for enq.fire
        elts(i)(writePtr(i) - 1.U).bits := enq.bits
        elts(i)(writePtr(i) - 1.U).valid := enq.valid
      }.otherwise {
        writePtr(i) := writePtr(i) - 1.U
      }
    }.otherwise {
      // invalidate coalesced requests
      when(io.invalidate.valid) {
        (elts(i) zip io.invalidate.bits(i).asBools).map { case (elt, inv) =>
          elt.valid := elt.valid && !inv
        }
      }
      // enqueue
      when(enq.ready && syncedEnqValid) {
        elts(i)(writePtr(i)).bits := enq.bits
        elts(i)(writePtr(i)).valid := enq.valid
        writePtr(i) := writePtr(i) + 1.U
      }
      deqDone(i) := deqDone(i) || deq.fire
    }
  }

  // Clear all valid bits during reset. This must come AFTER the per-lane logic
  // above: with last-connect semantics an earlier reset assignment would be
  // overridden by a shift, invalidate or enqueue that happens during reset.
  when(reset.asBool) {
    resetElts
  }

  // When doing spatial-only coalescing, queues should never drift from each
  // other, i.e. the queue heads should always contain mem requests from the
  // same instruction.
  val queueInSync =
    controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
      writePtr.map(_ === writePtr.head).reduce(_ && _)
  assert(queueInSync, "shift queue lanes are not in sync")

  io.mask := elts.map(x => VecInit(x.map(_.valid)).asUInt)
  io.elts := elts.map(x => VecInit(x.map(_.bits)))
}
|
|
|
|
// Software model: coalescer.py
|
|
/** Combinational coalescer for one fixed coalescing size.
  *
  * The head of every lane (a "leader") is compared against every entry of every
  * lane (the "followers"). One leader is elected, and the entries that fall in
  * the same 2^coalLogSize-byte aligned chunk as that leader are reported.
  *
  * @param config      coalescer configuration
  * @param coalLogSize log2 of the coalescing size in bytes handled by this instance
  * @param queueT      request queue whose IO type is used for the `window` input
  */
class MonoCoalescer(
    config: CoalescerConfig,
    coalLogSize: Int,
    queueT: CoalShiftQueue[NonCoalescedRequest]
) extends Module {
  val io = IO(new Bundle {
    val window = Input(queueT.io.cloneType)
    val results = Output(new Bundle {
      val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
      val baseAddr = Output(UInt(config.addressWidth.W))
      val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
      // number of entries matched with this leader lane's head.
      // maximum is numLanes * queueDepth
      val matchCount =
        Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W))
      val coverageHits =
        Output(UInt((config.maxCoalLogSize - config.wordSizeWidth + 1).W))
      val canCoalesce = Output(Vec(config.numLanes, Bool()))
    })
  })

  io := DontCare

  // Everything below is combinational logic driven by the window contents.
  // Only the queue heads act as leaders; they are matched against all entries
  // of all lanes.
  val leaders = io.window.elts.map(_.head)
  val leadersValid = io.window.mask.map(_.asBools.head)

  // Debug helper: prints valid bit, source and address of every queue head.
  def printQueueHeads = {
    leaders.zipWithIndex.foreach { case (head, i) =>
      printf(
        s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
        leadersValid(i),
        head.source,
        head.address
      )
    }
  }
  // when (leadersValid.reduce(_ || _)) {
  //   printQueueHeads
  // }

  val size = coalLogSize
  // Keeps the address bits above the chunk offset.
  // NOTE: be careful with Scala integer overflow when addressWidth >= 32
  val addrMask = {
    val allAddressBits = (1L << config.addressWidth) - 1
    val chunkOffsetBits = (1 << size) - 1
    (allAddressBits - chunkOffsetBits).U
  }

  /** Two requests match when both are valid, perform the same operation and
    * fall into the same aligned chunk.
    */
  def canMatch(req0: Request, req0v: Bool, req1: Request, req1v: Bool): Bool = {
    val sameOp = req0.op === req1.op
    val bothValid = req0v && req1v
    val sameChunk =
      (req0.address & this.addrMask) === (req1.address & this.addrMask)
    sameOp && bothValid && sameChunk
  }

  // Gives a 2-D table of Bools representing match at every queue entry,
  // for each lane (so 3-D in total).
  // dimensions: (leader lane, follower lane, follower entry)
  //
  // FIXME: the "halving" optimization (matching a leader only against lanes
  // >= the leader index, which halves the number of comparators) is disabled
  // because it does not give the correct per-lane coalescable indication to
  // the shift queue:
  //   if (followerIndex < leaderIndex) false.B
  //   else canMatch(follower, followerValid, leader, leaderValid)
  val matchTablePerLane = (leaders zip leadersValid).map {
    case (leadReq, leadValid) =>
      (io.window.elts zip io.window.mask).map {
        case (laneEntries, laneValids) =>
          // compare the leader's head against every entry of this lane
          (laneEntries zip laneValids.asBools).map {
            case (entry, entryValid) =>
              canMatch(entry, entryValid, leadReq, leadValid)
          }
      }
  }

  // total number of matching entries, per leader
  val matchCounts = matchTablePerLane.map { perLaneMatches =>
    val countPerLane = perLaneMatches.map(PopCount(_)) // sum up each column
    countPerLane.reduce(_ +& _)
  }
  val canCoalesce = matchCounts.map(_ > 1.U)

  // Elect the leader that has the most match counts; ties go to the lower lane.
  // TODO: potentially expensive: magnitude comparator
  def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
    val countWithLane = matchCounts.zipWithIndex.map { case (count, lane) =>
      (count, lane.U)
    }
    val (_, bestLane) = countWithLane.reduce[(UInt, UInt)] {
      case ((countA, laneA), (countB, laneB)) =>
        (Mux(countA >= countB, countA, countB), Mux(countA >= countB, laneA, laneB))
    }
    bestLane
  }
  // Elect leader by choosing the smallest-index lane that has a valid
  // match, i.e. using priority encoder.
  def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = {
    val hasMatch = matchCounts.map(_ > 1.U)
    PriorityEncoder(hasMatch)
  }
  val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)

  val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux
  // matchTable for the chosen lane, but each column converted to bitflags,
  // i.e. Vec[UInt]
  val chosenMatches = VecInit(matchTablePerLane.map { perLaneMatches =>
    VecInit(perLaneMatches.map(VecInit(_).asUInt))
  })(chosenLeaderIdx)
  val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)

  // coverage calculation
  // word index of an address inside the coalesced chunk
  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordSizeWidth)
  // 2-D table flattened to 1-D
  val offsets =
    io.window.elts.flatMap(_.map(req => getOffsetSlice(req.address)))
  val valids = chosenMatches.flatMap(_.asBools)
  // indicates for each word in the coalesced chunk whether it is accessed by
  // any of the requests in the queue. e.g. if [ 1 1 1 1 ], all of the four
  // words in the coalesced data coming back will be accessed by some request
  // and we've reached 100% bandwidth utilization.
  val hits = Seq.tabulate(1 << (size - config.wordSizeWidth)) { wordIdx =>
    (offsets zip valids)
      .map { case (offset, matched) => matched && (offset === wordIdx.U) }
      .reduce(_ || _)
  }

  // debug prints
  when(leadersValid.reduce(_ || _)) {
    matchCounts.zipWithIndex.foreach { case (count, i) =>
      printf(s"lane[${i}] matchCount = %d\n", count);
    }
    printf("chosenLeader = lane %d\n", chosenLeaderIdx)
    printf("chosenLeader matches = [ ")
    chosenMatches.foreach { m => printf("%d ", m) }
    printf("]\n")
    printf("chosenMatchCount = %d\n", chosenMatchCount)

    printf("hits = [ ")
    hits.foreach { m => printf("%d ", m) }
    printf("]\n")
  }

  io.results.leaderIdx := chosenLeaderIdx
  io.results.baseAddr := chosenLeader.address & addrMask
  io.results.matchOH := chosenMatches
  io.results.matchCount := chosenMatchCount
  io.results.coverageHits := PopCount(hits)
  io.results.canCoalesce := canCoalesce
}
|
|
|
|
// Combinational logic that generates a coalesced request given a request
|
|
// window, and a selection of possible coalesced sizes. May utilize multiple
|
|
// MonoCoalescers and apply size-choosing policy to determine the final
|
|
// coalesced request out of all possible combinations.
|
|
//
|
|
// Software model: coalescer.py
|
|
/** Generates one coalesced request from the request window.
  *
  * One [[MonoCoalescer]] is instantiated per entry of `config.coalLogSizes`; a
  * size-choosing policy then picks one of their results and the coalesced
  * request (address, size, data, mask, op) is assembled from the matched
  * entries. The `source` field of the generated request is left undriven
  * (DontCare) and has to be set by the user of this module.
  *
  * @param config   coalescer configuration
  * @param queueT   request queue whose IO type is used for the `window` input
  * @param coalReqT type of the generated coalesced request
  */
class MultiCoalescer(
    config: CoalescerConfig,
    queueT: CoalShiftQueue[NonCoalescedRequest],
    coalReqT: Request,
) extends Module {
  val invalidateT = Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))
  val io = IO(new Bundle {
    // coalescing window, connected to the contents of the request queues
    val window = Input(queueT.io.cloneType)
    // generated coalesced request
    val coalReq = DecoupledIO(coalReqT.cloneType)
    // invalidate signals going into each request queue's head. Lanes with
    // high invalidate bits are what became coalesced into the new request.
    val invalidate = Output(invalidateT)
    // whether a lane is coalescable. This is used to output non-coalescable
    // lanes to the arbiter so they can be flushed to downstream.
    val coalescable = Output(Vec(config.numLanes, Bool()))
  })

  // one coalescer per candidate size, all looking at the same window
  val coalescers = config.coalLogSizes.map(size =>
    Module(new MonoCoalescer(config, size, queueT))
  )
  coalescers.foreach(_.io.window := io.window)

  // Scales a per-size value up to the maximum coalescing size by shifting it
  // left by (maxCoalLogSize - size).
  def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
    (valPerSize zip config.coalLogSizes).map { case (hits, size) =>
      (hits << (config.maxCoalLogSize - size).U).asUInt
    }
  }

  // Index of the largest element; on a tie the later element wins.
  def argMax(x: Seq[UInt]): UInt = {
    x.zipWithIndex.map {
      case (a, b) => (a, b.U)
    }.reduce[(UInt, UInt)] { case ((a, i), (b, j)) =>
      (Mux(a > b, a, b), Mux(a > b, i, j)) // > instead of >= here; want to use largest size
    }._2
  }

  // normalize to maximum coalescing size so that we can do fair comparisons
  // between coalescing results of different sizes
  val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount))
  val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits))

  val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
  val chosenValid = Wire(Bool())
  // minimum 25% coverage
  val minCoverage =
    1.max(1 << ((config.maxCoalLogSize - config.wordSizeWidth) - 2))

  // Policy: prefer the size with the best coverage if any size exceeds the
  // minimum coverage; otherwise fall back to the size with the most matches.
  when(normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
    chosenSizeIdx := argMax(normalizedHits)
    chosenValid := true.B
    printf("coalescing success by coverage policy\n")
  }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) {
    chosenSizeIdx := argMax(normalizedMatches)
    chosenValid := true.B
    printf("coalescing success by matches policy\n")
  }.otherwise {
    chosenSizeIdx := DontCare
    chosenValid := false.B
  }

  // Debug helper: prints the policy inputs of the first coalescer.
  def debugPolicyPrint() = {
    printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount)
    printf("normalizedMatches[0]=%d\n", normalizedMatches(0))
    printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits)
    printf("normalizedHits[0]=%d\n", normalizedHits(0))
    printf("minCoverage=%d\n", minCoverage.U)
  }

  // create coalesced request
  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx)
  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx)

  // flatten requests and matches
  val flatReqs = io.window.elts.flatten
  val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)

  // Check for word alignment in addresses: EVERY valid window entry must be
  // word-aligned. The per-entry checks are therefore AND-ed together; OR-ing
  // them would let the assertion pass as soon as a single entry is aligned or
  // invalid.
  assert(
    io.window.elts
      .flatMap(_.map(req => req.address(config.wordSizeWidth - 1, 0) === 0.U))
      .zip(io.window.mask.flatMap(_.asBools))
      .map { case (aligned, valid) => (!valid) || aligned }
      .reduce(_ && _),
    "one or more addresses used for coalescing is not word-aligned"
  )

  // note: this is word-level coalescing. if finer granularity is needed, need to modify code
  val numWords = (1.U << (chosenSize - config.wordSizeWidth.U)).asUInt
  val maxWords = 1 << (config.maxCoalLogSize - config.wordSizeWidth)
  val addrMask = Wire(UInt(config.maxCoalLogSize.W))
  addrMask := (1.U << chosenSize).asUInt - 1.U

  val data = Wire(Vec(maxWords, UInt((config.wordSizeInBytes * 8).W)))
  val mask = Wire(Vec(maxWords, UInt(config.wordSizeInBytes.W)))

  // For every word slot of the coalesced request, pick the data and mask of a
  // matched request whose word offset equals the slot index.
  for (i <- 0 until maxWords) {
    val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
      // note: ANDing against addrMask is to conform to active byte lanes requirements
      // if aligning to LSB suffices, we should add the bitwise AND back
      m && ((req.address(
        config.maxCoalLogSize - 1,
        config.wordSizeWidth
      ) /* & addrMask*/ ) === i.U)
    }
    // TODO: SW uses priority encoder, not sure about behavior of MuxCase
    data(i) := MuxCase(
      DontCare,
      flatReqs.zip(sel).map { case (req, s) =>
        s -> req.data
      }
    )
    // a slot that no matched request covers gets an all-zero mask
    mask(i) := MuxCase(
      0.U,
      flatReqs.zip(sel).map { case (req, s) =>
        s -> req.mask
      }
    )
  }

  val coalesceValid = chosenValid

  // setting source is deferred, because in order to do proper source ID
  // generation we also have to look at the responses coming back, which
  // is easier to do at the toplevel.
  io.coalReq.bits.source := DontCare
  io.coalReq.bits.mask := mask.asUInt
  io.coalReq.bits.data := data.asUInt
  io.coalReq.bits.size := chosenSize
  io.coalReq.bits.address := chosenBundle.baseAddr
  io.coalReq.bits.op := io.window.elts(chosenBundle.leaderIdx).head.op
  io.coalReq.valid := coalesceValid

  io.invalidate.bits := chosenBundle.matchOH
  io.invalidate.valid := io.coalReq.fire // invalidate only when fire

  // a lane is coalescable if any of the per-size coalescers says so
  io.coalescable := coalescers
    .map(_.io.results.canCoalesce.asUInt)
    .reduce(_ | _)
    .asBools

  dontTouch(io.invalidate) // debug

  // Turns coalescing off: never generate a request, never invalidate, and
  // report every lane as non-coalescable.
  def disable = {
    io.coalReq.valid := false.B
    io.invalidate.valid := false.B
    io.coalescable.foreach { _ := false.B }
  }
  if (!config.enable) disable
}
|
|
|
|
class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
|
|
extends LazyModuleImp(outer) {
|
|
require(
|
|
outer.cpuNode.in.length == config.numLanes,
|
|
s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as " +
|
|
s"config.numLanes (${config.numLanes})"
|
|
)
|
|
require(
|
|
outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds),
|
|
s"TL param sourceBits (${outer.cpuNode.in.head._1.params.sourceBits}) " +
|
|
s"mismatch with log2(config.numOldSrcIds) (${log2Ceil(config.numOldSrcIds)})"
|
|
)
|
|
require(
|
|
outer.cpuNode.in.head._1.params.addressBits == config.addressWidth,
|
|
s"TL param addressBits (${outer.cpuNode.in.head._1.params.addressBits}) " +
|
|
s"mismatch with config.addressWidth (${config.addressWidth})"
|
|
)
|
|
require(
|
|
config.maxCoalLogSize <= config.dataBusWidth,
|
|
"multi-beat coalesced reads/writes are currently not supported"
|
|
)
|
|
|
|
val oldSourceWidth = outer.cpuNode.in.head._1.params.sourceBits
|
|
val nonCoalReqT = new NonCoalescedRequest(config)
|
|
val reqQueues = Module(
|
|
new CoalShiftQueue(nonCoalReqT, config.queueDepth, config)
|
|
)
|
|
|
|
val coalReqT = new CoalescedRequest(config)
|
|
val coalescer = Module(new MultiCoalescer(config, reqQueues, coalReqT))
|
|
coalescer.io.window := reqQueues.io
|
|
reqQueues.io.coalescable := coalescer.io.coalescable
|
|
reqQueues.io.invalidate := coalescer.io.invalidate
|
|
|
|
val uncoalescer = Module(new Uncoalescer(config, nonCoalReqT, coalReqT))
|
|
|
|
// ===========================================================================
|
|
// Request flow
|
|
// ===========================================================================
|
|
//
|
|
// Override IdentityNode implementation so that we can instantiate
|
|
// queues between input and output edges to buffer requests and responses.
|
|
// See IdentityNode definition in `diplomacy/Nodes.scala`.
|
|
//
|
|
(outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
|
|
case (((tlIn, _), (tlOut, edgeOut)), lane) =>
|
|
// Request queue
|
|
val req = Wire(nonCoalReqT)
|
|
|
|
req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
|
|
req.source := tlIn.a.bits.source
|
|
req.address := tlIn.a.bits.address
|
|
req.data := tlIn.a.bits.data
|
|
req.size := tlIn.a.bits.size
|
|
// FIXME: req.data is still containing TL-aligned data. This is fine if
|
|
// we're simply passing through this data out the other end, but not if
|
|
// the outgoing TL edge (tlOut) has different data width from the incoming
|
|
// edge (tlIn). Possible TODO to only store the relevant portion of the
|
|
// data, at the cost of re-aligning at the outgoing end.
|
|
req.mask := tlIn.a.bits.mask
|
|
|
|
val enq = reqQueues.io.queue.enq(lane)
|
|
val deq = reqQueues.io.queue.deq(lane)
|
|
enq.valid := tlIn.a.valid
|
|
enq.bits := req
|
|
// Only allow dequeue when uncoalescer is ready to record the current
|
|
// queue entries
|
|
// TODO: deq.ready should also respect downstream arbiter
|
|
deq.ready := uncoalescer.io.coalReq.ready
|
|
// Stall upstream core or memtrace driver when shiftqueue is not ready
|
|
tlIn.a.ready := enq.ready
|
|
tlOut.a.valid := deq.valid
|
|
tlOut.a.bits := deq.bits.toTLA(edgeOut)
|
|
|
|
// debug
|
|
// when (tlIn.a.valid) {
|
|
// TLPrintf(s"tlIn(${lane}).a",
|
|
// tlIn.a.bits.address,
|
|
// tlIn.a.bits.size,
|
|
// tlIn.a.bits.mask,
|
|
// TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode),
|
|
// tlIn.a.bits.data,
|
|
// 0.U
|
|
// )
|
|
// }
|
|
// when (tlOut.a.valid) {
|
|
// TLPrintf(s"tlOut(${lane}).a",
|
|
// tlOut.a.bits.address,
|
|
// tlOut.a.bits.size,
|
|
// tlOut.a.bits.mask,
|
|
// TLUtils.AOpcodeIsStore(tlOut.a.bits.opcode),
|
|
// tlOut.a.bits.data,
|
|
// 0.U
|
|
// )
|
|
// }
|
|
}
|
|
|
|
val (tlCoal, edgeCoal) = outer.coalescerNode.out.head
|
|
|
|
val sourceGen = Module(
|
|
new RoundRobinSourceGenerator(log2Ceil(config.numNewSrcIds), ignoreInUse = false)
|
|
)
|
|
sourceGen.io.gen := coalescer.io.coalReq.fire // use up a source ID only when request is created
|
|
sourceGen.io.reclaim.valid := tlCoal.d.valid
|
|
sourceGen.io.reclaim.bits := tlCoal.d.bits.source
|
|
|
|
val coalReqValid = coalescer.io.coalReq.valid && sourceGen.io.id.valid
|
|
tlCoal.a.valid := coalReqValid
|
|
tlCoal.a.bits := coalescer.io.coalReq.bits.toTLA(edgeCoal)
|
|
tlCoal.a.bits.source := sourceGen.io.id.bits
|
|
|
|
coalescer.io.coalReq.ready := tlCoal.a.ready
|
|
tlCoal.b.ready := true.B
|
|
tlCoal.c.valid := false.B
|
|
// tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.
|
|
tlCoal.e.valid := false.B
|
|
|
|
require(
|
|
tlCoal.params.sourceBits == log2Ceil(config.numNewSrcIds),
|
|
s"tlCoal param `sourceBits` (${tlCoal.params.sourceBits}) mismatches coalescer constant"
|
|
+ s" (${log2Ceil(config.numNewSrcIds)})"
|
|
)
|
|
|
|
require(
|
|
tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8,
|
|
s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant"
|
|
+ s" (${(1 << config.dataBusWidth) * 8})"
|
|
)
|
|
|
|
// ===========================================================================
|
|
// Response flow
|
|
// ===========================================================================
|
|
//
|
|
// Connect uncoalescer output and noncoalesced response ports to the response
|
|
// queues.
|
|
|
|
// The maximum number of requests from a single lane that can go into a
|
|
// coalesced request.
|
|
val numPerLaneReqs = config.queueDepth
|
|
|
|
// FIXME: no need to contain maxCoalLogSize data
|
|
val respQueueEntryT = new Response(
|
|
oldSourceWidth,
|
|
log2Ceil(config.maxCoalLogSize),
|
|
(1 << config.maxCoalLogSize) * 8
|
|
)
|
|
val respQueues = Seq.tabulate(config.numLanes) { _ =>
|
|
Module(
|
|
new MultiPortQueue(
|
|
respQueueEntryT,
|
|
// enq_lanes = 1 + M, where 1 is the response for the original per-lane
|
|
// requests that didn't get coalesced, and M is the maximum number of
|
|
// single-lane requests that can go into a coalesced request.
|
|
// (`numPerLaneReqs`).
|
|
// TODO: potentially expensive, because this generates more FFs.
|
|
// Rather than enqueueing all responses in a single cycle, consider
|
|
// enqueueing one by one (at the cost of possibly stalling downstream).
|
|
1 + numPerLaneReqs,
|
|
// deq_lanes = 1 because we're serializing all responses to 1 port that
|
|
// goes back to the core.
|
|
1,
|
|
// lanes. Has to be at least max(enq_lanes, deq_lanes)
|
|
1 + numPerLaneReqs,
|
|
// Depth of each lane queue.
|
|
// XXX queue depth is set to an arbitrarily high value that doesn't
|
|
// make queue block up in the middle of the simulation. Ideally there
|
|
// should be a more logical way to set this, or we should handle
|
|
// response queue blocking.
|
|
config.respQueueDepth
|
|
)
|
|
)
|
|
}
|
|
val respQueueNoncoalPort = 0
|
|
val respQueueUncoalPortOffset = 1
|
|
|
|
(outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
|
|
case (((tlIn, edgeIn), (tlOut, _)), lane) =>
|
|
// Response queue
|
|
//
|
|
// This queue will serialize non-coalesced responses along with
|
|
// coalesced responses and serve them back to the core side.
|
|
val respQueue = respQueues(lane)
|
|
val resp = Wire(respQueueEntryT)
|
|
resp.fromTLD(tlOut.d.bits)
|
|
|
|
// Queue up responses that didn't get coalesced originally ("noncoalesced" responses).
|
|
// Coalesced (but uncoalesced back) responses will also be enqueued into the same queue.
|
|
assert(
|
|
respQueue.io.enq(respQueueNoncoalPort).ready,
|
|
"respQueue: enq port for noncoalesced response is blocked"
|
|
)
|
|
respQueue.io.enq(respQueueNoncoalPort).valid := tlOut.d.valid
|
|
respQueue.io.enq(respQueueNoncoalPort).bits := resp
|
|
// TODO: deq.ready should respect upstream ready
|
|
respQueue.io.deq(respQueueNoncoalPort).ready := true.B
|
|
|
|
tlIn.d.valid := respQueue.io.deq(respQueueNoncoalPort).valid
|
|
tlIn.d.bits := respQueue.io.deq(respQueueNoncoalPort).bits.toTLD(edgeIn)
|
|
|
|
// Debug only
|
|
val inflightCounter = RegInit(UInt(32.W), 0.U)
|
|
when(tlOut.a.valid) {
|
|
// don't inc/dec on simultaneous req/resp
|
|
when(!tlOut.d.valid) {
|
|
inflightCounter := inflightCounter + 1.U
|
|
}
|
|
}.elsewhen(tlOut.d.valid) {
|
|
inflightCounter := inflightCounter - 1.U
|
|
}
|
|
|
|
dontTouch(inflightCounter)
|
|
dontTouch(tlIn.a)
|
|
dontTouch(tlIn.d)
|
|
dontTouch(tlOut.a)
|
|
dontTouch(tlOut.d)
|
|
}
|
|
|
|
// connect coalesced request that is newly generated and being recorded in
|
|
// the uncoalescer
|
|
uncoalescer.io.coalReq <> coalescer.io.coalReq
|
|
// We can't simply use coalescer.io.coalReq.valid here.
|
|
// coalescer.io.coalReq.valid tells us when there exists a valid coalescing
|
|
// combination, but not when we can actually fire that to downstream, because
|
|
// we can still be blocked by source ID clashes due to backpressure.
|
|
// So, we have to overwrite just the valid bit with the final valid that
|
|
// indicates when we can send this request out.
|
|
// NOTE(hansung): this feels slightly awkward. Maybe doing sourcegen inside
|
|
// the coalescer so that it gives the final call is better, but that may be
|
|
// too much IO for the coalescer.
|
|
uncoalescer.io.coalReq.valid := coalReqValid
|
|
uncoalescer.io.invalidate := coalescer.io.invalidate
|
|
val reqQueueHeads = reqQueues.io.queue.deq.map(_.bits)
|
|
uncoalescer.io.windowElts := reqQueues.io.elts
|
|
// connect coalesced response going into the uncoalescer, ready to be
|
|
// uncoalesced
|
|
// Cleanup: custom <>?
|
|
uncoalescer.io.coalResp.valid := tlCoal.d.valid
|
|
uncoalescer.io.coalResp.bits.fromTLD(tlCoal.d.bits)
|
|
// uncoalescer backpressure
|
|
tlCoal.d.ready := uncoalescer.io.coalResp.ready
|
|
|
|
// Connect uncoalescer results back into each lane's response queue
|
|
(respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach {
|
|
case ((q, perLaneResps), lane) =>
|
|
perLaneResps.zipWithIndex.foreach { case (resp, i) =>
|
|
// TODO: rather than crashing, deassert tlOut.d.ready to stall downstream
|
|
// cache. This should ideally not happen though.
|
|
assert(
|
|
q.io.enq(respQueueUncoalPortOffset + i).ready,
|
|
s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}"
|
|
)
|
|
q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid
|
|
q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits
|
|
// debug
|
|
// when (resp.valid) {
|
|
// printf(s"${i}-th uncoalesced response came back from lane ${lane}\n")
|
|
// }
|
|
// dontTouch(q.io.enq(respQueueCoalPortOffset))
|
|
}
|
|
}
|
|
|
|
// Debug
|
|
dontTouch(coalescer.io.coalReq)
|
|
val coalRespData = tlCoal.d.bits.data
|
|
dontTouch(coalRespData)
|
|
|
|
dontTouch(tlCoal.a)
|
|
dontTouch(tlCoal.d)
|
|
}
|
|
|
|
/** Records every coalesced request that gets generated and, once the matching
  * coalesced response comes back, splits that response into the individual
  * per-lane responses.
  *
  * Bookkeeping is delegated to an [[InflightCoalReqTable]]:
  *  - a valid `io.coalReq` enqueues a new table entry built from the
  *    coalescer's `invalidate` bitmap and the request queue contents
  *    (`io.windowElts`);
  *  - a valid `io.coalResp` looks the entry up by source ID, which also frees
  *    it, and drives `io.uncoalResps`.
  *
  * @param config      coalescer configuration
  * @param nonCoalReqT Chisel type of one request-queue entry
  * @param coalReqT    Chisel type of a coalesced request
  */
class Uncoalescer(
    config: CoalescerConfig,
    nonCoalReqT: NonCoalescedRequest,
    coalReqT: CoalescedRequest
) extends Module {
  val inflightTable = Module(new InflightCoalReqTable(config))
  val io = IO(new Bundle {
    // Generated coalesced request, connected to the output of the coalescer.
    val coalReq = Flipped(DecoupledIO(coalReqT.cloneType))
    // Invalidate signal coming out of the coalescer: one bit per
    // (lane, queue entry) that got merged into `coalReq`.
    val invalidate = Input(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
    // Coalescing window, connected to the contents of the request queues.
    // The uncoalescer looks at the queue entries that got coalesced into
    // `coalReq` in order to record which lanes this coalReq originally came
    // from. We only care about window.elts because the coalescer would have
    // made sure it only looked at the valid entries.
    // TODO: duplicate type construction
    val windowElts = Input(Vec(config.numLanes, Vec(config.queueDepth, nonCoalReqT)))
    // Coalesced response coming back from downstream.
    val coalResp = Flipped(Decoupled(new CoalescedResponse(config)))
    // Per-lane, per-queue-slot responses reconstructed from `coalResp`.
    val uncoalResps = Output(
      Vec(
        config.numLanes,
        Vec(config.queueDepth, ValidIO(new NonCoalescedResponse(config)))
      )
    )
  })

  // If the inflight table is full, we cannot accept new requests to record
  // them. This might happen when we sent out many requests and exhausted all
  // source IDs, but they haven't come back yet.
  io.coalReq.ready := inflightTable.io.enq.ready

  /** Constructs a new inflight table entry from the generated coalesced
    * request and the coalescer's invalidate bitmap.
    */
  def generateInflightTableEntry: InflightCoalReqTableEntry = {
    val newEntry = Wire(inflightTable.entryT)
    newEntry.source := io.coalReq.bits.source
    // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of
    // the coalescer to every (numLanes * queueDepth) entry in the inflight
    // table.
    (newEntry.lanes zip io.invalidate.bits).zipWithIndex
      .foreach { case ((laneEntry, laneInv), lane) =>
        (laneEntry.reqs zip laneInv.asBools).zipWithIndex
          .foreach { case ((reqEntry, inv), i) =>
            val req = io.windowElts(lane)(i)
            val coalesced = io.invalidate.valid && inv
            when(coalesced) {
              printf(
                s"coalescer: reqQueue($lane)($i) got invalidated (source=%d)\n",
                req.source
              )
            }
            reqEntry.valid := coalesced
            reqEntry.source := req.source
            // Word offset of this request inside the coalesced block.
            reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordSizeWidth)
            reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size)
            // TODO: load/store op
          }
      }
    assert(
      !(io.coalReq.valid && io.coalResp.valid &&
        (newEntry.source === io.coalResp.bits.source)),
      "inflight table: enqueueing and looking up the same srcId at the same cycle is not handled"
    )
    dontTouch(newEntry)

    newEntry
  }
  inflightTable.io.enq.valid := io.coalReq.valid
  inflightTable.io.enq.bits := generateInflightTableEntry

  // Look up the table with incoming coalesced responses
  inflightTable.io.lookup.ready := io.coalResp.valid
  inflightTable.io.lookupSourceId := io.coalResp.bits.source
  io.coalResp.ready := true.B // FIXME, see sw model implementation

  /** Selects the 32-bit chunk at word index `offset` out of `data`.
    *
    * @param data      coalesced response data
    * @param dataWidth width of `data` in bits (elaboration-time constant)
    * @param offset    word index of the chunk to extract
    * @param logSize   log2 of the access size in bytes; only 2 (4 bytes) is
    *                  supported at the moment
    */
  def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = {
    assert(logSize === 2.U, "currently only supporting 4-byte accesses. TODO")

    // sizeInBits should be simulation-only construct
    val sizeInBits = ((1.U << logSize) << 3.U).asUInt
    // `sizeInBits` is a hardware value, so it has to be printed through a
    // printf-style argument; interpolating it with s"" would print its Scala
    // toString at elaboration time instead of the simulated value.
    assert(
      (dataWidth > 0).B && (dataWidth.U % sizeInBits === 0.U),
      s"coalesced data width ($dataWidth) not evenly divisible by core req size (%d)",
      sizeInBits
    )

    // Chunk width is fixed to 32 bits, matching the 4-byte-only assert above.
    val numChunks = dataWidth / 32
    val chunks = Wire(Vec(numChunks, UInt(32.W)))
    chunks.zipWithIndex.foreach { case (c, o) =>
      // FIXME: whether to take the offset from MSB or LSB depends on
      // endianness. Right now we're assuming little endian
      c := data(32 * (o + 1) - 1, 32 * o)
      // If taking from MSB:
      // c := (data >> (dataWidth - (o + 1) * 32)) & sizeMask
    }
    chunks(offset) // MUX
  }

  // Un-coalesce responses back to individual lanes
  val found = inflightTable.io.lookup.bits
  // `lookup.valid` only says that the entry indexed by the response's source
  // bits is occupied; it is asserted regardless of whether a response is
  // actually being presented. Qualify it with `coalResp.valid` so that stale
  // source bits on an idle response channel cannot produce spurious per-lane
  // responses.
  val respHit = io.coalResp.valid && inflightTable.io.lookup.valid
  (found.lanes zip io.uncoalResps).foreach { case (perLane, ioPerLane) =>
    (perLane.reqs zip ioPerLane).foreach { case (oldReq, ioOldReq) =>
      // TODO: spatial-only coalescing: only looking at 0th srcId entry
      ioOldReq.valid := false.B
      ioOldReq.bits := DontCare

      when(respHit && oldReq.valid) {
        ioOldReq.valid := oldReq.valid
        ioOldReq.bits.source := oldReq.source
        val logSize = found.sizeEnumT.enumToLogSize(oldReq.sizeEnum)
        ioOldReq.bits.size := logSize
        ioOldReq.bits.data :=
          getCoalescedDataChunk(
            io.coalResp.bits.data,
            io.coalResp.bits.data.getWidth,
            oldReq.offset,
            logSize
          )
      }
    }
  }
}
|
|
|
|
/** InflightCoalReqTable is a table structure that records, for each unanswered
  * coalesced request, which lane the request originated from, what their
  * original TileLink source IDs were, etc. We use this info to split the
  * coalesced response back to individual per-lane responses with the right
  * metadata.
  *
  * The table has one slot per coalesced (new) source ID and is indexed by
  * that ID, both on enqueue (`io.enq.bits.source`) and on lookup
  * (`io.lookupSourceId`). A successful lookup handshake frees the slot.
  */
class InflightCoalReqTable(config: CoalescerConfig) extends Module {
  val offsetBits =
    config.maxCoalLogSize - config.wordSizeWidth // assumes word offset

  val entries = config.numNewSrcIds
  // Width of the original per-lane source IDs stored inside each entry.
  val sourceWidth = log2Ceil(config.numOldSrcIds)
  // Width of the coalesced source ID that indexes the table. This is derived
  // from the number of slots rather than from the old source ID count, so
  // that no index bits are dropped when numNewSrcIds > numOldSrcIds.
  val lookupSourceWidth = log2Ceil(entries)

  // The entry type uses a single width for both the per-lane (old) source IDs
  // and the coalesced (new) source ID key, so size it to hold either.
  val entryT = new InflightCoalReqTableEntry(
    config.numLanes,
    config.queueDepth,
    sourceWidth max lookupSourceWidth,
    config.maxCoalLogSize,
    config.sizeEnum
  )

  println(s"=========== table sourceWidth: ${sourceWidth}")
  println(s"=========== table offsetBits: ${offsetBits}")
  println(s"=========== table sizeEnumBits: ${entryT.sizeEnumT.getWidth}")

  val io = IO(new Bundle {
    // New entry to record; written to the slot selected by `enq.bits.source`.
    val enq = Flipped(Decoupled(entryT))
    // Contents of the slot selected by `lookupSourceId`.
    // TODO: return actual stuff
    val lookup = Decoupled(entryT)
    // TODO: put this inside decoupledIO
    val lookupSourceId = Input(UInt(lookupSourceWidth.W))
  })

  val table = Mem(
    entries,
    new Bundle {
      val valid = Bool()
      val bits = entryT.cloneType
    }
  )

  // Clear every slot while in reset.
  when(reset.asBool) {
    (0 until entries).foreach { i =>
      table(i).valid := false.B
      table(i).bits.lanes.foreach { l =>
        l.reqs.foreach { r =>
          r.valid := false.B
          r.source := 0.U
          r.offset := 0.U
          r.sizeEnum := config.sizeEnum.INVALID
        }
      }
    }
  }

  val full = Wire(Bool())
  full := (0 until entries).map(table(_).valid).reduce(_ && _)
  dontTouch(full)

  // Enqueue logic
  io.enq.ready := !full
  val enqFire = io.enq.ready && io.enq.valid
  when(enqFire) {
    // TODO: handle enqueueing and looking up the same entry in the same cycle?
    val entryToWrite = table(io.enq.bits.source)
    assert(
      !entryToWrite.valid,
      "tried to enqueue to an already occupied entry"
    )
    entryToWrite.valid := true.B
    entryToWrite.bits := io.enq.bits
  }

  // Lookup logic
  // NOTE: `lookup.valid` reflects the occupancy of the indexed slot only; it
  // does not depend on `lookup.ready`.
  io.lookup.valid := table(io.lookupSourceId).valid
  io.lookup.bits := table(io.lookupSourceId).bits
  // Dequeue as soon as lookup succeeds
  when(io.lookup.fire) {
    table(io.lookupSourceId).valid := false.B
  }

  dontTouch(io.lookup)
}
|
|
|
|
/** One entry of the inflight coalesced request table.
  *
  * @param numLanes       number of lanes
  * @param numPerLaneReqs maximum number of requests from a single lane that
  *                       can get coalesced into a single request
  * @param sourceWidth    bit width used for every source ID field in the entry
  * @param offsetBits     bit width of the per-request offset field
  * @param sizeEnumT      enum used to encode the per-request access size
  */
class InflightCoalReqTableEntry(
    val numLanes: Int,
    val numPerLaneReqs: Int,
    val sourceWidth: Int,
    val offsetBits: Int,
    val sizeEnumT: InFlightTableSizeEnum
) extends Bundle {

  /** Metadata of one original request that was merged into the coalesced
    * request.
    */
  class PerCoreReq extends Bundle {
    val valid: Bool = Bool() // FIXME: delete this
    // FIXME: oldId and newId shares the same width
    val source: UInt = UInt(sourceWidth.W)
    val offset: UInt = UInt(offsetBits.W)
    val sizeEnum = sizeEnumT()
  }

  /** All request slots belonging to a single lane. */
  class PerLane extends Bundle {
    val reqs: Vec[PerCoreReq] = Vec(numPerLaneReqs, new PerCoreReq)
  }

  // sourceId of the coalesced response that just came back. This will be the
  // key that queries the table.
  val source: UInt = UInt(sourceWidth.W)
  val lanes: Vec[PerLane] = Vec(numLanes, new PerLane)
}
|
|
|
|
/** Helpers for classifying TileLink opcodes as store or load. */
object TLUtils {

  /** True when the A-channel `opcode` is PutFullData. Asserts that the opcode
    * is either PutFullData or Get.
    */
  def AOpcodeIsStore(opcode: UInt): Bool = {
    // 0: PutFullData, 1: PutPartialData, 4: Get
    val isPutFull = opcode === TLMessages.PutFullData
    assert(
      isPutFull || opcode === TLMessages.Get,
      "unhandled TL A opcode found"
    )
    isPutFull
  }

  /** True when the D-channel `opcode` is AccessAck. Asserts that the opcode is
    * either AccessAck or AccessAckData.
    */
  def DOpcodeIsStore(opcode: UInt): Bool = {
    val isAck = opcode === TLMessages.AccessAck
    assert(
      isAck || opcode === TLMessages.AccessAckData,
      "unhandled TL D opcode found"
    )
    isAck
  }
}
|
|
|
|
/** Diplomatic wrapper of the memory trace driver: creates one TileLink client
  * node per lane and funnels them through a single identity node.
  *
  * @param config         coalescer configuration (lane count, source ID count)
  * @param filename       trace file to read
  * @param traceHasSource true if the input trace file has an additional source
  *                       ID column. This is useful for using the output trace
  *                       file generated by MemTraceLogger as the driver.
  */
class MemTraceDriver(
    config: CoalescerConfig,
    filename: String,
    traceHasSource: Boolean = false
)(implicit p: Parameters)
    extends LazyModule {
  // Create N client nodes together, one per lane.
  val laneNodes: Seq[TLClientNode] = (0 until config.numLanes).map { lane =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          Seq(
            TLMasterParameters.v1(
              name = s"MemTraceDriver$lane",
              sourceId = IdRange(0, config.numOldSrcIds)
              // visibility = Seq(AddressSet(0x0000, 0xffffff))
            )
          )
        )
      )
    )
  }

  // Combine N outgoing client nodes into 1 identity node for diplomatic
  // connection.
  val node = TLIdentityNode()
  laneNodes.foreach(node := _)

  lazy val module =
    new MemTraceDriverImp(this, config, filename, traceHasSource)
}
|
|
|
|
/** Fields that make up one line of a memory trace. Implementations decide the
  * concrete widths (a single lane, or all lanes flattened together).
  */
trait HasTraceLine {
  val valid: UInt
  val source: UInt
  val address: UInt
  val is_store: UInt
  val size: UInt
  val data: UInt
}

/** A single-lane trace line.
  *
  * Used for both request and response. Response has address set to 0.
  * NOTE: these widths have to agree with what's hardcoded in Verilog.
  */
class TraceLine extends Bundle with HasTraceLine {
  val valid: Bool = Bool()
  val source: UInt = UInt(32.W)
  val address: UInt = UInt(64.W) // FIXME: in Verilog this is the same as data width
  val is_store: Bool = Bool()
  val size: UInt = UInt(8.W) // this is log2(bytesize) as in TL A bundle
  val data: UInt = UInt(64.W)
}
|
|
|
|
/** Implementation of [[MemTraceDriver]].
  *
  * Reads one trace line per lane from the `SimMemTrace` black box, buffers the
  * lines in per-lane queues and replays them as TileLink Get/Put requests on
  * the lane's client node. `io.finished` is raised a fixed number of cycles
  * after the trace reports EOF.
  *
  * @param outer          the enclosing lazy module
  * @param config         coalescer configuration
  * @param filename       trace file to read
  * @param traceHasSource whether the trace file has a source ID column
  */
class MemTraceDriverImp(
    outer: MemTraceDriver,
    config: CoalescerConfig,
    filename: String,
    traceHasSource: Boolean
) extends LazyModuleImp(outer)
    with UnitTestModule {
  // Current cycle mark to read from trace
  val traceReadCycle = RegInit(1.U(64.W))

  // A decoupling queue to handle backpressure from downstream. We let the
  // downstream take requests from the queue individually for each lane,
  // but do synchronized enqueue whenever all lane queues are ready to prevent
  // drifts between the lanes.
  val reqQueues = Seq.fill(config.numLanes)(Module(new Queue(new TraceLine, 2)))
  // Are we safe to read the next warp?
  val reqQueueAllReady = reqQueues.map(_.io.enq.ready).reduce(_ && _)

  val sim = Module(new SimMemTrace(filename, config.numLanes, traceHasSource))
  sim.io.clock := clock
  sim.io.reset := reset.asBool
  // 'sim.io.trace_read.ready' is a ready signal going into the DPI sim,
  // indicating this Chisel module is ready to read the next line.
  sim.io.trace_read.ready := reqQueueAllReady
  sim.io.trace_read.cycle := traceReadCycle

  // Read output from Verilog BlackBox
  // Split output of SimMemTrace, which is flattened across all lanes, back to
  // each lane's.
  val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
  val addrW = laneReqs(0).address.getWidth
  val sizeW = laneReqs(0).size.getWidth
  val dataW = laneReqs(0).data.getWidth
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.trace_read.valid(i)
    req.source := 0.U // driver trace doesn't contain source id
    req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
    req.is_store := sim.io.trace_read.is_store(i)
    req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
    req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
  }

  // Not all fire because trace cycle has to advance even when there is no valid
  // line in the trace.
  when(reqQueueAllReady) {
    traceReadCycle := traceReadCycle + 1.U
  }

  // Enqueue traces to the request queue
  (reqQueues zip laneReqs).foreach { case (reqQ, req) =>
    // Synchronized enqueue
    reqQ.io.enq.valid := reqQueueAllReady && req.valid
    reqQ.io.enq.bits := req // FIXME duplicate valid
  }

  // Issue here is that Vortex mem range is not within Chipyard Mem range
  // In default setting, all mem-req for program data must be within
  // 0X80000000 -> 0X90000000
  def hashToValidPhyAddr(addr: UInt): UInt = {
    Cat(8.U(4.W), addr(27, 0))
  }

  // Take requests off of the queue and generate TL requests
  (outer.laneNodes zip reqQueues).foreach { case (node, reqQ) =>
    val (tlOut, edge) = node.out(0)

    val req = reqQ.io.deq.bits

    val sourceGen = Module(
      new RoundRobinSourceGenerator(
        log2Ceil(config.numOldSrcIds),
        ignoreInUse = false
      )
    )

    // Backpressure from downstream propagates into the queue. The request
    // must also stay in the queue while no source ID is available: in that
    // case tlOut.a.valid is held low, so dequeueing on tlOut.a.ready alone
    // would silently drop the request.
    // NOTE(review): assumes sourceGen.io.id.valid does not combinationally
    // depend on sourceGen.io.gen -- confirm against RoundRobinSourceGenerator.
    reqQ.io.deq.ready := tlOut.a.ready && sourceGen.io.id.valid
    // Use up a source ID only when a request is actually sent out.
    sourceGen.io.gen := reqQ.io.deq.fire
    // assert(sourceGen.io.id.valid)

    // Core only makes accesses of granularity larger than a word, so we want
    // the trace driver to act so as well.
    // That means if req.size is smaller than word size, we need to pad data
    // with zeros to generate a word-size request, and set mask accordingly.
    val wordLogSize = log2Ceil(config.wordSizeInBytes)
    val offsetInWord = req.address % config.wordSizeInBytes.U
    val subword = req.size < wordLogSize.U

    // `mask` is currently unused
    val mask = Wire(UInt(config.wordSizeInBytes.W))
    val wordData = Wire(UInt((config.wordSizeInBytes * 8 * 2).W))
    val sizeInBytes = Wire(UInt((sizeW + 1).W))
    sizeInBytes := (1.U) << req.size
    // The full-word case needs an explicitly sized all-ones literal; a bare
    // `~0.U` is only one bit wide.
    mask := Mux(
      subword,
      (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord,
      ~0.U(config.wordSizeInBytes.W)
    )
    wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
    val wordAlignedAddress =
      req.address & ~((1 << wordLogSize) - 1).U(addrW.W)
    // Sub-word accesses are widened to exactly one word.
    val wordAlignedSize = Mux(subword, wordLogSize.U, req.size)

    val (plegal, pbits) = edge.Put(
      fromSource = sourceGen.io.id.bits,
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize, // trace line already holds log2(size)
      // data should be aligned to beatBytes
      data =
        (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
    )
    val (glegal, gbits) = edge.Get(
      fromSource = sourceGen.io.id.bits,
      toAddress = hashToValidPhyAddr(wordAlignedAddress),
      lgSize = wordAlignedSize
    )
    val legal = Mux(req.is_store, plegal, glegal)
    val bits = Mux(req.is_store, pbits, gbits)

    tlOut.a.valid := (reqQ.io.deq.valid && sourceGen.io.id.valid)
    when(tlOut.a.valid) {
      assert(legal, "illegal TL req gen")
    }
    tlOut.a.bits := bits
    tlOut.b.ready := true.B
    tlOut.c.valid := false.B
    tlOut.d.ready := true.B
    tlOut.e.valid := false.B

    // Reclaim source id on response
    sourceGen.io.reclaim.valid := tlOut.d.valid
    sourceGen.io.reclaim.bits := tlOut.d.bits.source

    // debug
    when(tlOut.a.valid) {
      TLPrintf(
        "MemTraceDriver",
        tlOut.a.bits.source,
        tlOut.a.bits.address,
        tlOut.a.bits.size,
        tlOut.a.bits.mask,
        req.is_store,
        tlOut.a.bits.data,
        req.data
      )
    }
    dontTouch(tlOut.a)
    dontTouch(tlOut.d)
  }

  // Give some slack time after trace EOF to the downstream system to make sure
  // we receive all (hopefully) outstanding responses back.
  val finishCounter = RegInit(200.U(64.W))
  // Saturate at zero so that `io.finished` stays asserted instead of pulsing
  // for a single cycle and then wrapping around.
  when(sim.io.trace_read.finished && finishCounter =/= 0.U) {
    finishCounter := finishCounter - 1.U
  }
  io.finished := (finishCounter === 0.U)

  when(io.finished) {
    assert(
      false.B,
      "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)"
    )
  }
}
|
|
|
|
/** Black box around the Verilog/DPI trace reader.
  *
  * @param filename       trace file, passed as the `FILENAME` parameter
  * @param numLanes       number of lanes, passed as `NUM_LANES`
  * @param traceHasSource passed as `HAS_SOURCE` (1 or 0)
  */
class SimMemTrace(filename: String, numLanes: Int, traceHasSource: Boolean)
    extends BlackBox(
      Map(
        "FILENAME" -> filename,
        "NUM_LANES" -> numLanes,
        "HAS_SOURCE" -> (if (traceHasSource) 1 else 0)
      )
    )
    with HasBlackBoxResource {
  // Per-lane field widths are taken from TraceLine so that both sides agree.
  val traceLineT = new TraceLine
  val addrW = traceLineT.address.getWidth
  val sizeW = traceLineT.size.getWidth
  val dataW = traceLineT.data.getWidth

  // Chisel can't interface with a Verilog 2D port, so every per-lane field is
  // flattened across all lanes into a single wide 1D array.
  private def allLanes(perLaneWidth: Int): UInt =
    Output(UInt((perLaneWidth * numLanes).W))

  val io = IO(new Bundle {
    val clock = Input(Clock())
    val reset = Input(Bool())

    // These names have to match declarations in the Verilog code, eg.
    // trace_read_address.
    val trace_read =
      new Bundle { // can't use HasTraceLine because this doesn't have source
        val ready = Input(Bool())
        val valid = Output(UInt(numLanes.W))
        // TODO: assumes 64-bit address.
        val cycle = Input(UInt(64.W))
        val address = allLanes(addrW)
        val is_store = Output(UInt(numLanes.W))
        val size = allLanes(sizeW)
        val data = allLanes(dataW)
        val finished = Output(Bool())
      }
  })

  Seq(
    "/vsrc/SimMemTrace.v",
    "/csrc/SimMemTrace.cc",
    "/csrc/SimMemTrace.h"
  ).foreach(addResource)
}
|
|
|
|
/** Pass-through TileLink node that snoops the A and D channels of every lane
  * and logs the traffic through `SimMemTraceLogger` black boxes, while also
  * keeping running request/response counters.
  *
  * @param numLanes   number of TileLink edges going through the logger
  * @param filename   base filename for the generated trace files. The full
  *                   filename is suffixed depending on `reqEnable` /
  *                   `respEnable` / `loggerName`.
  * @param reqEnable  log the request (A channel) stream
  * @param respEnable log the response (D channel) stream
  * @param loggerName filename suffix that is unique to this logger module.
  *                   This will be appended to the filename of the generated
  *                   trace.
  */
class MemTraceLogger(
    numLanes: Int,
    filename: String,
    reqEnable: Boolean = true,
    respEnable: Boolean = true,
    loggerName: String = ".logger"
)(implicit
    p: Parameters
) extends LazyModule {
  val node = TLIdentityNode()

  // val beatBytes = 8 // FIXME: hardcoded
  // val node = TLManagerNode(Seq.tabulate(numLanes) { _ =>
  //   TLSlavePortParameters.v1(
  //     Seq(
  //       TLSlaveParameters.v1(
  //         address = List(AddressSet(0x0000, 0xffffff)), // FIXME: hardcoded
  //         supportsGet = TransferSizes(1, beatBytes),
  //         supportsPutPartial = TransferSizes(1, beatBytes),
  //         supportsPutFull = TransferSizes(1, beatBytes)
  //       )
  //     ),
  //     beatBytes = beatBytes
  //   )
  // })

  // Copied from freechips.rocketchip.trailingZeros which only supports Scala
  // integers
  def trailingZeros(x: UInt): UInt = {
    Mux(x === 0.U, x.widthOption.get.U, Log2(x & -x))
  }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) {
    val io = IO(new Bundle {
      val numReqs = Output(UInt(64.W))
      val numResps = Output(UInt(64.W))
      val reqBytes = Output(UInt(64.W))
      val respBytes = Output(UInt(64.W))
    })

    // Running totals, exposed through `io`.
    val numReqs = RegInit(0.U(64.W))
    val numResps = RegInit(0.U(64.W))
    val reqBytes = RegInit(0.U(64.W))
    val respBytes = RegInit(0.U(64.W))
    io.numReqs := numReqs
    io.numResps := numResps
    io.reqBytes := reqBytes
    io.respBytes := respBytes

    val simReq =
      if (reqEnable)
        Some(Module(new SimMemTraceLogger(false, s"${filename}.${loggerName}.req", numLanes)))
      else None
    val simResp =
      if (respEnable)
        Some(Module(new SimMemTraceLogger(true, s"${filename}.${loggerName}.resp", numLanes)))
      else None
    (simReq ++ simResp).foreach { sim =>
      sim.io.clock := clock
      sim.io.reset := reset.asBool
    }

    val laneReqs = Wire(Vec(numLanes, new TraceLine))
    val laneResps = Wire(Vec(numLanes, new TraceLine))

    // Elaboration-time sanity check on a Scala value, hence `require` rather
    // than a hardware assertion.
    require(
      numLanes == node.in.length,
      "`numLanes` does not match the number of TL edges connected to the MemTraceLogger"
    )

    // snoop on the TileLink edges to log traffic
    ((node.in zip node.out) zip (laneReqs zip laneResps)).foreach {
      case (((tlIn, _), (tlOut, _)), (req, resp)) =>
        tlOut.a <> tlIn.a
        tlIn.d <> tlOut.d

        // requests on TL A channel
        //
        // Only log trace when fired, e.g. both upstream and downstream is ready
        // and transaction happened.
        req.valid := tlIn.a.fire
        req.size := tlIn.a.bits.size
        req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
        req.source := tlIn.a.bits.source
        // TL always carries the exact unaligned address that the client
        // originally requested, so no postprocessing required
        req.address := tlIn.a.bits.address

        when(req.valid) {
          TLPrintf(
            s"MemTraceLogger (${loggerName}:downstream)",
            tlIn.a.bits.source,
            tlIn.a.bits.address,
            tlIn.a.bits.size,
            tlIn.a.bits.mask,
            req.is_store,
            tlIn.a.bits.data,
            req.data
          )
        }

        // TL data
        //
        // When tlIn.a.bits.size is smaller than the data bus width, need to
        // figure out which byte lanes we actually accessed so that
        // we can write that to the memory trace.
        // See Section 4.5 Byte Lanes in spec 1.8.1

        // This assert only holds true for PutFullData and not PutPartialData,
        // where HIGH bits in the mask may not be contiguous.
        when(tlIn.a.valid) {
          assert(
            PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
            "mask HIGH popcount do not match the TL size. Partial masks are not allowed for PutFull"
          )
        }
        // The lowest set mask bit gives the byte lane where the data starts;
        // shift the data down to it and keep only `size` bytes.
        val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
        val dataW = tlIn.params.dataBits
        val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
        req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
        // when (req.valid) {
        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
        // }

        // responses on TL D channel
        //
        // Only log trace when fired, e.g. both upstream and downstream is ready
        // and transaction happened.
        resp.valid := tlOut.d.fire
        resp.size := tlOut.d.bits.size
        resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode)
        resp.source := tlOut.d.bits.source
        // NOTE: TL D channel doesn't carry address nor mask, so there's no easy
        // way to figure out which bytes the master actually use. Since we
        // don't care too much about addresses in the trace anyway, just store
        // the entire bits.
        resp.address := 0.U
        resp.data := tlOut.d.bits.data
    }

    // stats
    // Number of valid trace lines in this cycle.
    def countValid(lanes: Vec[TraceLine]): UInt =
      lanes.map(l => Mux(l.valid, 1.U(64.W), 0.U(64.W))).reduce(_ + _)
    // Total bytes moved by the valid trace lines in this cycle.
    def countBytes(lanes: Vec[TraceLine]): UInt =
      lanes.map(l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W))).reduce(_ + _)

    val numReqsThisCycle = countValid(laneReqs)
    val numRespsThisCycle = countValid(laneResps)
    val reqBytesThisCycle = countBytes(laneReqs)
    val respBytesThisCycle = countBytes(laneResps)
    numReqs := numReqs + numReqsThisCycle
    numResps := numResps + numRespsThisCycle
    reqBytes := reqBytes + reqBytesThisCycle
    respBytes := respBytes + respBytesThisCycle

    // Flatten per-lane signals to the Verilog blackbox input.
    //
    // This is a clunky workaround of the fact that Chisel doesn't allow partial
    // assignment to a bitfield range of a wide signal.
    def flattenTrace(
        simIO: Bundle with HasTraceLine,
        perLane: Vec[TraceLine]
    ) = {
      // Lane 0 ends up in the least significant bits of each flattened field.
      simIO.valid := VecInit(perLane.map(_.valid)).asUInt
      simIO.source := VecInit(perLane.map(_.source)).asUInt
      simIO.address := VecInit(perLane.map(_.address)).asUInt
      simIO.is_store := VecInit(perLane.map(_.is_store)).asUInt
      simIO.size := VecInit(perLane.map(_.size)).asUInt
      simIO.data := VecInit(perLane.map(_.data)).asUInt
    }

    simReq.foreach { sim =>
      flattenTrace(sim.io.trace_log, laneReqs)
      assert(
        sim.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
      )
    }
    simResp.foreach { sim =>
      flattenTrace(sim.io.trace_log, laneResps)
      assert(
        sim.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
      )
    }
  }
}
|
|
|
|
/** Black box around the Verilog/DPI trace writer.
  *
  * MemTraceLogger is bidirectional, and `isResponse` is how the DPI module
  * tells itself whether it's logging the request stream or the response
  * stream. This is necessary because we have to generate slightly different
  * trace format depending on this, e.g. response trace will not contain an
  * address column.
  *
  * @param isResponse passed as the `IS_RESPONSE` parameter (1 or 0)
  * @param filename   output trace file, passed as `FILENAME`
  * @param numLanes   number of lanes, passed as `NUM_LANES`
  */
class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int)
    extends BlackBox(
      Map(
        "IS_RESPONSE" -> (if (isResponse) 1 else 0),
        "FILENAME" -> filename,
        "NUM_LANES" -> numLanes
      )
    )
    with HasBlackBoxResource {
  // Per-lane field widths are taken from TraceLine so that both sides agree.
  val traceLineT = new TraceLine
  val sourceW = traceLineT.source.getWidth
  val addrW = traceLineT.address.getWidth
  val sizeW = traceLineT.size.getWidth
  val dataW = traceLineT.data.getWidth

  // Chisel can't interface with a Verilog 2D port, so every per-lane field is
  // flattened across all lanes into a single wide 1D array.
  private def allLanes(perLaneWidth: Int): UInt =
    Input(UInt((perLaneWidth * numLanes).W))

  val io = IO(new Bundle {
    val clock = Input(Clock())
    val reset = Input(Bool())

    val trace_log = new Bundle with HasTraceLine {
      val valid = Input(UInt(numLanes.W))
      val source = allLanes(sourceW)
      // TODO: assumes 64-bit address.
      val address = allLanes(addrW)
      val is_store = Input(UInt(numLanes.W))
      val size = allLanes(sizeW)
      val data = allLanes(dataW)
      val ready = Output(Bool())
    }
  })

  Seq(
    "/vsrc/SimMemTraceLogger.v",
    "/csrc/SimMemTraceLogger.cc",
    "/csrc/SimMemTrace.h"
  ).foreach(addResource)
}
|
|
|
|
class TLPrintf {}

/** Simulation printf helper for a TileLink request. */
object TLPrintf {

  /** Prints one line describing a TileLink request; the data fields are only
    * printed when `is_store` is set.
    *
    * @param printer  label printed at the start of the line
    * @param source   TL source ID
    * @param address  TL address
    * @param size     TL size field
    * @param mask     TL mask field
    * @param is_store whether the request is a store
    * @param tlData   data as carried on the TL channel
    * @param reqData  data of the originating request
    */
  def apply(
      printer: String,
      source: UInt,
      address: UInt,
      size: UInt,
      mask: UInt,
      is_store: Bool,
      tlData: UInt,
      reqData: UInt
  ) = {
    // Fields common to loads and stores.
    printf(
      s"${printer}: TL source=%d, addr=%x, size=%d, mask=%x, store=%d",
      source,
      address,
      size,
      mask,
      is_store
    )
    // Data is only meaningful for stores.
    when(is_store) {
      printf(", tlData=%x, reqData=%x", tlData, reqData)
    }
    // Terminate the line.
    printf("\n")
  }
}
|
|
|
|
// Synthesizable unit tests
|
|
|
|
/** Diplomatic shell of the dummy traffic generator.
  *
  * Creates one TileLink client node per lane and exposes them through a single
  * identity node.  The traffic itself is generated in [[DummyDriverImp]].
  *
  * @param config coalescer configuration; supplies the lane count and the
  *               per-lane source ID range
  */
class DummyDriver(config: CoalescerConfig)(implicit p: Parameters) extends LazyModule {
  // Every lane is an independent client using source IDs [0, numOldSrcIds).
  val laneNodes = Seq.tabulate(config.numLanes) { lane =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          Seq(
            TLMasterParameters.v1(
              name = s"dummy-core-node-$lane",
              sourceId = IdRange(0, config.numOldSrcIds)
              // visibility = Seq(AddressSet(0x0000, 0xffffff))
            )
          )
        )
      )
    )
  }

  // Combine the N outgoing client nodes into one identity node so that the
  // downstream side needs only a single diplomatic connection.
  val node = TLIdentityNode()
  laneNodes.foreach(node := _)

  lazy val module = new DummyDriverImp(this, config)
}
|
|
|
|
/** Hardware for [[DummyDriver]]: issues a Put on every lane in every cycle.
  *
  * The traffic is meaningless; it exists only so that synthesis does not
  * optimize the coalescer away.  `io.finished` is raised when `finishCounter`
  * reaches zero.
  *
  * @param outer  the diplomatic wrapper that owns the per-lane client nodes
  * @param config coalescer configuration (address width, word size, source IDs)
  */
class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
    extends LazyModuleImp(outer)
    with UnitTestModule {
  // Free-running source ID; advances every cycle regardless of handshakes.
  val sourceIdCounter = RegInit(0.U(log2Ceil(config.numOldSrcIds).W))
  sourceIdCounter := sourceIdCounter + 1.U

  // Countdown register; its single next-state assignment is at the bottom of
  // this class.
  val finishCounter = RegInit(10000.U(64.W))
  io.finished := (finishCounter === 0.U)

  outer.laneNodes.zipWithIndex.foreach { case (node, lane) =>
    // Elaboration-time sanity check: each lane node drives exactly one edge.
    require(node.out.length == 1, "each dummy lane node must have exactly one outward edge")

    // Lane-dependent perturbation, computed at elaboration time so that no
    // hardware modulo is generated for what is a constant.
    val laneOffset = (lane % 3).U
    val payload = finishCounter + laneOffset

    // generate dummy traffic to coalescer to prevent it from being optimized
    // out during synthesis
    val address = Wire(UInt(config.addressWidth.W))
    address := Cat(payload, 0.U(config.wordSizeWidth.W))

    val (tl, edge) = node.out(0)
    val (legal, bits) = edge.Put(
      fromSource = sourceIdCounter,
      toAddress = address,
      lgSize = 2.U,
      data = payload
    )
    assert(legal, "illegal TL req gen")

    // Channel A is always valid; D is always accepted; B/C/E are tied off.
    tl.a.valid := true.B
    tl.a.bits := bits
    tl.b.ready := true.B
    tl.c.valid := false.B
    tl.d.ready := true.B
    tl.e.valid := false.B
  }

  // Sum of the data of all D responses that are valid in this cycle.
  val dataSum = outer.laneNodes
    .map { node =>
      val tl = node.out(0)._1
      Mux(tl.d.valid, tl.d.bits.data, 0.U)
    }
    .reduce(_ +& _)

  // Folding the response data into the counter doesn't make much sense, but it
  // prevents the entire uncoalescer from being optimized away.  The decrement
  // and the response sum are combined in ONE assignment: with two separate
  // `:=` statements, last-connect semantics would silently drop the decrement.
  finishCounter := finishCounter - 1.U + dataSum
}
|
|
|
|
/** A dummy harness around the coalescer for use in VLSI flow.
  * Should not instantiate any memtrace modules.
  *
  * Topology: DummyDriver --> CoalescingUnit --> one TLRAM per outgoing edge
  * (numLanes + 1, the extra one being the coalesced edge).
  *
  * Requires `SIMTCoreKey` to be set in the configuration; elaboration fails
  * with an IllegalArgumentException otherwise.
  */
class DummyCoalescer(implicit p: Parameters) extends LazyModule {
  // SIMTCoreKey defaults to None, so fail with a readable message instead of
  // a bare Option.get exception.
  val numLanes = p(SIMTCoreKey)
    .map(_.nLanes)
    .getOrElse(
      throw new IllegalArgumentException(
        "DummyCoalescer requires SIMTCoreKey to be defined in the configuration"
      )
    )
  println(s"============ numLanes: ${numLanes}")
  val config = defaultConfig.copy(numLanes = numLanes)

  val driver = LazyModule(new DummyDriver(config))
  val rams = Seq.fill(config.numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffffff),
        beatBytes = (1 << config.dataBusWidth)
      )
    )
  )

  val coal = LazyModule(new CoalescingUnit(config))

  coal.cpuNode :=* driver.node
  rams.foreach(_.node := coal.aggregateNode)

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    // Forward the unit-test handshake in both directions, as the other
    // harnesses in this file do; the driver's `start` input was previously
    // left undriven.
    driver.module.io.start := io.start
    io.finished := driver.module.io.finished
  }
}
|
|
|
|
/** Unit-test shell that instantiates [[DummyCoalescer]] and forwards the
  * start/finished handshake to and from it.
  *
  * @param timeout passed to the UnitTest base class
  */
class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new DummyCoalescer).module)

  // The test finishes when the harness reports completion.
  io.finished := dut.io.finished
  dut.io.start := io.start
}
|
|
|
|
/** Trace-driven coalescer test harness with loggers on both sides.
  *
  * Topology:
  * tracedriver --> coreside logger --> coalescer --> memside logger --> tlram
  *
  * When the driver reports completion, the request/response counts and byte
  * totals seen by the core-side logger are printed and asserted to match.
  *
  * Requires `SIMTCoreKey` to be set in the configuration; elaboration fails
  * with an IllegalArgumentException otherwise.
  *
  * @param filename trace file name handed to the driver and to both loggers
  */
class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
    extends LazyModule {
  // SIMTCoreKey defaults to None, so fail with a readable message instead of
  // a bare Option.get exception.
  val numLanes = p(SIMTCoreKey)
    .map(_.nLanes)
    .getOrElse(
      throw new IllegalArgumentException(
        "TLRAMCoalescerLogger requires SIMTCoreKey to be defined in the configuration"
      )
    )
  val config = defaultConfig.copy(numLanes = numLanes)

  val driver = LazyModule(new MemTraceDriver(config, filename))
  val coreSideLogger = LazyModule(
    new MemTraceLogger(numLanes, filename, loggerName = "coreside")
  )
  val coal = LazyModule(new CoalescingUnit(config))
  val memSideLogger = LazyModule(
    new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")
  )
  val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffffff),
        beatBytes = (1 << config.dataBusWidth)
      )
    )
  )

  memSideLogger.node :=* coal.aggregateNode
  coal.cpuNode :=* coreSideLogger.node :=* driver.node
  rams.foreach { r => r.node := memSideLogger.node }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    driver.module.io.start := io.start
    io.finished := driver.module.io.finished

    // True when every request seen on the core side has a matching response,
    // both in count and in total bytes.
    val trafficMatches =
      (coreSideLogger.module.io.numReqs === coreSideLogger.module.io.numResps) &&
        (coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes)

    when(io.finished) {
      printf(
        "numReqs=%d, numResps=%d, reqBytes=%d, respBytes=%d\n",
        coreSideLogger.module.io.numReqs,
        coreSideLogger.module.io.numResps,
        coreSideLogger.module.io.reqBytes,
        coreSideLogger.module.io.respBytes
      )
      assert(
        trafficMatches,
        "FAIL: requests and responses traffic to the coalescer do not match"
      )
      // Only claim success when the check actually passed, so a failing run
      // does not also print the SUCCESS line.
      when(trafficMatches) {
        printf("SUCCESS: coalescer response traffic matched requests!\n")
      }
    }
  }
}
|
|
|
|
/** Unit-test shell that instantiates [[TLRAMCoalescerLogger]] and forwards the
  * start/finished handshake to and from it.
  *
  * @param filename trace file name passed to the harness
  * @param timeout  passed to the UnitTest base class
  */
class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(
    implicit p: Parameters
) extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)

  // The test finishes when the harness reports completion.
  io.finished := dut.io.finished
  dut.io.start := io.start
}
|
|
|
|
/** Trace-driven coalescer test harness without loggers.
  *
  * Topology: tracedriver --> coalescer --> tlram
  *
  * Both the driver and the coalescer are built from `defaultConfig`, so the
  * number of RAMs is derived from the same configuration rather than from a
  * separate constant.
  */
class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
  // Taken from the configuration that sizes the driver and the coalescer, so
  // the RAM count below always matches their lane count.
  val numLanes = defaultConfig.numLanes
  val filename = "vecadd.core1.thread4.trace"
  val coal = LazyModule(new CoalescingUnit(defaultConfig))
  val driver = LazyModule(new MemTraceDriver(defaultConfig, filename))
  val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge
    LazyModule(
      // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink
      // edges globally, by way of Diplomacy communicating the TL slave
      // parameters to the upstream nodes.
      new TLRAM(
        address = AddressSet(0x0000, 0xffffffff),
        beatBytes = (1 << defaultConfig.dataBusWidth)
      )
    )
  )

  coal.cpuNode :=* driver.node
  rams.foreach { r => r.node := coal.aggregateNode }

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    // Forward the unit-test handshake to and from the trace driver.
    driver.module.io.start := io.start
    io.finished := driver.module.io.finished
  }
}
|
|
|
|
/** Unit-test shell that instantiates [[TLRAMCoalescer]] and forwards the
  * start/finished handshake to and from it.
  *
  * @param timeout passed to the UnitTest base class
  */
class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  val dut = Module(LazyModule(new TLRAMCoalescer).module)

  // The test finishes when the harness reports completion.
  io.finished := dut.io.finished
  dut.io.start := io.start
}
|
|
|
|
////////////
|
|
////////////
|
|
////////////
|
|
//////////// Code for CoalescerXbar
|
|
////////////
|
|
////////////
|
|
|
|
/** Diplomatic part of the coalescer crossbar.
  *
  * A LazyModule is needed to instantiate the outgoing node.  The graph built
  * here is:
  *
  *   per-lane narrow clients --> width widgets --> nonCoalXbar (round robin) --+
  *                                                                            +--> outputXbar --> node
  *   coalesced-request clients ----------------> coalXbar (round robin) ------+
  *
  * `outputXbar` uses the lowest-index-first arbiter, and the coalesced crossbar
  * is bound to it before the non-coalesced one.
  *
  * @param config coalescer configuration (lane count, number of coalesced
  *               request ports, source ID ranges, word size)
  */
class CoalescerXbar(config: CoalescerConfig)(implicit p: Parameters) extends LazyModule {
  // Illustrative sizing from the original author: let SIMT's word size be 32,
  // and the read/write granularity be 256.

  // One narrow client node per lane for the non-coalesced requests
  // (e.g. 32 nodes of edge size 32), each followed by a width widget.
  val nonCoalNarrowNodes = Seq.tabulate(config.numLanes) { lane =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          Seq(
            TLMasterParameters.v1(
              name = s"NonCoalNarrowNode$lane",
              sourceId = IdRange(0, config.numOldSrcIds)
            )
          )
        )
      )
    )
  }
  val nonCoalWidgets = Seq.fill(config.numLanes)(TLWidthWidget(config.wordSizeInBytes))

  nonCoalWidgets.zip(nonCoalNarrowNodes).foreach { case (widget, client) =>
    widget := client
  }

  // Round-robin TileLink crossbar for the un-coalesced requests, fed by the
  // widgets.
  val nonCoalXbar = LazyModule(new TLXbar(TLArbiter.roundRobin))
  nonCoalWidgets.foreach(widget => nonCoalXbar.node := widget)

  // K client nodes for the coalesced requests (e.g. of edge size 256).
  val coalReqNodes = Seq.tabulate(config.numCoalReqs) { idx =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          Seq(
            TLMasterParameters.v1(
              name = s"CoalReqNode$idx",
              sourceId = IdRange(0, config.numNewSrcIds)
            )
          )
        )
      )
    )
  }
  // Round-robin crossbar for the coalesced requests.
  val coalXbar = LazyModule(new TLXbar(TLArbiter.roundRobin))
  coalReqNodes.foreach(client => coalXbar.node := client)

  // Priority crossbar between coalesced and un-coalesced requests.
  val outputXbar = LazyModule(new TLXbar(TLArbiter.lowestIndexFirst))
  outputXbar.node :=* coalXbar.node
  outputXbar.node :=* nonCoalXbar.node

  // Express the output crossbar as an identity node for simpler downstream
  // connection.
  val node = TLIdentityNode()
  node :=* outputXbar.node

  // Prototype bundles handed to the module implementation.
  val nonCoalEntryT = new NonCoalescedRequest(config)
  val coalEntryT = new CoalescedRequest(config)
  val respNonCoalEntryT = new NonCoalescedResponse(config)
  val respCoalBundleT = new CoalescedResponse(config)

  lazy val module = new CoalescerXbarImpl(
    this,
    config,
    nonCoalEntryT,
    coalEntryT,
    respNonCoalEntryT,
    respCoalBundleT
  )
}
|
|
|
|
/** Hardware for [[CoalescerXbar]].
  *
  * Outward path: each request port is buffered in a one-entry queue whose
  * output drives channel A of the matching client node; the crossbars created
  * in the outer LazyModule handle the rest.
  *
  * Inward path: channel D of each narrow node is converted and forwarded to the
  * matching `nonCoalResps` port; channel D of all coalesced-request nodes is
  * merged by a round-robin arbiter into the single `coalResp` port.
  *
  * @param outer             the diplomatic wrapper owning the nodes
  * @param config            coalescer configuration (port counts)
  * @param nonCoalEntryT     prototype of a non-coalesced request
  * @param coalEntryT        prototype of a coalesced request
  * @param respNonCoalEntryT prototype of a non-coalesced response
  * @param respCoalBundleT   prototype of a coalesced response
  */
class CoalescerXbarImpl(
    outer: CoalescerXbar,
    config: CoalescerConfig,
    nonCoalEntryT: Request,
    coalEntryT: Request,
    respNonCoalEntryT: Response,
    respCoalBundleT: CoalescedResponse
) extends LazyModuleImp(outer) {

  val io = IO(new Bundle {
    val nonCoalReqs = Vec(config.numLanes, Flipped(Decoupled(nonCoalEntryT)))
    val coalReqs = Vec(config.numCoalReqs, Flipped(Decoupled(coalEntryT)))
    val nonCoalResps = Vec(config.numLanes, Decoupled(respNonCoalEntryT))
    val coalResp = Decoupled(respCoalBundleT)
  })

  //
  // Outward request handling
  //

  // Stage 1: one-entry queues (pipe enabled, flow disabled) that receive the
  // requests from upstream.
  val nonCoalReqsQueues = Seq.fill(config.numLanes) {
    Module(new Queue(nonCoalEntryT.cloneType, entries = 1, pipe = true, flow = false))
  }
  val coalReqsQueues = Seq.fill(config.numCoalReqs) {
    Module(new Queue(coalEntryT.cloneType, entries = 1, pipe = true, flow = false))
  }

  // Stage 1a: connect each input port to its queue.
  io.nonCoalReqs.zip(nonCoalReqsQueues).foreach { case (req, queue) =>
    queue.io.enq <> req
  }
  io.coalReqs.zip(coalReqsQueues).foreach { case (req, queue) =>
    queue.io.enq <> req
  }

  // Stage 2: the output of each queue drives channel A of its client node.
  // The crossbars take care of the rest.
  (nonCoalReqsQueues ++ coalReqsQueues)
    .zip(outer.nonCoalNarrowNodes ++ outer.coalReqNodes)
    .foreach { case (queue, clientNode) =>
      val (tlOut, edgeOut) = clientNode.out(0)
      tlOut.a.valid := queue.io.deq.valid
      tlOut.a.bits := queue.io.deq.bits.toTLA(edgeOut)
      queue.io.deq.ready := tlOut.a.ready
    }

  //
  // Inward data handling
  //

  // Un-coalesced responses: convert channel D of each narrow node and forward
  // it to the response port of the same lane.
  outer.nonCoalNarrowNodes.zip(io.nonCoalResps).foreach { case (clientNode, resp) =>
    val tlOut = clientNode.out(0)._1
    val nonCoalResp = Wire(respNonCoalEntryT)
    nonCoalResp.fromTLD(tlOut.d.bits)
    resp.valid := tlOut.d.valid
    resp.bits := nonCoalResp
    tlOut.d.ready := resp.ready
  }

  // Coalesced responses: a round-robin arbiter selects among channel D of the
  // coalesced-request nodes.
  // NOTE(review): the arbiter payload type is cloned from the inward edge of
  // outer.node rather than from a coalReqNodes edge; confirm that the two D
  // bundles have compatible field widths.
  val coalRespRRArbiter = Module(
    new RRArbiter(outer.node.in(0)._1.d.bits.cloneType, config.numCoalReqs)
  )
  outer.coalReqNodes.zipWithIndex.foreach { case (clientNode, idx) =>
    coalRespRRArbiter.io.in(idx) <> clientNode.out(0)._1.d
  }

  // Connect the arbiter output to the coalesced response port.
  val coalRespBundle = Wire(respCoalBundleT)
  coalRespBundle.fromTLD(coalRespRRArbiter.io.out.bits)
  io.coalResp.bits := coalRespBundle
  io.coalResp.valid := coalRespRRArbiter.io.out.valid
  coalRespRRArbiter.io.out.ready := io.coalResp.ready
}
|