// File: radiance/src/main/scala/tilelink/Coalescing.scala
// Last modified: 2023-03-27 01:47:45 -07:00
// 553 lines, 17 KiB, Scala

// See LICENSE.SiFive for license details.
package freechips.rocketchip.tilelink
import chisel3._
import chisel3.util._
import freechips.rocketchip.config.Parameters
import freechips.rocketchip.diplomacy._
// import freechips.rocketchip.devices.tilelink.TLTestRAM
import freechips.rocketchip.util.{ShiftQueue, MultiPortQueue}
import freechips.rocketchip.unittest._
/** Diplomatic wrapper of the coalescing unit.
  *
  * Exposes one `TLIdentityNode` to the rest of the design and privately owns
  * a TileLink client node that issues the coalesced requests. That client is
  * bound to the identity node before anything else, so it occupies edge 0.
  *
  * @param numLanes number of lanes, forwarded unchanged to [[CoalescingUnitImp]]
  */
class CoalescingUnit(numLanes: Int = 1)(implicit p: Parameters)
    extends LazyModule {
  // Pass-through node that upstream and downstream neighbours connect to.
  // Incoming TL requests are captured here and handed to the other side;
  // the implementation inserts its queues between the two sides.
  val node = TLIdentityNode()

  // Upper limit on coalesced requests in flight at once. It also sets the
  // source ID range of the coalescer client below; it cannot usefully exceed
  // the source ID range of a single lane.
  val numInflightCoalRequests = 4

  // Client (master) parameters for the node that emits coalesced requests.
  protected val coalParam = Seq(
    TLMasterParameters.v1(
      name = "CoalescerNode",
      sourceId = IdRange(0, numInflightCoalRequests)
    )
  )
  val coalescerNode = TLClientNode(Seq(TLMasterPortParameters.v1(coalParam)))

  // Bind the coalescer client first so that it becomes inward edge 0 of the
  // identity node; the implementation relies on that position.
  node :=* coalescerNode

  lazy val module = new CoalescingUnitImp(this, numLanes)
}
/** Hardware implementation of [[CoalescingUnit]].
  *
  * Edge 0 of `outer.node` belongs to the coalescer's own client node and is
  * wired straight through. Every other edge is treated as one lane: its
  * requests are buffered in a per-lane request queue and re-issued
  * downstream, and its responses are merged with split coalesced responses in
  * a per-lane response queue before going back upstream.
  *
  * The coalescing decision itself is still a placeholder (see the FIXME/TODO
  * notes below): coalesced requests use a made-up address and simply mirror
  * lane 1's request valid.
  *
  * NOTE(review): in this block neither `tlIn.a.ready` nor `tlOut.d.ready` of
  * the lane edges is assigned, and the queues' `enq.ready` is never read, so
  * no back-pressure is applied yet.
  *
  * @param outer    diplomatic wrapper providing the nodes and
  *                 `numInflightCoalRequests`
  * @param numLanes number of lanes; sizes the per-lane queues and the inflight
  *                 table
  */
class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int)
    extends LazyModuleImp(outer) {
  // Payload buffered for each lane request.
  class ReqQueueEntry(val sourceWidth: Int, val addressWidth: Int)
      extends Bundle {
    val source = UInt(sourceWidth.W)
    val address = UInt(addressWidth.W)
    val data = UInt(64.W /* FIXME hardcoded */ ) // write data
  }
  // Payload buffered for each lane response.
  class RespQueueEntry(val sourceWidth: Int) extends Bundle {
    val source = UInt(sourceWidth.W)
    val data = UInt(64.W /* FIXME hardcoded */ ) // read data
  }

  // node.in(0) is from coalescer TL master node; 1~N are from cores.
  // Fail with a readable message instead of an index error when no lane edge
  // has been connected.
  require(
    outer.node.in.length >= 2,
    "CoalescingUnit needs at least one lane edge besides the coalescer edge"
  )
  // Field widths are taken from the first lane edge.
  val sourceWidth = outer.node.in(1)._1.params.sourceBits
  val addressWidth = outer.node.in(1)._1.params.addressBits

  val reqQueueEntryT = new ReqQueueEntry(sourceWidth, addressWidth)
  val reqQueues = Seq.tabulate(numLanes) { _ =>
    Module(
      new ShiftQueue(reqQueueEntryT, 4 /* FIXME hardcoded */ )
    )
  }

  val respQueueEntryT = new RespQueueEntry(sourceWidth)
  val respQueues = Seq.tabulate(numLanes) { _ =>
    // Module(
    //   new ShiftQueue(respQueueEntryT, 8 /* FIXME depth hardcoded */ )
    // )
    Module(
      new MultiPortQueue(
        respQueueEntryT,
        // enq_lanes = 1 + M, where 1 is the response for the original per-lane
        // requests that didn't get coalesced, and M is the number of coalescer
        // nodes.
        2,
        // deq_lanes = 1 because we're serializing all responses to 1 port that
        // goes back to the core.
        1,
        2,
        4 /* FIXME depth hardcoded */
      )
    )
  }
  // Port 0: from original responses
  // Port 1~M: from M coalescer nodes
  val respQueueCoalPortOffset = 1

  // Per-lane request and response queues
  //
  // Override IdentityNode implementation so that we can instantiate
  // queues between input and output edges to buffer requests and responses.
  // See IdentityNode definition in `diplomacy/Nodes.scala`.
  (outer.node.in zip outer.node.out).zipWithIndex.foreach {
    case (((_, edgeIn), _), 0) =>
      // No need to do anything on the edge from coalescerNode; it is wired
      // through further below. Only check (at elaboration time) that edge 0
      // really is the coalescer.
      assert(
        edgeIn.master.masters(0).name == "CoalescerNode",
        "First edge is not connected to the coalescer master node"
      )
    case (((tlIn, edgeIn), (tlOut, edgeOut)), i) =>
      // Request queue
      //
      // Edge i corresponds to lane i - 1 because edge 0 is the coalescer.
      val reqQueue = reqQueues(i - 1)
      val req = Wire(reqQueueEntryT)
      req.source := tlIn.a.bits.source
      req.address := tlIn.a.bits.address
      req.data := tlIn.a.bits.data
      // Elaboration-time debug print.
      println(s"============ req.source width=${req.source.widthOption.get}")

      reqQueue.io.enq.valid := tlIn.a.valid
      reqQueue.io.enq.bits := req
      // TODO: deq.ready should respect downstream ready
      reqQueue.io.deq.ready := true.B

      tlOut.a.valid := reqQueue.io.deq.valid
      val reqHead = reqQueue.io.deq.bits
      // FIXME: generate Get or Put according to read/write
      val (reqLegal, reqBits) = edgeOut.Get(
        fromSource = reqHead.source,
        // `toAddress` should be aligned to 2**lgSize
        toAddress = reqHead.address,
        lgSize = 0.U
      )
      assert(reqLegal, "unhandled illegal TL req gen")
      tlOut.a.bits := reqBits

      // Response queue
      //
      // This queue will serialize non-coalesced responses along with
      // coalesced responses and serve them back to the core side.
      val respQueue = respQueues(i - 1)
      val resp = Wire(respQueueEntryT)
      resp.source := tlOut.d.bits.source
      resp.data := tlOut.d.bits.data

      // Originally non-coalesced responses. Coalesced (but split) responses
      // will also be enqueued into the same queue.
      respQueue.io.enq(0).valid := tlOut.d.valid
      respQueue.io.enq(0).bits := resp
      // TODO: deq.ready should respect upstream ready
      respQueue.io.deq(0).ready := true.B

      tlIn.d.valid := respQueue.io.deq(0).valid
      val respHead = respQueue.io.deq(0).bits
      val respBits = edgeIn.AccessAck(
        toSource = respHead.source,
        lgSize = 0.U,
        data = respHead.data
      )
      tlIn.d.bits := respBits
      // tlIn.d <> tlOut.d

      // Debug only: counts up on a downstream request without a response in
      // the same cycle, and down on a response without a request.
      val inflightCounter = RegInit(UInt(32.W), 0.U)
      when(tlOut.a.valid) {
        // don't inc/dec on simultaneous req/resp
        when(!tlOut.d.valid) {
          inflightCounter := inflightCounter + 1.U
        }
      }.elsewhen(tlOut.d.valid) {
        inflightCounter := inflightCounter - 1.U
      }

      dontTouch(inflightCounter)
      dontTouch(tlIn.a)
      dontTouch(tlIn.d)
      dontTouch(tlOut.a)
      dontTouch(tlOut.d)
  }

  // Generate coalesced requests
  // FIXME: currently generating bogus coalesced requests
  // The source ID is a free-running counter that advances every cycle.
  val coalSourceId = RegInit(0.U(2.W /* FIXME hardcoded */ ))
  coalSourceId := coalSourceId + 1.U

  val (tlCoal, edgeCoal) = outer.coalescerNode.out(0)
  val coalReqAddress = Wire(UInt(tlCoal.params.addressBits.W))
  // TODO: bogus address
  coalReqAddress := (0xabcd.U + coalSourceId) << 4
  val coalReqValid = Wire(Bool())
  // FIXME: copy lane 1's valid signal. This is completely bogus
  coalReqValid := outer.node.in(1)._1.a.valid

  val (legal, bits) = edgeCoal.Get(
    fromSource = coalSourceId,
    // `toAddress` should be aligned to 2**lgSize
    toAddress = coalReqAddress,
    // 64 bits = 8 bytes = 2**(3) bytes
    lgSize = 3.U
  )
  assert(legal, "unhandled illegal TL req gen")
  tlCoal.a.valid := coalReqValid
  tlCoal.a.bits := bits
  // Channels B, C and E are not used by the coalescer client.
  tlCoal.b.ready := true.B
  tlCoal.c.valid := false.B
  tlCoal.d.ready := true.B
  tlCoal.e.valid := false.B

  // Construct new entry for the inflight table
  val inflightTable = Module(
    new InflightCoalReqTable(
      numLanes,
      sourceWidth,
      outer.numInflightCoalRequests
    )
  )
  val newEntry = Wire(inflightTable.entryT)
  newEntry.respSourceId := coalSourceId
  newEntry.lanes.foreach { l =>
    l.valid := false.B
    l.reqs.foreach { r =>
      // TODO: this part needs the actual coalescing logic to work
      r.valid := true.B
      r.offset := 1.U
      r.size := 2.U
    }
  }
  // Placeholder: mark lanes 0 and 2 as participants of the coalesced request.
  // These connections come after the defaults above, so they win. Lane
  // indices that do not exist for the configured numLanes are skipped, since
  // indexing the Vec out of range would abort elaboration.
  Seq(0, 2).filter(_ < numLanes).foreach { laneIdx =>
    newEntry.lanes(laneIdx).valid := true.B
  }
  dontTouch(newEntry)

  // Populate inflight coalesced request table
  inflightTable.io.enq.valid := coalReqValid
  inflightTable.io.enq.bits := newEntry

  // Look up the table with incoming coalesced responses
  inflightTable.io.lookup.ready := tlCoal.d.valid
  inflightTable.io.lookupSourceId := tlCoal.d.bits.source

  val coalRespData = Wire(UInt(tlCoal.params.dataBits.W))
  coalRespData := tlCoal.d.bits.data

  // Fan the coalesced response out to the response queue of every lane that
  // the looked-up entry marks as valid.
  val found = inflightTable.io.lookup.bits
  found.lanes.zipWithIndex.foreach { case (l, i) =>
    val respQueue = respQueues(i)
    // Defaults; overridden below when the lookup hits.
    respQueue.io.enq(respQueueCoalPortOffset).valid := false.B
    respQueue.io.enq(respQueueCoalPortOffset).bits := DontCare
    when(inflightTable.io.lookup.valid) {
      respQueue.io.enq(respQueueCoalPortOffset).valid := l.valid
      // FIXME: only looking at 0th entry
      respQueue.io.enq(respQueueCoalPortOffset).bits.source := 0.U
      // FIXME: disregard size enum for now
      val sizeMask = (1.U << 4) - 1.U
      val dataWidth = tlCoal.params.dataBits
      respQueue.io.enq(respQueueCoalPortOffset).bits.data :=
        (coalRespData >> (dataWidth - 4)) & sizeMask
    }
    when(l.valid) {
      when(l.reqs(0).valid) {
        printf(s"lane ${i} req 0 is valid!\n")
      }
    }
  }

  // Edge 0 (the coalescer's own edge) is passed through unchanged.
  (outer.node.in zip outer.node.out)(0) match {
    case ((tlIn, edgeIn), (tlOut, _)) =>
      assert(
        edgeIn.master.masters.length == 1 &&
          edgeIn.master.masters(0).name == "CoalescerNode",
        "First edge is not connected to the coalescer master node"
      )
      // TODO: do we need to do anything here?
      tlOut.a <> tlIn.a
      tlIn.d <> tlOut.d
      dontTouch(tlIn.d)
      dontTouch(tlOut.d)
  }

  // Debug
  dontTouch(coalReqValid)
  dontTouch(coalReqAddress)
  dontTouch(coalRespData)
  dontTouch(tlCoal.a)
  dontTouch(tlCoal.d)
}
/** Table of coalesced requests that have been issued but not yet answered.
  *
  * For every such request it stores which lanes contributed to it and the
  * per-source-ID offset and size of each original request, so that a
  * coalesced response can later be split back into per-lane responses with
  * the right metadata.
  *
  * Rows are addressed directly by the coalesced request's source ID: `enq`
  * writes the row selected by `enq.bits.respSourceId`, and `lookup` reads the
  * row selected by `lookupSourceId`, freeing it when the lookup fires.
  *
  * @param numLanes    number of lanes recorded per entry
  * @param sourceWidth bit width of the source IDs used as keys and of the
  *                    per-lane request index
  * @param entries     number of rows in the table
  */
class InflightCoalReqTable(
    val numLanes: Int,
    val sourceWidth: Int,
    val entries: Int
) extends Module {
  val offsetBits = 4 // FIXME hardcoded
  val sizeBits = 2 // FIXME hardcoded
  val entryT =
    new InflightCoalReqTableEntry(numLanes, sourceWidth, offsetBits, sizeBits)

  val io = IO(new Bundle {
    // New entry to record; accepted whenever the table is not full.
    val enq = Flipped(Decoupled(entryT))
    // TODO: return actual stuff
    val lookup = Decoupled(entryT)
    // TODO: put this inside decoupledIO
    val lookupSourceId = Input(UInt(sourceWidth.W))
  })

  // Storage: each row is a valid flag plus the entry payload.
  val table = Mem(
    entries,
    new Bundle {
      val valid = Bool()
      val bits = new InflightCoalReqTableEntry(
        numLanes,
        sourceWidth,
        offsetBits,
        sizeBits
      )
    }
  )

  // While reset is asserted, invalidate every row and zero the per-request
  // offset and size fields.
  when(reset.asBool) {
    for (row <- 0 until entries) {
      table(row).valid := false.B
      for (lane <- table(row).bits.lanes) {
        lane.valid := false.B
        for (coreReq <- lane.reqs) {
          coreReq.offset := 0.U
          coreReq.size := 0.U
        }
      }
    }
  }

  // The table is full when every row is valid.
  val full = Wire(Bool())
  full := (0 until entries).map(row => table(row).valid).reduce(_ && _)
  // Inflight table should never be full. It should have enough number of
  // entries to keep track of all outstanding core-side requests; otherwise,
  // it will stall the core issuing logic.
  assert(!full, "table is blocking coalescer")
  dontTouch(full)

  // Enqueue logic
  //
  io.enq.ready := !full
  val enqFire = io.enq.ready && io.enq.valid
  when(enqFire) {
    // TODO: handle enqueueing and looking up the same entry in the same cycle?
    val entryToWrite = table(io.enq.bits.respSourceId)
    assert(
      !entryToWrite.valid,
      "tried to enqueue to an already occupied entry"
    )
    entryToWrite.valid := true.B
    entryToWrite.bits := io.enq.bits
  }

  // Lookup logic
  //
  io.lookup.valid := table(io.lookupSourceId).valid
  io.lookup.bits := table(io.lookupSourceId).bits
  val lookupFire = io.lookup.ready && io.lookup.valid
  // Dequeue as soon as lookup succeeds
  when(lookupFire) {
    table(io.lookupSourceId).valid := false.B
  }

  dontTouch(io.lookup)
}
/** Payload of one [[InflightCoalReqTable]] row.
  *
  * @param numLanes    number of lanes tracked per entry
  * @param sourceWidth bit width of a source ID; each lane holds
  *                    `2^sourceWidth` request slots
  * @param offsetBits  bit width of a request's `offset` field
  * @param sizeBits    bit width of a request's `size` field
  */
class InflightCoalReqTableEntry(
    val numLanes: Int,
    val sourceWidth: Int,
    val offsetBits: Int,
    val sizeBits: Int
) extends Bundle {
  // One request slot per possible source ID value. This is a plain Scala
  // constant, not a hardware field.
  private val reqSlotsPerLane = 1 << sourceWidth

  // Metadata of a single original (core-side) request.
  class CoreReq extends Bundle {
    val valid = Bool()
    val offset = UInt(offsetBits.W)
    val size = UInt(sizeBits.W)
  }

  // Everything recorded for one lane.
  class PerLane extends Bundle {
    val valid = Bool()
    // srcId is positionally encoded: the slot index is the source ID.
    val reqs = Vec(reqSlotsPerLane, new CoreReq)
  }

  // sourceId of the coalesced response that just came back. This will be the
  // key that queries the table.
  val respSourceId = UInt(sourceWidth.W)
  val lanes = Vec(numLanes, new PerLane)
}
/** Diplomatic wrapper of the memory-trace driver.
  *
  * Creates one TileLink client node per lane and funnels all of them through
  * a single identity node, so that the whole driver can be connected with one
  * diplomatic binding.
  *
  * @param numLanes number of lanes, i.e. number of client nodes created
  */
class MemTraceDriver(numLanes: Int = 1)(implicit p: Parameters)
    extends LazyModule {
  // One client node per lane. Each is named after its lane index and owns
  // source IDs 0 to 15.
  val laneNodes = Seq.tabulate(numLanes) { laneIdx =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          Seq(
            TLMasterParameters.v1(
              name = s"MemTraceDriver$laneIdx",
              sourceId = IdRange(0, 0x10)
              // visibility = Seq(AddressSet(0x0000, 0xffffff))
            )
          )
        )
      )
    )
  }

  // Combine the N outgoing client nodes into 1 identity node for diplomatic
  // connection.
  val node = TLIdentityNode()
  for (laneNode <- laneNodes) node := laneNode

  lazy val module = new MemTraceDriverImp(this, numLanes)
}
// One lane's memory-trace request, as unpacked from the flattened SimMemTrace
// outputs in MemTraceDriverImp.
class TraceReq extends Bundle {
// Drives the lane's TileLink A-channel valid.
val valid = Bool()
// Target address of the request.
val address = UInt(64.W)
// Selects between generating a Put (true) and a Get (false).
val is_store = Bool()
// Filled from the trace's store_mask; not consumed by the visible driver code.
val mask = UInt(8.W)
// Payload used when a Put is generated.
val data = UInt(64.W)
}
/** Hardware implementation of [[MemTraceDriver]].
  *
  * Reads per-lane memory requests from the `SimMemTrace` black box and turns
  * each one into a TileLink Get or Put on that lane's client node. The trace
  * reader is always ready and the A channel's `ready` is not consulted, so
  * requests are issued without back-pressure. `io.finished` mirrors the trace
  * reader's `finished` output.
  *
  * @param outer    diplomatic wrapper providing the per-lane client nodes
  * @param numLanes number of lanes read from the trace each cycle
  */
class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int)
    extends LazyModuleImp(outer)
    with UnitTestModule {
  val sim = Module(
    new SimMemTrace(filename = "vecadd.core1.thread4.trace", numLanes)
  )
  sim.io.clock := clock
  sim.io.reset := reset.asBool
  sim.io.trace_read.ready := true.B

  // Split output of SimMemTrace, which is flattened across all lanes,
  // back to each lane's. Lane i owns bit i of the 1-bit-per-lane signals and
  // the i-th 64-bit (address, data) or 8-bit (mask) slice of the wide ones.
  // Explicit bit extraction is used so that nothing depends on implicit
  // truncation when connecting a wider value to a narrower field.
  val laneReqs = Wire(Vec(numLanes, new TraceReq))
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.trace_read.valid(i)
    req.address := sim.io.trace_read.address(64 * i + 63, 64 * i)
    req.is_store := sim.io.trace_read.is_store(i)
    req.mask := sim.io.trace_read.store_mask(8 * i + 7, 8 * i)
    req.data := sim.io.trace_read.data(64 * i + 63, 64 * i)
  }

  // To prevent collision of sourceId with a current in-flight message,
  // just use a counter that increments indefinitely as the sourceId of new
  // messages.
  val sourceIdCounter = RegInit(0.U(64.W))
  sourceIdCounter := sourceIdCounter + 1.U

  // Connect each lane to its respective TL node.
  (outer.laneNodes zip laneReqs).foreach { case (node, req) =>
    val (tlOut, edge) = node.out(0)
    tlOut.a.valid := req.valid
    // Build both message kinds and pick one below based on is_store.
    val (plegal, pbits) = edge.Put(
      fromSource = sourceIdCounter,
      toAddress = req.address,
      // Memory trace addresses are not necessarily aligned to word boundaries
      // so leave lgSize to 0
      // NOTE: this is in bytes not bits
      lgSize = 0.U,
      data = req.data
    )
    val (glegal, gbits) = edge.Get(
      fromSource = sourceIdCounter,
      toAddress = req.address,
      lgSize = 0.U
    )
    val legal = Mux(req.is_store, plegal, glegal)
    val bits = Mux(req.is_store, pbits, gbits)
    assert(legal, "unhandled illegal TL req gen")
    tlOut.a.bits := bits
    // Channels B, C and E are unused; D responses are always accepted and
    // not inspected here.
    tlOut.b.ready := true.B
    tlOut.c.valid := false.B
    tlOut.d.ready := true.B
    tlOut.e.valid := false.B
    dontTouch(tlOut.a)
  }

  io.finished := sim.io.trace_read.finished

  // Clock Counter, for debugging purpose
  val clkcount = RegInit(0.U(64.W))
  clkcount := clkcount + 1.U
  dontTouch(clkcount)
}
// Black-box wrapper around the Verilog/C++ memory-trace reader. `filename`
// and `numLanes` are passed to the Verilog module as the FILENAME and
// NUM_LANES parameters; the implementation comes from the resource files
// added at the bottom of the class.
class SimMemTrace(val filename: String, numLanes: Int)
extends BlackBox(
Map("FILENAME" -> filename, "NUM_LANES" -> numLanes)
)
with HasBlackBoxResource {
val io = IO(new Bundle {
val clock = Input(Clock())
val reset = Input(Bool())
// These names have to match declarations in the Verilog code, eg.
// trace_read_address.
val trace_read = new Bundle {
val ready = Input(Bool())
// One valid bit per lane.
val valid = Output(UInt(numLanes.W))
// Chisel can't interface with Verilog 2D port, so flatten all lanes into
// single wide 1D array.
// TODO: assumes 64-bit address.
val address = Output(UInt((64 * numLanes).W))
// One is_store bit per lane.
val is_store = Output(UInt(numLanes.W))
// 8 mask bits per lane.
val store_mask = Output(UInt((8 * numLanes).W))
// 64 data bits per lane.
val data = Output(UInt((64 * numLanes).W))
// Forwarded by MemTraceDriverImp as its `finished` signal; presumably
// raised once the trace has been fully read — confirm in SimMemTrace.v.
val finished = Output(Bool())
}
})
// Verilog shell plus the C++ sources backing it.
addResource("/vsrc/SimMemTrace.v")
addResource("/csrc/SimMemTrace.cc")
addResource("/csrc/SimMemTrace.h")
}
/** Test topology: memory-trace driver -> coalescing unit -> one RAM per
  * output edge.
  *
  * The coalescing unit has `numLanes + 1` outward edges (one per lane plus the
  * coalescer's own), so `numLanes + 1` RAMs are instantiated and each is bound
  * to the unit's node.
  */
class CoalConnectTrace(implicit p: Parameters) extends LazyModule {
  // TODO: use parameters for numLanes
  val numLanes = 4

  val coal = LazyModule(new CoalescingUnit(numLanes))
  val driver = LazyModule(new MemTraceDriver(numLanes))
  coal.node :=* driver.node

  // Use TLRAM instances as bogus downstream TL manager nodes
  // TODO: swap this out with a memtrace logger
  val rams = Seq.fill(numLanes + 1) {
    // TODO: properly propagate beatBytes?
    LazyModule(
      new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8)
    )
  }
  // Connect all (N+1) outputs of coal to separate RAM modules
  for (ram <- rams) ram.node := coal.node

  lazy val module = new Impl
  // Forwards the unit-test start/finished handshake to the trace driver.
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    driver.module.io.start := io.start
    io.finished := driver.module.io.finished
  }
}
// Unit-test harness: elaborates the CoalConnectTrace topology and forwards
// the UnitTest start/finished handshake to it. `timeout` is handed to the
// UnitTest base class unchanged (presumably a cycle budget — confirm against
// the UnitTest definition).
class CoalescingUnitTest(timeout: Int = 500000)(implicit p: Parameters)
extends UnitTest(timeout) {
// The LazyModule is created and turned into hardware in a single expression.
val dut = Module(LazyModule(new CoalConnectTrace).module)
dut.io.start := io.start
io.finished := dut.io.finished
}