diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 46e9e9a..641f399 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -90,7 +90,8 @@ class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends La // node.out.map(_._2).foreach(edge => require(edge.manager.beatBytes == config.maxCoalLogSize, // s"output edges into coalescer node does not have beatBytes = ${config.maxCoalLogSize}")) - val node = TLIdentityNode() + val aggregateNode = TLIdentityNode() + val cpuNode = TLIdentityNode() // Number of maximum in-flight coalesced requests. The upper bound of this // value would be the sourceId range of a single lane. @@ -107,8 +108,9 @@ class CoalescingUnit(config: CoalescerConfig)(implicit p: Parameters) extends La Seq(TLMasterPortParameters.v1(coalParam)) ) - // Connect master node as the first inward edge of the IdentityNode - node :=* coalescerNode + // merge coalescerNode and cpuNode + aggregateNode :=* coalescerNode + aggregateNode :=* TLWidthWidget(config.wordSizeInBytes) :=* cpuNode lazy val module = new CoalescingUnitImp(this, config) } @@ -210,7 +212,10 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e val empty = Bool() })) - val shiftHint = !io.coalescable.reduce(_ || _) + // shift hint is when the heads have no more coalescable left this or next cycle + val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, i) => + c && !(io.invalidate.valid && i) + }.reduce(_ || _) val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _) val syncedDeqValid = io.queue.deq.map(_.valid).reduce(_ || _) @@ -488,18 +493,15 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE val sel = flatReqs.zip(flatMatches).map { case (req, m) => // note: ANDing against addrMask is to conform to active byte lanes requirements // if aligning to LSB suffices, we should add the bitwise AND back - m && ((req.address(config.maxCoalLogSize - 1, 0)/* & addrMask*/) === i.U) + m && ((req.address(config.maxCoalLogSize - 1, config.wordWidth)/* & addrMask*/) === i.U) } // TODO: SW uses priority encoder, not sure about behavior of MuxCase data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) => s -> req.data }) - mask(i) := Mux(i.U < numWords, - MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) => - s -> req.mask - }), - 0.U - ) + mask(i) := MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) => + s -> req.mask + }) } val sourceGen = Module(new ReqSourceGen(log2Ceil(config.numNewSrcIds))) @@ -528,15 +530,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE } class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) { - // Make sure IdentityNode is connected to an upstream node, not just the - // coalescer TL master node - assert(outer.node.in.length >= 2) - assert(outer.node.in(1)._1.params.sourceBits == log2Ceil(config.numOldSrcIds), - s"old source id bits TL param (${outer.node.in(1)._1.params.sourceBits}) mismatch with config") - assert(outer.node.in(1)._1.params.addressBits == config.addressWidth, - s"address width TL param (${outer.node.in(1)._1.params.addressBits}) mismatch with config") + assert(outer.cpuNode.in.length == config.numLanes, + s"number of incoming edges (${outer.cpuNode.in.length}) is not the same as number of lanes") + assert(outer.cpuNode.in.head._1.params.sourceBits == log2Ceil(config.numOldSrcIds), + s"old source id bits TL param (${outer.cpuNode.in.head._1.params.sourceBits}) mismatch with config") + assert(outer.cpuNode.in.head._1.params.addressBits == config.addressWidth, + s"address width TL param (${outer.cpuNode.in.head._1.params.addressBits}) mismatch with config") - val sourceWidth = outer.node.in(1)._1.params.sourceBits + val sourceWidth = outer.cpuNode.in.head._1.params.sourceBits // note we are using word size. assuming all coalescer inputs are word sized val reqQueueEntryT = new ReqQueueEntry(sourceWidth, config.wordWidth, config.addressWidth, config.wordSizeInBytes) val reqQueues = Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth, config)) @@ -553,19 +554,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // Override IdentityNode implementation so that we can instantiate // queues between input and output edges to buffer requests and responses. // See IdentityNode definition in `diplomacy/Nodes.scala`. - (outer.node.in zip outer.node.out).zipWithIndex.foreach { - case (((tlIn, edgeIn), (tlOut, _)), 0) => // TODO: not necessarily 1 master edge - assert( - edgeIn.master.masters(0).name == "CoalescerNode", - "First edge is not connected to the coalescer master node" - ) - // Edge from the coalescer TL master node should simply bypass the identity node, - // except for connecting the outgoing edge to the inflight table, which is done - // down below. - tlOut.a <> tlIn.a - case (((tlIn, _), (tlOut, edgeOut)), i) => + (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach { + case (((tlIn, _), (tlOut, edgeOut)), lane) => // Request queue - val lane = i - 1 val req = Wire(reqQueueEntryT) req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) @@ -592,7 +583,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends tlOut.a.bits := deq.bits.toTLA(edgeOut) } - val (tlCoal, edgeCoal) = outer.coalescerNode.out(0) + val (tlCoal, edgeCoal) = outer.coalescerNode.out.head tlCoal.a.valid := coalescer.io.coalReq.valid tlCoal.a.bits := coalescer.io.coalReq.bits.toTLA(edgeCoal) @@ -643,22 +634,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends val respQueueNoncoalPort = 0 val respQueueUncoalPortOffset = 1 - (outer.node.in zip outer.node.out).zipWithIndex.foreach { - case (((tlIn, edgeIn), (tlOut, _)), 0) => // TODO: not necessarily 1 master edge - assert( - edgeIn.master.masters(0).name == "CoalescerNode", - "First edge is not connected to the coalescer master node" - ) - // Edge from the coalescer TL master node should simply bypass the identity node, - // except for connecting the outgoing edge to the inflight table, which is done - // down below. - tlIn.d <> tlOut.d - case (((tlIn, edgeIn), (tlOut, _)), i) => + (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach { + case (((tlIn, edgeIn), (tlOut, _)), lane) => // Response queue // // This queue will serialize non-coalesced responses along with // coalesced responses and serve them back to the core side. - val lane = i - 1 val respQueue = respQueues(lane) val resp = Wire(respQueueEntryT) resp.fromTLD(tlOut.d.bits) @@ -1564,8 +1545,8 @@ class DummyCoalescer(implicit p: Parameters) extends LazyModule { val coal = LazyModule(new CoalescingUnit(defaultConfig)) - coal.node :=* driver.node - rams.foreach(_.node := coal.node) + coal.cpuNode :=* driver.node + rams.foreach(_.node := coal.aggregateNode) lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { @@ -1604,7 +1585,8 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { ) ) - memSideLogger.node :=* coal.node :=* coreSideLogger.node :=* driver.node + memSideLogger.node :=* coal.aggregateNode + coal.cpuNode :=* coreSideLogger.node :=* driver.node rams.foreach { r => r.node := memSideLogger.node } lazy val module = new Impl @@ -1653,8 +1635,8 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { ) ) - coal.node :=* driver.node - rams.foreach { r => r.node := coal.node } + coal.cpuNode :=* driver.node + rams.foreach { r => r.node := coal.aggregateNode } lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index bbf7459..8f4a77e 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -7,6 +7,7 @@ import freechips.rocketchip.tilelink._ import freechips.rocketchip.util.MultiPortQueue import freechips.rocketchip.diplomacy._ import chipsalliance.rocketchip.config.Parameters +import chisel3.util.{DecoupledIO, Valid} import chisel3.util.experimental.BoringUtils class MultiPortQueueUnitTest extends AnyFlatSpec with ChiselScalatestTester { @@ -80,11 +81,8 @@ class DummyCoalescingUnitTB(implicit p: Parameters) extends LazyModule { val dut = LazyModule(new CoalescingUnit(testConfig)) - val widthWidgets = Seq.tabulate(4) { _ => TLWidthWidget(4)} - (cpuNodes zip widthWidgets).foreach { case (cpuNode, widthWidget) => widthWidget := cpuNode} - - widthWidgets.foreach(dut.node := _) - l2Nodes.foreach(_ := dut.node) + cpuNodes.foreach(dut.cpuNode := _) + l2Nodes.foreach(_ := dut.aggregateNode) lazy val module = new DummyCoalescingUnitTBImp(this) } @@ -99,6 +97,8 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI val coalIO3 = outer.cpuNodes(3).makeIOs() val coalIOs = Seq(coalIO0, coalIO1, coalIO2, coalIO3) +// val coalMasterNode = coal.coalescerNode.makeIOs() + private val reqQueues = coal.module.reqQueues private val coalescer = coal.module.coalescer @@ -107,23 +107,30 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI private val peekIn = Seq( reqQueues.io.queue.enq.map(_.ready), + reqQueues.io.queue.enq.map(_.bits), + reqQueues.io.queue.enq.map(_.valid), reqQueues.io.queue.deq.map(_.bits), reqQueues.io.queue.deq.map(_.valid), coalescer.io.coalReq.ready, coalescer.io.coalReq.bits, coalescer.io.coalReq.valid, + coalescer.io.invalidate, ) - val reqQueueEnqReady = peekIn(0).asInstanceOf[Seq[Bool]].map(x => IO(Output(x.cloneType))) - val reqQueueDeqBits = peekIn(1).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(Output(x.cloneType))) - val reqQueueDeqValid = peekIn(2).asInstanceOf[Seq[Bool]].map(x => IO(Output(x.cloneType))) - val coalReqReady = IO(Output(peekIn(3).asInstanceOf[Bool].cloneType)) - val coalReqBits = IO(Output(peekIn(4).asInstanceOf[ReqQueueEntry].cloneType)) - val coalReqValid = IO(Output(peekIn(5).asInstanceOf[Bool].cloneType)) + val reqQueueEnqReady = peekIn(0).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType)) + val reqQueueEnqBits = peekIn(1).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(x.cloneType)) + val reqQueueEnqValid = peekIn(2).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType)) + val reqQueueDeqBits = peekIn(3).asInstanceOf[Seq[ReqQueueEntry]].map(x => IO(Output(x.cloneType))) + val reqQueueDeqValid = peekIn(4).asInstanceOf[Seq[Bool]].map(x => IO(Output(x.cloneType))) + val coalReqReady = IO(Output(peekIn(5).asInstanceOf[Bool].cloneType)) + val coalReqBits = IO(Output(peekIn(6).asInstanceOf[ReqQueueEntry].cloneType)) + val coalReqValid = IO(Output(peekIn(7).asInstanceOf[Bool].cloneType)) + val coalInvalidate = IO(Output(peekIn(8).asInstanceOf[Valid[Vec[UInt]]].cloneType)) private val peekOut = Seq( - reqQueueEnqReady, reqQueueDeqBits, reqQueueDeqValid, - coalReqReady, coalReqBits, coalReqValid, + reqQueueEnqReady, reqQueueEnqBits, reqQueueEnqValid, + reqQueueDeqBits, reqQueueDeqValid, + coalReqReady, coalReqBits, coalReqValid, coalInvalidate, ) (peekIn zip peekOut).foreach { @@ -142,14 +149,13 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI // coalescer.io.coalReq.ready ) - val reqQueueDeqReady = pokeIn(0).asInstanceOf[Seq[Bool]].map(x => IO(Input(x.cloneType))) + val reqQueueDeqReady = pokeIn(0).asInstanceOf[Seq[Bool]].map(x => IO(x.cloneType)) private val pokeOut = Seq( reqQueueDeqReady ) // TODO: doesn't work yet - /* (pokeIn zip pokeOut).foreach { case (inner: IndexedSeq[Data], outer: Seq[Data]) => (inner zip outer).foreach { case (i, o) => @@ -159,9 +165,7 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI BoringUtils.bore(inner, Seq(outer)) case _ => assert(false, "boring between different data types") - }*/ - -// val coalMasterNode = coal.coalescerNode.makeIOs() + } } object testConfig extends CoalescerConfig( @@ -176,7 +180,7 @@ object testConfig extends CoalescerConfig( numOldSrcIds = 16, numNewSrcIds = 4, respQueueDepth = 4, - coalLogSizes = Seq(3), + coalLogSizes = Seq(4, 5), sizeEnum = DefaultInFlightTableSizeEnum, arbiterOutputs = 4 ) @@ -188,13 +192,8 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { def pokeA( nodes: Seq[TLBundle], - idx: Int, - op: Int, - size: Int, - source: Int, - addr: Int, - mask: Int, - data: Int + idx: Int, op: Int, size: Int, source: Int, addr: Int, mask: Int, data: Int, + valid: Boolean = true, ): Unit = { val node = nodes(idx) // node.a.ready.expect(true.B) // FIXME: this fails currently @@ -206,7 +205,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { node.a.bits.mask.poke(mask.U) node.a.bits.data.poke(data.U) node.a.bits.corrupt.poke(false.B) - node.a.valid.poke(true.B) + node.a.valid.poke(valid.B) } def unsetA(nodes: Seq[TLBundle]): Unit = { @@ -215,31 +214,56 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } } - it should "coalesce fully consecutive accesses at size 4, only once" in { - test(LazyModule(new DummyCoalescingUnitTB()).module) - .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) - { c => - println(s"coalIO length = ${c.coalIOs(0).length}") - val nodes = c.coalIOs.map(_.head) + def expectVec[T <: Data](vec: Seq[T], value: Seq[T]): Unit = { + (vec zip value).foreach { case (a, b) => a.expect(b) } + } - c.reqQueueEnqReady.foreach(_.expect(true.B)) + it should "coalesce fully consecutive accesses at size 4, only once" in { + test(LazyModule(new DummyCoalescingUnitTB()).module) + .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) + { c => + println(s"coalIO length = ${c.coalIOs(0).length}") + val nodes = c.coalIOs.map(_.head) + // TODO: this doesn't work +// c.coalMasterNode.head.a.ready.poke(true.B) - // always ready to take non-coalesced requests - c.reqQueueDeqReady.foreach(_.poke(true.B)) + c.reqQueueEnqReady.foreach(_.expect(true.B)) + pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111) + pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222) + pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333) + pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444) + expectVec(c.reqQueueEnqBits.map(_.data), Seq(0x1111.U, 0x2222.U, 0x3333.U, 0x4444.U)) + c.clock.step() - pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111) - pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222) - pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333) - pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444) + unsetA(nodes) + c.reqQueueDeqValid.foreach(_.expect(false.B)) - c.clock.step() + c.coalReqValid.expect(true.B) + c.coalReqBits.address.expect(0x10.U) + c.coalReqBits.data.expect(BigInt("4444000033330000222200001111", 16) << 128) + c.coalReqBits.mask.expect(0xffff0000L) + c.coalReqBits.size.expect(4.U) + c.coalReqBits.op.expect(1.U) - unsetA(nodes) +// c.coalReqReady.expect(true.B) + c.reqQueueEnqReady.foreach(_.expect(true.B)) + pokeA(nodes, idx = 0, op = 1, size = 2, source = 1, addr = 0xf20, mask = 0xf, data = 0x5555) + pokeA(nodes, idx = 1, op = 1, size = 2, source = 1, addr = 0xf24, mask = 0xf, data = 0x6666, valid = false) + pokeA(nodes, idx = 2, op = 1, size = 2, source = 1, addr = 0xf28, mask = 0xf, data = 0x7777) + pokeA(nodes, idx = 3, op = 1, size = 2, source = 1, addr = 0xf2c, mask = 0xf, data = 0x8888, valid = false) + c.clock.step() - c.clock.step() - c.clock.step() - } - } + c.coalReqValid.expect(true.B) + c.coalReqBits.address.expect(0xf20.U) + c.coalReqBits.data.expect(BigInt("77770000000000005555", 16)) // technically these can be dontcare's + c.coalReqBits.mask.expect(0x0000ffff) + c.coalReqBits.size.expect(4.U) + c.coalReqBits.op.expect(1.U) + + c.clock.step() + c.clock.step() + } + } it should "coalesce identical addresses (stride of 0)" in { test(LazyModule(new DummyCoalescingUnitTB()).module)