From 7780250c7a34d6c4ff1091d4e842a1e38c4b2831 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 19:17:07 -0700 Subject: [PATCH] More comments & renames --- src/main/scala/tilelink/Coalescing.scala | 101 +++++---- .../scala/coalescing/CoalescingUnitTest.scala | 192 ++++++++++-------- 2 files changed, 171 insertions(+), 122 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 71b108c..deeb403 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -36,22 +36,22 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum { } case class CoalescerConfig( - numLanes: Int, // number of lanes (or threads) in a warp - maxSize: Int, // maximum burst size (64 bytes) - queueDepth: Int, // request window per lane - waitTimeout: Int, // max cycles to wait before forced fifo dequeue, per lane - addressWidth: Int, // assume <= 32 - dataBusWidth: Int, // memory-side downstream TileLink data bus size - // this has to be at least larger than the word size for - // the coalescer to perform well - // watermark = 2, // minimum buffer occupancy to start coalescing - wordSizeInBytes: Int, // 32-bit system - wordWidth: Int, // log(WORD_SIZE) - numOldSrcIds: Int, // num of outstanding requests per lane, from processor - numNewSrcIds: Int, // num of outstanding coalesced requests - respQueueDepth: Int, // depth of the response fifo queues - coalSizes: Seq[Int], // list of coalescer sizes to try in the MonoCoalescers - // must be power of 2's + numLanes: Int, // number of lanes (or threads) in a warp + maxSize: Int, // maximum burst size (64 bytes) + queueDepth: Int, // request window per lane + waitTimeout: Int, // max cycles to wait before forced fifo dequeue, per lane + addressWidth: Int, // assume <= 32 + dataBusWidth: Int, // memory-side downstream TileLink data bus size + // this has to be at least larger than the word size for + // the coalescer to perform well + // watermark = 2, // minimum buffer occupancy to start coalescing + wordSizeInBytes: Int, // 32-bit system + wordWidth: Int, // log(WORD_SIZE) + numOldSrcIds: Int, // num of outstanding requests per lane, from processor + numNewSrcIds: Int, // num of outstanding coalesced requests + respQueueDepth: Int, // depth of the response fifo queues + coalLogSizes: Seq[Int], // list of coalescer sizes to try in the MonoCoalescers + // each size is log(byteSize) sizeEnum: InFlightTableSizeEnum ) @@ -69,7 +69,7 @@ object defaultConfig extends CoalescerConfig( numOldSrcIds = 16, numNewSrcIds = 4, respQueueDepth = 4, - coalSizes = Seq(3), + coalLogSizes = Seq(3), sizeEnum = DefaultInFlightTableSizeEnum ) @@ -243,7 +243,7 @@ class CoalShiftQueue[T <: Data]( } // Software model: coalescer.py -class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], +class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], config: CoalescerConfig) extends Module { val io = IO(new Bundle { val window = Input(Vec(config.numLanes, windowT.io.cloneType)) @@ -251,6 +251,8 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W)) val baseAddr = Output(UInt(config.addressWidth.W)) val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W))) + // number of entries matched with this leader lane's head. + // maximum is numLanes * queueDepth val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth).W)) val coverageHits = Output(UInt((1 << config.maxSize).W)) }) @@ -284,7 +286,7 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], printQueueHeads } - val size = coalSize + val size = coalLogSize val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = { (req0.op === req1.op) && @@ -323,6 +325,18 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], })(chosenLeaderIdx) val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx) + // coverage calculation + def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth) + // 2-D table flattened to 1-D + val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address))) + val valids = io.window.map(_.mask).flatMap(_.asBools) + val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target => + // count if any of the queue entries accesses the given offset word of the + // coalesced chunk; if 1 for all offsets, we've reached 100% utilization + // of the coalesced data words + (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _) + } + // debug prints when (leadersValid.reduce(_ || _)) { matchCounts.zipWithIndex.foreach { case (count, i) => @@ -334,14 +348,12 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], printf("%d ", m) } printf("]\n") - } - // coverage calculation - def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth) - val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address))) - val valids = io.window.map(_.mask).flatMap(_.asBools) - val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target => - (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _) + printf("hits = [ ") + hits.foreach { m => + printf("%d ", m) + } + printf("]\n") } io.results.leaderIdx := chosenLeaderIdx @@ -356,16 +368,19 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE config: CoalescerConfig) extends Module { val io = IO(new Bundle { + // coalescing window, connected to the contents of the request queues val window = Input(Vec(config.numLanes, windowT.io.cloneType)) + // newly generated coalesced request val outReq = DecoupledIO(coalReqT.cloneType) + // invalidate signals going into each request queue's head val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))) }) - val coalescers = config.coalSizes.map(size => Module(new MonoCoalescer(size, windowT, config))) + val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config))) coalescers.foreach(_.io.window := io.window) - def normalize(x: Seq[UInt]): Seq[UInt] = { - x.zip(config.coalSizes).map { case (hits, size) => + def normalize(valPerSize: Seq[UInt]): Seq[UInt] = { + (valPerSize zip config.coalLogSizes).map { case (hits, size) => (hits << (config.maxSize - size).U).asUInt } } @@ -378,27 +393,34 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE }._2 } + // normalize to maximum coalescing size so that we can do fair comparisons + // between coalescing results of different sizes val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount)) val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits)) - val chosenIdx = Wire(UInt(log2Ceil(config.coalSizes.size).W)) + val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W)) val chosenValid = Wire(Bool()) // minimum 25% coverage - val minCoverage = 1.max(1 << (config.maxSize - 4)) + val minCoverage = 1.max(1 << ((config.maxSize - 2) - 2)) + printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount) + printf("normalizedMatches[0]=%d\n", normalizedMatches(0)) + printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits) + printf("normalizedHits[0]=%d\n", normalizedHits(0)) + printf("minCoverage=%d\n", minCoverage.U) when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) { - chosenIdx := argMax(normalizedHits) + chosenSizeIdx := argMax(normalizedHits) chosenValid := true.B }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) { - chosenIdx := argMax(normalizedMatches) + chosenSizeIdx := argMax(normalizedMatches) chosenValid := true.B }.otherwise { - chosenIdx := DontCare + chosenSizeIdx := DontCare chosenValid := false.B } // create coalesced request - val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenIdx) - val chosenSize = VecInit(coalescers.map(_.size.U))(chosenIdx) + val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx) + val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx) // flatten requests and matches val flatReqs = io.window.flatMap(_.elts) @@ -437,13 +459,18 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE val sourceGen = Module(new ReqSourceGen(log2Ceil(config.numNewSrcIds))) sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created + val coalesceValid = chosenValid && sourceGen.io.id.valid + when (coalesceValid) { + printf("coalescing success!\n") + } + io.outReq.bits.source := sourceGen.io.id.bits io.outReq.bits.mask := mask.asUInt io.outReq.bits.data := data.asUInt io.outReq.bits.size := chosenSize io.outReq.bits.address := chosenBundle.baseAddr io.outReq.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op - io.outReq.valid := chosenValid && sourceGen.io.id.valid + io.outReq.valid := coalesceValid io.invalidate.bits := chosenBundle.matchOH io.invalidate.valid := io.outReq.fire // invalidate only when fire diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 56afcd1..20fa053 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -35,28 +35,47 @@ class MultiPortQueueUnitTest extends AnyFlatSpec with ChiselScalatestTester { class DummyCoalescingUnitTB(implicit p: Parameters) extends LazyModule { val cpuNodes = Seq.tabulate(testConfig.numLanes) { _ => - TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters( - name = "processor-nodes", - sourceId = IdRange(0, testConfig.numOldSrcIds), - requestFifo = true, - visibility = Seq(AddressSet(0x0, 0xffffff))))))) // 24 bit address space (TODO probably use testConfig) + TLClientNode( + Seq( + TLMasterPortParameters.v1( + Seq( + TLClientParameters( + name = "processor-nodes", + sourceId = IdRange(0, testConfig.numOldSrcIds), + requestFifo = true, + visibility = Seq(AddressSet(0x0, 0xffffff)) + ) + ) + ) + ) + ) // 24 bit address space (TODO probably use testConfig) } val device = new SimpleDevice("dummy", Seq("dummy")) val beatBytes = 1 << testConfig.dataBusWidth // 256 bit bus val l2Nodes = Seq.tabulate(5) { _ => - TLManagerNode(Seq(TLSlavePortParameters.v1(Seq(TLManagerParameters( - address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode - resources = device.reg, - regionType = RegionType.UNCACHED, - executable = true, - supportsArithmetic = TransferSizes(1, beatBytes), - supportsLogical = TransferSizes(1, beatBytes), - supportsGet = TransferSizes(1, beatBytes), - supportsPutFull = TransferSizes(1, beatBytes), - supportsPutPartial = TransferSizes(1, beatBytes), - supportsHint = TransferSizes(1, beatBytes), - fifoId = Some(0))), beatBytes))) + TLManagerNode( + Seq( + TLSlavePortParameters.v1( + Seq( + TLManagerParameters( + address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode + resources = device.reg, + regionType = RegionType.UNCACHED, + executable = true, + supportsArithmetic = TransferSizes(1, beatBytes), + supportsLogical = TransferSizes(1, beatBytes), + supportsGet = TransferSizes(1, beatBytes), + supportsPutFull = TransferSizes(1, beatBytes), + supportsPutPartial = TransferSizes(1, beatBytes), + supportsHint = TransferSizes(1, beatBytes), + fifoId = Some(0) + ) + ), + beatBytes + ) + ) + ) } val dut = LazyModule(new CoalescingUnit(testConfig)) @@ -80,83 +99,85 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "multi- and mono-coalescers" + implicit val p: Parameters = Parameters.empty + + val tb = LazyModule(new DummyCoalescingUnitTB()) + // val outer = LazyModule(new CoalescingUnit(testConfig)) + + val coal = tb.dut + tb.cpuNodes.foreach(coal.node := _) + tb.l2Nodes.foreach(_ := coal.node) + + def pokeA( + nodes: Seq[TLBundle], + idx: Int, + op: Int, + size: Int, + source: Int, + addr: Int, + mask: Int, + data: Int + ): Unit = { + val node = nodes(idx) +// node.a.ready.expect(true.B) // FIXME: this fails currently + node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get) + node.a.bits.param.poke(0.U) + node.a.bits.size.poke(size.U) + node.a.bits.source.poke(source.U) + node.a.bits.address.poke(addr.U) + node.a.bits.mask.poke(mask.U) + node.a.bits.data.poke(data.U) + node.a.bits.corrupt.poke(false.B) + node.a.valid.poke(true.B) + } + + def unsetA(nodes: Seq[TLBundle]): Unit = { + nodes.foreach { node => + node.a.valid.poke(false.B) + } + } + it should "coalesce fully consecutive accesses at size 4, only once" in { - implicit val p: Parameters = Parameters.empty - - val tb = LazyModule(new DummyCoalescingUnitTB()) -// val outer = LazyModule(new CoalescingUnit(testConfig)) - - val coal = tb.dut - tb.cpuNodes.foreach(coal.node := _) - tb.l2Nodes.foreach(_ := coal.node) - - test(tb.module).withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => + test(tb.module) + // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) + { c => + println(s"coalIO length = ${c.coalIOs(0).length}") val nodes = c.coalIOs.map(_.head) // val nodes = c.cpuNodesImp.map(_.out.head._1) // val nodes = c.coal.node.in.map(_._1) // val nodes = c.mitmNodesImp.map(_.in.head._1) - def pokeA(nodes: Seq[TLBundle], idx: Int, op: Int, size: Int, source: Int, addr: Int, mask: Int, data: Int): Unit = { - val node = nodes(idx) -// node.a.ready.expect(true.B) // FIXME: this fails currently - node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get) - node.a.bits.param.poke(0.U) - node.a.bits.size.poke(size.U) - node.a.bits.source.poke(source.U) - node.a.bits.address.poke(addr.U) - node.a.bits.mask.poke(mask.U) - node.a.bits.data.poke(data.U) - node.a.bits.corrupt.poke(false.B) - node.a.valid.poke(true.B) - } - - def unsetA(): Unit = { - nodes.foreach { node => - node.a.valid.poke(false.B) - } - } - // always ready to take coalesced requests // c.coalMasterNode.head.a.ready.poke(true.B) // c.coal.module.coalescer.io.outReq.ready.poke(true.B) - pokeA(nodes, idx=0, op=1, size=2, source=0, addr=0x10, mask=0xf, data=0x1111) - pokeA(nodes, idx=1, op=1, size=2, source=0, addr=0x14, mask=0xf, data=0x2222) - pokeA(nodes, idx=2, op=1, size=2, source=0, addr=0x18, mask=0xf, data=0x3333) - pokeA(nodes, idx=3, op=1, size=2, source=0, addr=0x1c, mask=0xf, data=0x4444) + pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111) + pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222) + pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333) + pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444) c.clock.step() - unsetA() + unsetA(nodes) c.clock.step() c.clock.step() } } - it should "coalesce strided accesses at size 6" in { + it should "coalesce identical addresses (stride of 0)" in {} - } + it should "coalesce strided accesses at size 6" in {} - it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in { + it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in {} - } + it should "not touch uncoalescable requests" in {} - it should "not touch uncoalescable requests" in { + it should "allow temporal coalescing when depth >=2" in {} - } + it should "select the most coverage mono-coalescer" in {} - it should "allow temporal coalescing when depth >=2" in { - - } - - it should "select the most coverage mono-coalescer" in { - - } - - it should "resort to the backup policy when coverage is below average" in { - - } + it should "resort to the backup policy when coverage is below average" in {} } class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { @@ -381,22 +402,23 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } -object testConfig extends CoalescerConfig( - maxSize = 5, - queueDepth = 2, - waitTimeout = 8, - addressWidth = 24, - dataBusWidth = 5, - numLanes = 4, - // watermark = 2, - wordSizeInBytes = 4, - wordWidth = 2, - numOldSrcIds = 16, - numNewSrcIds = 4, - respQueueDepth = 4, - coalSizes = Seq(4, 5), - sizeEnum = DefaultInFlightTableSizeEnum -) +object testConfig + extends CoalescerConfig( + maxSize = 5, + queueDepth = 2, + waitTimeout = 8, + addressWidth = 24, + dataBusWidth = 5, + numLanes = 4, + // watermark = 2, + wordSizeInBytes = 4, + wordWidth = 2, + numOldSrcIds = 16, + numNewSrcIds = 4, + respQueueDepth = 4, + coalLogSizes = Seq(4, 5), + sizeEnum = DefaultInFlightTableSizeEnum + ) class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "uncoalescer"