From 7780250c7a34d6c4ff1091d4e842a1e38c4b2831 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 19:17:07 -0700 Subject: [PATCH 01/17] More comments & renames --- src/main/scala/tilelink/Coalescing.scala | 101 +++++---- .../scala/coalescing/CoalescingUnitTest.scala | 192 ++++++++++-------- 2 files changed, 171 insertions(+), 122 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 71b108c..deeb403 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -36,22 +36,22 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum { } case class CoalescerConfig( - numLanes: Int, // number of lanes (or threads) in a warp - maxSize: Int, // maximum burst size (64 bytes) - queueDepth: Int, // request window per lane - waitTimeout: Int, // max cycles to wait before forced fifo dequeue, per lane - addressWidth: Int, // assume <= 32 - dataBusWidth: Int, // memory-side downstream TileLink data bus size - // this has to be at least larger than the word size for - // the coalescer to perform well - // watermark = 2, // minimum buffer occupancy to start coalescing - wordSizeInBytes: Int, // 32-bit system - wordWidth: Int, // log(WORD_SIZE) - numOldSrcIds: Int, // num of outstanding requests per lane, from processor - numNewSrcIds: Int, // num of outstanding coalesced requests - respQueueDepth: Int, // depth of the response fifo queues - coalSizes: Seq[Int], // list of coalescer sizes to try in the MonoCoalescers - // must be power of 2's + numLanes: Int, // number of lanes (or threads) in a warp + maxSize: Int, // maximum burst size (64 bytes) + queueDepth: Int, // request window per lane + waitTimeout: Int, // max cycles to wait before forced fifo dequeue, per lane + addressWidth: Int, // assume <= 32 + dataBusWidth: Int, // memory-side downstream TileLink data bus size + // this has to be at least larger than the word size for + // the coalescer 
to perform well + // watermark = 2, // minimum buffer occupancy to start coalescing + wordSizeInBytes: Int, // 32-bit system + wordWidth: Int, // log(WORD_SIZE) + numOldSrcIds: Int, // num of outstanding requests per lane, from processor + numNewSrcIds: Int, // num of outstanding coalesced requests + respQueueDepth: Int, // depth of the response fifo queues + coalLogSizes: Seq[Int], // list of coalescer sizes to try in the MonoCoalescers + // each size is log(byteSize) sizeEnum: InFlightTableSizeEnum ) @@ -69,7 +69,7 @@ object defaultConfig extends CoalescerConfig( numOldSrcIds = 16, numNewSrcIds = 4, respQueueDepth = 4, - coalSizes = Seq(3), + coalLogSizes = Seq(3), sizeEnum = DefaultInFlightTableSizeEnum ) @@ -243,7 +243,7 @@ class CoalShiftQueue[T <: Data]( } // Software model: coalescer.py -class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], +class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], config: CoalescerConfig) extends Module { val io = IO(new Bundle { val window = Input(Vec(config.numLanes, windowT.io.cloneType)) @@ -251,6 +251,8 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W)) val baseAddr = Output(UInt(config.addressWidth.W)) val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W))) + // number of entries matched with this leader lane's head. 
+ // maximum is numLanes * queueDepth val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth).W)) val coverageHits = Output(UInt((1 << config.maxSize).W)) }) @@ -284,7 +286,7 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], printQueueHeads } - val size = coalSize + val size = coalLogSize val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = { (req0.op === req1.op) && @@ -323,6 +325,18 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], })(chosenLeaderIdx) val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx) + // coverage calculation + def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth) + // 2-D table flattened to 1-D + val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address))) + val valids = io.window.map(_.mask).flatMap(_.asBools) + val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target => + // count if any of the queue entries accesses the given offset word of the + // coalesced chunk; if 1 for all offsets, we've reached 100% utilization + // of the coalesced data words + (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _) + } + // debug prints when (leadersValid.reduce(_ || _)) { matchCounts.zipWithIndex.foreach { case (count, i) => @@ -334,14 +348,12 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], printf("%d ", m) } printf("]\n") - } - // coverage calculation - def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth) - val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address))) - val valids = io.window.map(_.mask).flatMap(_.asBools) - val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target => - (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _) + 
printf("hits = [ ") + hits.foreach { m => + printf("%d ", m) + } + printf("]\n") } io.results.leaderIdx := chosenLeaderIdx @@ -356,16 +368,19 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE config: CoalescerConfig) extends Module { val io = IO(new Bundle { + // coalescing window, connected to the contents of the request queues val window = Input(Vec(config.numLanes, windowT.io.cloneType)) + // newly generated coalesced request val outReq = DecoupledIO(coalReqT.cloneType) + // invalidate signals going into each request queue's head val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))) }) - val coalescers = config.coalSizes.map(size => Module(new MonoCoalescer(size, windowT, config))) + val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config))) coalescers.foreach(_.io.window := io.window) - def normalize(x: Seq[UInt]): Seq[UInt] = { - x.zip(config.coalSizes).map { case (hits, size) => + def normalize(valPerSize: Seq[UInt]): Seq[UInt] = { + (valPerSize zip config.coalLogSizes).map { case (hits, size) => (hits << (config.maxSize - size).U).asUInt } } @@ -378,27 +393,34 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE }._2 } + // normalize to maximum coalescing size so that we can do fair comparisons + // between coalescing results of different sizes val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount)) val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits)) - val chosenIdx = Wire(UInt(log2Ceil(config.coalSizes.size).W)) + val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W)) val chosenValid = Wire(Bool()) // minimum 25% coverage - val minCoverage = 1.max(1 << (config.maxSize - 4)) + val minCoverage = 1.max(1 << ((config.maxSize - 2) - 2)) + printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount) + printf("normalizedMatches[0]=%d\n", normalizedMatches(0)) + 
printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits) + printf("normalizedHits[0]=%d\n", normalizedHits(0)) + printf("minCoverage=%d\n", minCoverage.U) when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) { - chosenIdx := argMax(normalizedHits) + chosenSizeIdx := argMax(normalizedHits) chosenValid := true.B }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) { - chosenIdx := argMax(normalizedMatches) + chosenSizeIdx := argMax(normalizedMatches) chosenValid := true.B }.otherwise { - chosenIdx := DontCare + chosenSizeIdx := DontCare chosenValid := false.B } // create coalesced request - val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenIdx) - val chosenSize = VecInit(coalescers.map(_.size.U))(chosenIdx) + val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx) + val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx) // flatten requests and matches val flatReqs = io.window.flatMap(_.elts) @@ -437,13 +459,18 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE val sourceGen = Module(new ReqSourceGen(log2Ceil(config.numNewSrcIds))) sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created + val coalesceValid = chosenValid && sourceGen.io.id.valid + when (coalesceValid) { + printf("coalescing success!\n") + } + io.outReq.bits.source := sourceGen.io.id.bits io.outReq.bits.mask := mask.asUInt io.outReq.bits.data := data.asUInt io.outReq.bits.size := chosenSize io.outReq.bits.address := chosenBundle.baseAddr io.outReq.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op - io.outReq.valid := chosenValid && sourceGen.io.id.valid + io.outReq.valid := coalesceValid io.invalidate.bits := chosenBundle.matchOH io.invalidate.valid := io.outReq.fire // invalidate only when fire diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 56afcd1..20fa053 100644 --- 
a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -35,28 +35,47 @@ class MultiPortQueueUnitTest extends AnyFlatSpec with ChiselScalatestTester { class DummyCoalescingUnitTB(implicit p: Parameters) extends LazyModule { val cpuNodes = Seq.tabulate(testConfig.numLanes) { _ => - TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters( - name = "processor-nodes", - sourceId = IdRange(0, testConfig.numOldSrcIds), - requestFifo = true, - visibility = Seq(AddressSet(0x0, 0xffffff))))))) // 24 bit address space (TODO probably use testConfig) + TLClientNode( + Seq( + TLMasterPortParameters.v1( + Seq( + TLClientParameters( + name = "processor-nodes", + sourceId = IdRange(0, testConfig.numOldSrcIds), + requestFifo = true, + visibility = Seq(AddressSet(0x0, 0xffffff)) + ) + ) + ) + ) + ) // 24 bit address space (TODO probably use testConfig) } val device = new SimpleDevice("dummy", Seq("dummy")) val beatBytes = 1 << testConfig.dataBusWidth // 256 bit bus val l2Nodes = Seq.tabulate(5) { _ => - TLManagerNode(Seq(TLSlavePortParameters.v1(Seq(TLManagerParameters( - address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode - resources = device.reg, - regionType = RegionType.UNCACHED, - executable = true, - supportsArithmetic = TransferSizes(1, beatBytes), - supportsLogical = TransferSizes(1, beatBytes), - supportsGet = TransferSizes(1, beatBytes), - supportsPutFull = TransferSizes(1, beatBytes), - supportsPutPartial = TransferSizes(1, beatBytes), - supportsHint = TransferSizes(1, beatBytes), - fifoId = Some(0))), beatBytes))) + TLManagerNode( + Seq( + TLSlavePortParameters.v1( + Seq( + TLManagerParameters( + address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode + resources = device.reg, + regionType = RegionType.UNCACHED, + executable = true, + supportsArithmetic = TransferSizes(1, beatBytes), + supportsLogical = TransferSizes(1, beatBytes), + supportsGet = TransferSizes(1, 
beatBytes), + supportsPutFull = TransferSizes(1, beatBytes), + supportsPutPartial = TransferSizes(1, beatBytes), + supportsHint = TransferSizes(1, beatBytes), + fifoId = Some(0) + ) + ), + beatBytes + ) + ) + ) } val dut = LazyModule(new CoalescingUnit(testConfig)) @@ -80,83 +99,85 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "multi- and mono-coalescers" + implicit val p: Parameters = Parameters.empty + + val tb = LazyModule(new DummyCoalescingUnitTB()) + // val outer = LazyModule(new CoalescingUnit(testConfig)) + + val coal = tb.dut + tb.cpuNodes.foreach(coal.node := _) + tb.l2Nodes.foreach(_ := coal.node) + + def pokeA( + nodes: Seq[TLBundle], + idx: Int, + op: Int, + size: Int, + source: Int, + addr: Int, + mask: Int, + data: Int + ): Unit = { + val node = nodes(idx) +// node.a.ready.expect(true.B) // FIXME: this fails currently + node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get) + node.a.bits.param.poke(0.U) + node.a.bits.size.poke(size.U) + node.a.bits.source.poke(source.U) + node.a.bits.address.poke(addr.U) + node.a.bits.mask.poke(mask.U) + node.a.bits.data.poke(data.U) + node.a.bits.corrupt.poke(false.B) + node.a.valid.poke(true.B) + } + + def unsetA(nodes: Seq[TLBundle]): Unit = { + nodes.foreach { node => + node.a.valid.poke(false.B) + } + } + it should "coalesce fully consecutive accesses at size 4, only once" in { - implicit val p: Parameters = Parameters.empty - - val tb = LazyModule(new DummyCoalescingUnitTB()) -// val outer = LazyModule(new CoalescingUnit(testConfig)) - - val coal = tb.dut - tb.cpuNodes.foreach(coal.node := _) - tb.l2Nodes.foreach(_ := coal.node) - - test(tb.module).withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => + test(tb.module) + // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) + { c => + println(s"coalIO length = ${c.coalIOs(0).length}") 
val nodes = c.coalIOs.map(_.head) // val nodes = c.cpuNodesImp.map(_.out.head._1) // val nodes = c.coal.node.in.map(_._1) // val nodes = c.mitmNodesImp.map(_.in.head._1) - def pokeA(nodes: Seq[TLBundle], idx: Int, op: Int, size: Int, source: Int, addr: Int, mask: Int, data: Int): Unit = { - val node = nodes(idx) -// node.a.ready.expect(true.B) // FIXME: this fails currently - node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get) - node.a.bits.param.poke(0.U) - node.a.bits.size.poke(size.U) - node.a.bits.source.poke(source.U) - node.a.bits.address.poke(addr.U) - node.a.bits.mask.poke(mask.U) - node.a.bits.data.poke(data.U) - node.a.bits.corrupt.poke(false.B) - node.a.valid.poke(true.B) - } - - def unsetA(): Unit = { - nodes.foreach { node => - node.a.valid.poke(false.B) - } - } - // always ready to take coalesced requests // c.coalMasterNode.head.a.ready.poke(true.B) // c.coal.module.coalescer.io.outReq.ready.poke(true.B) - pokeA(nodes, idx=0, op=1, size=2, source=0, addr=0x10, mask=0xf, data=0x1111) - pokeA(nodes, idx=1, op=1, size=2, source=0, addr=0x14, mask=0xf, data=0x2222) - pokeA(nodes, idx=2, op=1, size=2, source=0, addr=0x18, mask=0xf, data=0x3333) - pokeA(nodes, idx=3, op=1, size=2, source=0, addr=0x1c, mask=0xf, data=0x4444) + pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111) + pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222) + pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333) + pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444) c.clock.step() - unsetA() + unsetA(nodes) c.clock.step() c.clock.step() } } - it should "coalesce strided accesses at size 6" in { + it should "coalesce identical addresses (stride of 0)" in {} - } + it should "coalesce strided accesses at size 6" in {} - it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in { + 
it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in {} - } + it should "not touch uncoalescable requests" in {} - it should "not touch uncoalescable requests" in { + it should "allow temporal coalescing when depth >=2" in {} - } + it should "select the most coverage mono-coalescer" in {} - it should "allow temporal coalescing when depth >=2" in { - - } - - it should "select the most coverage mono-coalescer" in { - - } - - it should "resort to the backup policy when coverage is below average" in { - - } + it should "resort to the backup policy when coverage is below average" in {} } class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { @@ -381,22 +402,23 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } -object testConfig extends CoalescerConfig( - maxSize = 5, - queueDepth = 2, - waitTimeout = 8, - addressWidth = 24, - dataBusWidth = 5, - numLanes = 4, - // watermark = 2, - wordSizeInBytes = 4, - wordWidth = 2, - numOldSrcIds = 16, - numNewSrcIds = 4, - respQueueDepth = 4, - coalSizes = Seq(4, 5), - sizeEnum = DefaultInFlightTableSizeEnum -) +object testConfig + extends CoalescerConfig( + maxSize = 5, + queueDepth = 2, + waitTimeout = 8, + addressWidth = 24, + dataBusWidth = 5, + numLanes = 4, + // watermark = 2, + wordSizeInBytes = 4, + wordWidth = 2, + numOldSrcIds = 16, + numNewSrcIds = 4, + respQueueDepth = 4, + coalLogSizes = Seq(4, 5), + sizeEnum = DefaultInFlightTableSizeEnum + ) class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "uncoalescer" From 84ac332637583a447bd2c6828604fb0a9d0fdf63 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 19:51:17 -0700 Subject: [PATCH 02/17] Add chiseltest for stride = 0 --- .../scala/coalescing/CoalescingUnitTest.scala | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala 
b/src/test/scala/coalescing/CoalescingUnitTest.scala index 20fa053..c31e97c 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -101,12 +101,15 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { implicit val p: Parameters = Parameters.empty - val tb = LazyModule(new DummyCoalescingUnitTB()) - // val outer = LazyModule(new CoalescingUnit(testConfig)) + def makeTb() = { + val tb = LazyModule(new DummyCoalescingUnitTB()) + // val outer = LazyModule(new CoalescingUnit(testConfig)) - val coal = tb.dut - tb.cpuNodes.foreach(coal.node := _) - tb.l2Nodes.foreach(_ := coal.node) + val coal = tb.dut + tb.cpuNodes.foreach(coal.node := _) + tb.l2Nodes.foreach(_ := coal.node) + tb + } def pokeA( nodes: Seq[TLBundle], @@ -138,8 +141,8 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } it should "coalesce fully consecutive accesses at size 4, only once" in { - test(tb.module) - // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) + test(makeTb().module) + .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => println(s"coalIO length = ${c.coalIOs(0).length}") val nodes = c.coalIOs.map(_.head) @@ -165,7 +168,28 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } } - it should "coalesce identical addresses (stride of 0)" in {} + it should "coalesce identical addresses (stride of 0)" in { + test(makeTb().module) + .withAnnotations(Seq(VcsBackendAnnotation)) + { c => + println(s"coalIO length = ${c.coalIOs(0).length}") + val nodes = c.coalIOs.map(_.head) + + pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x1111) + pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x2222) + pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333) + pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x18, mask = 
0xf, data = 0x4444) + + c.clock.step() + + unsetA(nodes) + + c.clock.step() + c.clock.step() + + nodes(0).a.ready.expect(true.B) + } + } it should "coalesce strided accesses at size 6" in {} From ba2bc3020b5189746e5ae5671034b78396f743d3 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 20:24:15 -0700 Subject: [PATCH 03/17] Fix truncation bug in matchCount --- src/main/scala/tilelink/Coalescing.scala | 18 +++-- .../scala/coalescing/CoalescingUnitTest.scala | 81 +++++++++---------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index deeb403..cf8c330 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -253,7 +253,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W))) // number of entries matched with this leader lane's head. // maximum is numLanes * queueDepth - val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth).W)) + val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W)) val coverageHits = Output(UInt((1 << config.maxSize).W)) }) }) @@ -307,7 +307,9 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], } // TODO: potentially expensive: popcount & adder - val matchCounts = matchTablePerLane.map(leader => leader.map(PopCount(_)).reduce(_ +& _)) + val matchCounts = matchTablePerLane.map(table => + table.map(PopCount(_)) // sum up each column + .reduce(_ +& _)) val canCoalesce = matchCounts.map(_ > 1.U) // TODO: potentially expensive @@ -330,10 +332,10 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // 2-D table flattened to 1-D val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address))) val valids = io.window.map(_.mask).flatMap(_.asBools) + // indicates whether each 
word in the coalesced chunk is accessed by any of the + // queue entries. e.g. if [ 1 1 1 1 ], all of the four words in the coalesced + // data has been accessed and we've reached 100% utilization. val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target => - // count if any of the queue entries accesses the given offset word of the - // coalesced chunk; if 1 for all offsets, we've reached 100% utilization - // of the coalesced data words (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _) } @@ -348,6 +350,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], printf("%d ", m) } printf("]\n") + printf("chosenMatchCount = %d\n", chosenMatchCount) printf("hits = [ ") hits.foreach { m => @@ -410,9 +413,11 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) { chosenSizeIdx := argMax(normalizedHits) chosenValid := true.B + printf("coalescing success by coverage policy\n") }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) { chosenSizeIdx := argMax(normalizedMatches) chosenValid := true.B + printf("coalescing success by matches policy\n") }.otherwise { chosenSizeIdx := DontCare chosenValid := false.B @@ -460,9 +465,6 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created val coalesceValid = chosenValid && sourceGen.io.id.valid - when (coalesceValid) { - printf("coalescing success!\n") - } io.outReq.bits.source := sourceGen.io.id.bits io.outReq.bits.mask := mask.asUInt diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index c31e97c..0e79ef1 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -96,6 +96,23 @@ class DummyCoalescingUnitTBImp(outer: 
DummyCoalescingUnitTB) extends LazyModuleI // val coalMasterNode = coal.coalescerNode.makeIOs() } +object testConfig extends CoalescerConfig( + numLanes = 4, + maxSize = 3, + queueDepth = 1, + waitTimeout = 8, + addressWidth = 24, + dataBusWidth = 5, + // watermark = 2, + wordSizeInBytes = 4, + wordWidth = 2, + numOldSrcIds = 16, + numNewSrcIds = 4, + respQueueDepth = 4, + coalLogSizes = Seq(3), + sizeEnum = DefaultInFlightTableSizeEnum +) + class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "multi- and mono-coalescers" @@ -140,33 +157,33 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } } - it should "coalesce fully consecutive accesses at size 4, only once" in { - test(makeTb().module) - .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) - { c => - println(s"coalIO length = ${c.coalIOs(0).length}") - val nodes = c.coalIOs.map(_.head) -// val nodes = c.cpuNodesImp.map(_.out.head._1) -// val nodes = c.coal.node.in.map(_._1) -// val nodes = c.mitmNodesImp.map(_.in.head._1) + // it should "coalesce fully consecutive accesses at size 4, only once" in { + // test(makeTb().module) + // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) + // { c => + // println(s"coalIO length = ${c.coalIOs(0).length}") + // val nodes = c.coalIOs.map(_.head) +// // val nodes = c.cpuNodesImp.map(_.out.head._1) +// // val nodes = c.coal.node.in.map(_._1) +// // val nodes = c.mitmNodesImp.map(_.in.head._1) - // always ready to take coalesced requests -// c.coalMasterNode.head.a.ready.poke(true.B) -// c.coal.module.coalescer.io.outReq.ready.poke(true.B) + // // always ready to take coalesced requests +// // c.coalMasterNode.head.a.ready.poke(true.B) +// // c.coal.module.coalescer.io.outReq.ready.poke(true.B) - pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111) - pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222) - 
pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333) - pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444) + // pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111) + // pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222) + // pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333) + // pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444) - c.clock.step() + // c.clock.step() - unsetA(nodes) + // unsetA(nodes) - c.clock.step() - c.clock.step() - } - } + // c.clock.step() + // c.clock.step() + // } + // } it should "coalesce identical addresses (stride of 0)" in { test(makeTb().module) @@ -186,8 +203,6 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { c.clock.step() c.clock.step() - - nodes(0).a.ready.expect(true.B) } } @@ -426,24 +441,6 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } -object testConfig - extends CoalescerConfig( - maxSize = 5, - queueDepth = 2, - waitTimeout = 8, - addressWidth = 24, - dataBusWidth = 5, - numLanes = 4, - // watermark = 2, - wordSizeInBytes = 4, - wordWidth = 2, - numOldSrcIds = 16, - numNewSrcIds = 4, - respQueueDepth = 4, - coalLogSizes = Seq(4, 5), - sizeEnum = DefaultInFlightTableSizeEnum - ) - class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "uncoalescer" val numLanes = 4 From 699520073ec57cfa78088a73427de329f31700ad Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 20:34:20 -0700 Subject: [PATCH 04/17] Make maxSize accessor function --- src/main/scala/tilelink/Coalescing.scala | 53 ++++++++++--------- .../scala/coalescing/CoalescingUnitTest.scala | 1 - 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala 
b/src/main/scala/tilelink/Coalescing.scala index cf8c330..4eb530c 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -37,7 +37,6 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum { case class CoalescerConfig( numLanes: Int, // number of lanes (or threads) in a warp - maxSize: Int, // maximum burst size (64 bytes) queueDepth: Int, // request window per lane waitTimeout: Int, // max cycles to wait before forced fifo dequeue, per lane addressWidth: Int, // assume <= 32 @@ -52,13 +51,15 @@ case class CoalescerConfig( respQueueDepth: Int, // depth of the response fifo queues coalLogSizes: Seq[Int], // list of coalescer sizes to try in the MonoCoalescers // each size is log(byteSize) - sizeEnum: InFlightTableSizeEnum -) + sizeEnum: InFlightTableSizeEnum, +) { + // maximum coalesced size + def maxCoalLogSize(): Int = coalLogSizes.max +} object defaultConfig extends CoalescerConfig( numLanes = 4, // TODO: bigger size - maxSize = 3, queueDepth = 1, waitTimeout = 8, addressWidth = 24, @@ -254,7 +255,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // number of entries matched with this leader lane's head. 
// maximum is numLanes * queueDepth val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W)) - val coverageHits = Output(UInt((1 << config.maxSize).W)) + val coverageHits = Output(UInt((1 << config.maxCoalLogSize()).W)) }) }) @@ -384,7 +385,7 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE def normalize(valPerSize: Seq[UInt]): Seq[UInt] = { (valPerSize zip config.coalLogSizes).map { case (hits, size) => - (hits << (config.maxSize - size).U).asUInt + (hits << (config.maxCoalLogSize() - size).U).asUInt } } @@ -404,12 +405,8 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W)) val chosenValid = Wire(Bool()) // minimum 25% coverage - val minCoverage = 1.max(1 << ((config.maxSize - 2) - 2)) - printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount) - printf("normalizedMatches[0]=%d\n", normalizedMatches(0)) - printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits) - printf("normalizedHits[0]=%d\n", normalizedHits(0)) - printf("minCoverage=%d\n", minCoverage.U) + val minCoverage = 1.max(1 << ((config.maxCoalLogSize() - 2) - 2)) + when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) { chosenSizeIdx := argMax(normalizedHits) chosenValid := true.B @@ -423,6 +420,14 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE chosenValid := false.B } + def debugPolicyPrint() = { + printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount) + printf("normalizedMatches[0]=%d\n", normalizedMatches(0)) + printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits) + printf("normalizedHits[0]=%d\n", normalizedHits(0)) + printf("minCoverage=%d\n", minCoverage.U) + } + // create coalesced request val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx) val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx) @@ -438,8 +443,8 @@ 
class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE // note: this is word-level coalescing. if finer granularity is needed, need to modify code val numWords = (1.U << (chosenSize - config.wordWidth.U)).asUInt - val maxWords = 1 << (config.maxSize - config.wordWidth) - val addrMask = Wire(UInt(config.maxSize.W)) + val maxWords = 1 << (config.maxCoalLogSize() - config.wordWidth) + val addrMask = Wire(UInt(config.maxCoalLogSize().W)) addrMask := (1.U << chosenSize).asUInt - 1.U val data = Wire(Vec(maxWords, UInt((config.wordSizeInBytes * 8).W))) @@ -447,7 +452,7 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE for (i <- 0 until maxWords) { val sel = flatReqs.zip(flatMatches).map { case (req, m) => - m && ((req.address(config.maxSize - 1, 0) & addrMask) === i.U) + m && ((req.address(config.maxCoalLogSize() - 1, 0) & addrMask) === i.U) } // TODO: SW uses priority encoder, not sure about behavior of MuxCase data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) => @@ -500,7 +505,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth)) } - val coalReqT = new ReqQueueEntry(sourceWidth, log2Ceil(config.maxSize), config.addressWidth, config.maxSize) + val coalReqT = new ReqQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize()), config.addressWidth, config.maxCoalLogSize()) val coalescer = Module(new MultiCoalescer(reqQueues.head, coalReqT, config)) coalescer.io.window := reqQueues.map(_.io) @@ -570,7 +575,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // coalesced request. Upper bound is min(DEPTH, 2**sourceWidth). 
val numPerLaneReqs = config.queueDepth - val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxSize), config.maxSize) + val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize()), config.maxCoalLogSize()) val respQueues = Seq.tabulate(config.numLanes) { _ => Module( new MultiPortQueue( @@ -690,11 +695,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends newEntry.source := coalescer.io.outReq.bits.source // TODO: richard to write table fill logic - // FIXME: this assertion used to say 1 << config.MAX_SIZE - // I changed this to say DATA BUS SIZE. We need another assertion - // to assert that MAX_SIZE is <= DATA_BUS_SIZE because we do not support - // multi-beat writes currently - assert( + assert (config.maxCoalLogSize() <= config.dataBusWidth, + "multi-beat coalesced reads/writes are currently not supported") + assert ( tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8, s"tlCoal param dataBits (${tlCoal.params.dataBits}) mismatch coalescer constant" + s" (${(1 << config.dataBusWidth) * 8})" @@ -705,7 +708,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // TODO: this part needs the actual coalescing logic to work r.valid := false.B r.source := origReqs(i).source - r.offset := (origReqs(i).address % (1 << config.maxSize).U) >> config.wordWidth + r.offset := (origReqs(i).address % (1 << config.maxCoalLogSize()).U) >> config.wordWidth r.sizeEnum := config.sizeEnum.logSizeToEnum(origReqs(i).size) } } @@ -759,7 +762,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // FIXME: overlaps with RespQueueEntry. 
Trait-ify class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle { val source = UInt(log2Ceil(config.numNewSrcIds).W) - val data = UInt((8 * (1 << config.maxSize)).W) + val data = UInt((8 * (1 << config.maxCoalLogSize())).W) } class UncoalescingUnit(config: CoalescerConfig) extends Module { @@ -868,7 +871,7 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module { config.numLanes, config.queueDepth, log2Ceil(config.numOldSrcIds), - config.maxSize, + config.maxCoalLogSize(), config.sizeEnum ) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 0e79ef1..799a164 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -98,7 +98,6 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI object testConfig extends CoalescerConfig( numLanes = 4, - maxSize = 3, queueDepth = 1, waitTimeout = 8, addressWidth = 24, From dfdd4f8342502b5b550bf4e149d001432bf0cf65 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 20:36:04 -0700 Subject: [PATCH 05/17] Redundant call syntax --- src/main/scala/tilelink/Coalescing.scala | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4eb530c..110cc04 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -54,7 +54,7 @@ case class CoalescerConfig( sizeEnum: InFlightTableSizeEnum, ) { // maximum coalesced size - def maxCoalLogSize(): Int = coalLogSizes.max + def maxCoalLogSize: Int = coalLogSizes.max } object defaultConfig extends CoalescerConfig( @@ -255,7 +255,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // number of entries matched with this leader lane's head. 
// maximum is numLanes * queueDepth val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth + 1).W)) - val coverageHits = Output(UInt((1 << config.maxCoalLogSize()).W)) + val coverageHits = Output(UInt((1 << config.maxCoalLogSize).W)) }) }) @@ -385,7 +385,7 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE def normalize(valPerSize: Seq[UInt]): Seq[UInt] = { (valPerSize zip config.coalLogSizes).map { case (hits, size) => - (hits << (config.maxCoalLogSize() - size).U).asUInt + (hits << (config.maxCoalLogSize - size).U).asUInt } } @@ -405,7 +405,7 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W)) val chosenValid = Wire(Bool()) // minimum 25% coverage - val minCoverage = 1.max(1 << ((config.maxCoalLogSize() - 2) - 2)) + val minCoverage = 1.max(1 << ((config.maxCoalLogSize - 2) - 2)) when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) { chosenSizeIdx := argMax(normalizedHits) @@ -443,8 +443,8 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE // note: this is word-level coalescing. 
if finer granularity is needed, need to modify code val numWords = (1.U << (chosenSize - config.wordWidth.U)).asUInt - val maxWords = 1 << (config.maxCoalLogSize() - config.wordWidth) - val addrMask = Wire(UInt(config.maxCoalLogSize().W)) + val maxWords = 1 << (config.maxCoalLogSize - config.wordWidth) + val addrMask = Wire(UInt(config.maxCoalLogSize.W)) addrMask := (1.U << chosenSize).asUInt - 1.U val data = Wire(Vec(maxWords, UInt((config.wordSizeInBytes * 8).W))) @@ -452,7 +452,7 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE for (i <- 0 until maxWords) { val sel = flatReqs.zip(flatMatches).map { case (req, m) => - m && ((req.address(config.maxCoalLogSize() - 1, 0) & addrMask) === i.U) + m && ((req.address(config.maxCoalLogSize - 1, 0) & addrMask) === i.U) } // TODO: SW uses priority encoder, not sure about behavior of MuxCase data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) => @@ -505,7 +505,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends Module(new CoalShiftQueue(reqQueueEntryT, config.queueDepth)) } - val coalReqT = new ReqQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize()), config.addressWidth, config.maxCoalLogSize()) + val coalReqT = new ReqQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize), config.addressWidth, config.maxCoalLogSize) val coalescer = Module(new MultiCoalescer(reqQueues.head, coalReqT, config)) coalescer.io.window := reqQueues.map(_.io) @@ -575,7 +575,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // coalesced request. Upper bound is min(DEPTH, 2**sourceWidth). 
val numPerLaneReqs = config.queueDepth - val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize()), config.maxCoalLogSize()) + val respQueueEntryT = new RespQueueEntry(sourceWidth, log2Ceil(config.maxCoalLogSize), config.maxCoalLogSize) val respQueues = Seq.tabulate(config.numLanes) { _ => Module( new MultiPortQueue( @@ -695,7 +695,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends newEntry.source := coalescer.io.outReq.bits.source // TODO: richard to write table fill logic - assert (config.maxCoalLogSize() <= config.dataBusWidth, + assert (config.maxCoalLogSize <= config.dataBusWidth, "multi-beat coalesced reads/writes are currently not supported") assert ( tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8, @@ -708,7 +708,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // TODO: this part needs the actual coalescing logic to work r.valid := false.B r.source := origReqs(i).source - r.offset := (origReqs(i).address % (1 << config.maxCoalLogSize()).U) >> config.wordWidth + r.offset := (origReqs(i).address % (1 << config.maxCoalLogSize).U) >> config.wordWidth r.sizeEnum := config.sizeEnum.logSizeToEnum(origReqs(i).size) } } @@ -762,7 +762,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // FIXME: overlaps with RespQueueEntry. 
Trait-ify class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle { val source = UInt(log2Ceil(config.numNewSrcIds).W) - val data = UInt((8 * (1 << config.maxCoalLogSize())).W) + val data = UInt((8 * (1 << config.maxCoalLogSize)).W) } class UncoalescingUnit(config: CoalescerConfig) extends Module { @@ -871,7 +871,7 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module { config.numLanes, config.queueDepth, log2Ceil(config.numOldSrcIds), - config.maxCoalLogSize(), + config.maxCoalLogSize, config.sizeEnum ) From 7f821a66f53ad2ca9b281e5b84144b7e491b1d8c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 21:19:56 -0700 Subject: [PATCH 06/17] First attempt at hooking up coalescer to uncoalescer --- src/main/scala/tilelink/Coalescing.scala | 91 +++++++++++++----------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 110cc04..01b0ff0 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -220,11 +220,7 @@ class CoalShiftQueue[T <: Data]( } io.queue.enq.ready := !valid(entries - 1) - // We don't want to invalidate deq.valid response right away even when - // io.invalidate(head) is true. - // Coalescing unit consumes queue head's validity, and produces its new - // validity. Deasserting deq.valid right away will result in a combinational - // cycle. 
+ // TODO: making this validAfterInv(0) might be useful for the arbiter io.queue.deq.valid := valid(0) io.queue.deq.bits := elts.head @@ -370,12 +366,11 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // Software model: coalescer.py class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry, config: CoalescerConfig) extends Module { - val io = IO(new Bundle { // coalescing window, connected to the contents of the request queues val window = Input(Vec(config.numLanes, windowT.io.cloneType)) - // newly generated coalesced request - val outReq = DecoupledIO(coalReqT.cloneType) + // generated coalesced request + val coalReq = DecoupledIO(coalReqT.cloneType) // invalidate signals going into each request queue's head val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W)))) }) @@ -467,20 +462,20 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE } val sourceGen = Module(new ReqSourceGen(log2Ceil(config.numNewSrcIds))) - sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created + sourceGen.io.gen := io.coalReq.fire // use up a source ID only when request is created val coalesceValid = chosenValid && sourceGen.io.id.valid - io.outReq.bits.source := sourceGen.io.id.bits - io.outReq.bits.mask := mask.asUInt - io.outReq.bits.data := data.asUInt - io.outReq.bits.size := chosenSize - io.outReq.bits.address := chosenBundle.baseAddr - io.outReq.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op - io.outReq.valid := coalesceValid + io.coalReq.bits.source := sourceGen.io.id.bits + io.coalReq.bits.mask := mask.asUInt + io.coalReq.bits.data := data.asUInt + io.coalReq.bits.size := chosenSize + io.coalReq.bits.address := chosenBundle.baseAddr + io.coalReq.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op + io.coalReq.valid := coalesceValid io.invalidate.bits := chosenBundle.matchOH - 
io.invalidate.valid := io.outReq.fire // invalidate only when fire + io.invalidate.valid := io.coalReq.fire // invalidate only when fire dontTouch(io.invalidate) // debug @@ -547,6 +542,8 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends reqQueue.io.queue.enq.bits := req // TODO: deq.ready should respect downstream ready reqQueue.io.queue.deq.ready := true.B + // invalidate queue entries that contain original core requests that got + // coalesced into a wider one reqQueue.io.invalidate.bits := coalescer.io.invalidate.bits(lane) reqQueue.io.invalidate.valid := coalescer.io.invalidate.valid @@ -556,9 +553,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends val (tlCoal, edgeCoal) = outer.coalescerNode.out(0) - tlCoal.a.valid := coalescer.io.outReq.valid - tlCoal.a.bits := coalescer.io.outReq.bits.toTLA(edgeCoal) - coalescer.io.outReq.ready := tlCoal.a.ready + tlCoal.a.valid := coalescer.io.coalReq.valid + tlCoal.a.bits := coalescer.io.coalReq.bits.toTLA(edgeCoal) + coalescer.io.coalReq.ready := tlCoal.a.ready tlCoal.b.ready := true.B tlCoal.c.valid := false.B // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below. 
@@ -692,36 +689,50 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends ) println(s"=========== table sourceWidth: ${sourceWidth}") // println(s"=========== table sizeEnumBits: ${newEntry.sizeEnumBits}") - newEntry.source := coalescer.io.outReq.bits.source + newEntry.source := coalescer.io.coalReq.bits.source // TODO: richard to write table fill logic assert (config.maxCoalLogSize <= config.dataBusWidth, "multi-beat coalesced reads/writes are currently not supported") assert ( tlCoal.params.dataBits == (1 << config.dataBusWidth) * 8, - s"tlCoal param dataBits (${tlCoal.params.dataBits}) mismatch coalescer constant" + s"tlCoal param `dataBits` (${tlCoal.params.dataBits}) mismatches coalescer constant" + s" (${(1 << config.dataBusWidth) * 8})" ) - val origReqs = reqQueues.map(q => q.io.queue.deq.bits) - newEntry.lanes.foreach { l => - l.reqs.zipWithIndex.foreach { case (r, i) => - // TODO: this part needs the actual coalescing logic to work - r.valid := false.B - r.source := origReqs(i).source - r.offset := (origReqs(i).address % (1 << config.maxCoalLogSize).U) >> config.wordWidth - r.sizeEnum := config.sizeEnum.logSizeToEnum(origReqs(i).size) + val reqQueueHeads = reqQueues.map(q => q.io.queue.deq.bits) + // newEntry.lanes.foreach { l => + // l.reqs.zipWithIndex.foreach { case (r, i) => + // // TODO: this part needs the actual coalescing logic to work + // r.valid := false.B + // r.source := origReqs(i).source + // r.offset := (origReqs(i).address % (1 << config.maxCoalLogSize).U) >> config.wordWidth + // r.sizeEnum := config.sizeEnum.logSizeToEnum(origReqs(i).size) + // } + // } + // newEntry.lanes(0).reqs(0).valid := true.B + // newEntry.lanes(1).reqs(0).valid := true.B + // newEntry.lanes(2).reqs(0).valid := true.B + // newEntry.lanes(3).reqs(0).valid := true.B + (newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex + .foreach { case ((laneEntry, laneInv), lane) => + (laneEntry.reqs zip laneInv.asBools).foreach { case 
(reqEntry, inv) => + // TODO: this part needs the actual coalescing logic to work + reqEntry.valid := inv + when (inv) { + printf(s"entry for reqQueue(${lane}) got invalidated\n") + } + // FIXME: copying over queue heads out of laziness + reqEntry.source := reqQueueHeads(lane).source + reqEntry.offset := (reqQueueHeads(lane).address % (1 << config.maxCoalLogSize).U) >> config.wordWidth + reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(reqQueueHeads(lane).size) + } } - } - newEntry.lanes(0).reqs(0).valid := true.B - newEntry.lanes(1).reqs(0).valid := true.B - newEntry.lanes(2).reqs(0).valid := true.B - newEntry.lanes(3).reqs(0).valid := true.B dontTouch(newEntry) // Uncoalescer module uncoalesces responses back to each lane val uncoalescer = Module(new UncoalescingUnit(config)) - uncoalescer.io.coalReqValid := coalescer.io.outReq.valid + uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid uncoalescer.io.newEntry := newEntry // Cleanup: custom <>? uncoalescer.io.coalResp.valid := tlCoal.d.valid @@ -730,13 +741,13 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends tlCoal.d.ready := uncoalescer.io.coalResp.ready // Queue up synthesized uncoalesced responses into each lane's response queue - (respQueues zip uncoalescer.io.uncoalResps).foreach { case (q, lanes) => - lanes.zipWithIndex.foreach { case (resp, i) => + (respQueues zip uncoalescer.io.uncoalResps).zipWithIndex.foreach { case ((q, perLaneResps), lane) => + perLaneResps.zipWithIndex.foreach { case (resp, i) => // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream // cache. This should ideally not happen though. 
assert( q.io.enq(respQueueCoalPortOffset + i).ready, - s"respQueue: enq port for 0-th coalesced response is blocked" + s"respQueue: enq port for coalesced response is blocked for lane ${lane}" ) q.io.enq(respQueueCoalPortOffset + i).valid := resp.valid q.io.enq(respQueueCoalPortOffset + i).bits := resp.bits @@ -745,7 +756,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends } // Debug - dontTouch(coalescer.io.outReq) + dontTouch(coalescer.io.coalReq) val coalRespData = tlCoal.d.bits.data dontTouch(coalRespData) From edc05d51e69e7c67fa7bf8dc1a503c594eb35031 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 21:35:26 -0700 Subject: [PATCH 07/17] Fix not respecting invalidate.valid from coalescer --- src/main/scala/tilelink/Coalescing.scala | 25 +++++++++++-------- .../scala/coalescing/CoalescingUnitTest.scala | 4 +-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 01b0ff0..fc87ad2 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -581,6 +581,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // requests that didn't get coalesced, and M is the maximum number of // single-lane requests that can go into a coalesced request. // (`numPerLaneReqs`). + // TODO: potentially expensive, because this generates more FFs. + // Rather than enqueueing all responses in a single cycle, consider + // enqueueing one by one (at the cost of possibly stalling downstream). 1 + numPerLaneReqs, // deq_lanes = 1 because we're serializing all responses to 1 port that // goes back to the core. 
@@ -597,7 +600,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends ) } val respQueueNoncoalPort = 0 - val respQueueCoalPortOffset = 1 + val respQueueUncoalPortOffset = 1 (outer.node.in zip outer.node.out).zipWithIndex.foreach { case (((tlIn, edgeIn), (tlOut, _)), 0) => // TODO: not necessarily 1 master edge @@ -716,9 +719,8 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends (newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex .foreach { case ((laneEntry, laneInv), lane) => (laneEntry.reqs zip laneInv.asBools).foreach { case (reqEntry, inv) => - // TODO: this part needs the actual coalescing logic to work - reqEntry.valid := inv - when (inv) { + reqEntry.valid := (coalescer.io.invalidate.valid && inv) + when ((coalescer.io.invalidate.valid && inv)) { printf(s"entry for reqQueue(${lane}) got invalidated\n") } // FIXME: copying over queue heads out of laziness @@ -730,7 +732,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends dontTouch(newEntry) // Uncoalescer module uncoalesces responses back to each lane - val uncoalescer = Module(new UncoalescingUnit(config)) + val uncoalescer = Module(new Uncoalescer(config)) uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid uncoalescer.io.newEntry := newEntry @@ -746,11 +748,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // TODO: rather than crashing, deassert tlOut.d.ready to stall downtream // cache. This should ideally not happen though. 
assert( - q.io.enq(respQueueCoalPortOffset + i).ready, - s"respQueue: enq port for coalesced response is blocked for lane ${lane}" + q.io.enq(respQueueUncoalPortOffset + i).ready, + s"respQueue: enq port for ${i}-th uncoalesced response is blocked for lane ${lane}" ) - q.io.enq(respQueueCoalPortOffset + i).valid := resp.valid - q.io.enq(respQueueCoalPortOffset + i).bits := resp.bits + q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid + q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits + when (resp.valid) { + printf(s"${i}-th uncoalesced response came back from lane ${lane}\n") + } // dontTouch(q.io.enq(respQueueCoalPortOffset)) } } @@ -776,7 +781,7 @@ class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle { val data = UInt((8 * (1 << config.maxCoalLogSize)).W) } -class UncoalescingUnit(config: CoalescerConfig) extends Module { +class Uncoalescer(config: CoalescerConfig) extends Module { // notes to hansung: // val numLanes: Int, <-> config.NUM_LANES // val numPerLaneReqs: Int, <-> config.DEPTH diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 799a164..3a2e4a5 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -440,7 +440,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } -class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { +class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "uncoalescer" val numLanes = 4 val numPerLaneReqs = 2 @@ -451,7 +451,7 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { val numInflightCoalRequests = 4 it should "work" in { - test(new UncoalescingUnit(testConfig)) + test(new Uncoalescer(testConfig)) // vcs helps with simulation time, but sometimes errors with // "mutation occurred during iteration" java error // 
.withAnnotations(Seq(VcsBackendAnnotation)) From b5b1a7da666ae751a4627e664e69228682eb5d69 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 21:53:36 -0700 Subject: [PATCH 08/17] Add uncoalescer test case for all-lane-same-offset case --- .../scala/coalescing/CoalescingUnitTest.scala | 85 ++++++++++++++++++- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 3a2e4a5..b97e5a5 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -440,6 +440,22 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } +object uncoalescerTestConfig extends CoalescerConfig( + numLanes = 4, + queueDepth = 2, + waitTimeout = 8, + addressWidth = 24, + dataBusWidth = 5, + // watermark = 2, + wordSizeInBytes = 4, + wordWidth = 2, + numOldSrcIds = 16, + numNewSrcIds = 4, + respQueueDepth = 4, + coalLogSizes = Seq(4), + sizeEnum = DefaultInFlightTableSizeEnum +) + class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "uncoalescer" val numLanes = 4 @@ -450,8 +466,8 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { val coalDataWidth = 128 val numInflightCoalRequests = 4 - it should "work" in { - test(new Uncoalescer(testConfig)) + it should "work in general case" in { + test(new Uncoalescer(uncoalescerTestConfig)) // vcs helps with simulation time, but sometimes errors with // "mutation occurred during iteration" java error // .withAnnotations(Seq(VcsBackendAnnotation)) @@ -466,7 +482,7 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four) c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B) c.io.newEntry.lanes(0).reqs(1).source.poke(2.U) - c.io.newEntry.lanes(0).reqs(1).offset.poke(0.U) + c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) // 
same offset to different lanes c.io.newEntry.lanes(0).reqs(1).sizeEnum.poke(four) c.io.newEntry.lanes(1).reqs(0).valid.poke(false.B) c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B) @@ -500,7 +516,7 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { // offset is counting from LSB c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U) c.io.uncoalResps(0)(0).bits.source.expect(1.U) - c.io.uncoalResps(0)(1).bits.data.expect(0xdeadbeefL.U) + c.io.uncoalResps(0)(1).bits.data.expect(0x5ca1ab1eL.U) c.io.uncoalResps(0)(1).bits.source.expect(2.U) c.io.uncoalResps(2)(0).bits.data.expect(0x89abcdefL.U) c.io.uncoalResps(2)(0).bits.source.expect(2.U) @@ -508,6 +524,67 @@ class UncoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { c.io.uncoalResps(2)(1).bits.source.expect(2.U) } } + + it should "uncoalesce when coalesced to the same word offset" in { + test(new Uncoalescer(uncoalescerTestConfig)) + // .withAnnotations(Seq(VcsBackendAnnotation)) + { c => + val sourceId = 0.U + val four = c.io.newEntry.sizeEnumT.FOUR + c.io.coalReqValid.poke(true.B) + c.io.newEntry.source.poke(sourceId) + c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B) + c.io.newEntry.lanes(0).reqs(0).source.poke(0.U) + c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U) + c.io.newEntry.lanes(0).reqs(0).sizeEnum.poke(four) + c.io.newEntry.lanes(0).reqs(1).valid.poke(false.B) + c.io.newEntry.lanes(1).reqs(0).valid.poke(true.B) + c.io.newEntry.lanes(1).reqs(0).source.poke(1.U) + c.io.newEntry.lanes(1).reqs(0).offset.poke(1.U) + c.io.newEntry.lanes(1).reqs(0).sizeEnum.poke(four) + c.io.newEntry.lanes(1).reqs(1).valid.poke(false.B) + c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B) + c.io.newEntry.lanes(2).reqs(0).source.poke(2.U) + c.io.newEntry.lanes(2).reqs(0).offset.poke(1.U) + c.io.newEntry.lanes(2).reqs(0).sizeEnum.poke(four) + c.io.newEntry.lanes(2).reqs(1).valid.poke(false.B) + c.io.newEntry.lanes(3).reqs(0).valid.poke(true.B) + 
c.io.newEntry.lanes(3).reqs(0).source.poke(3.U) + c.io.newEntry.lanes(3).reqs(0).offset.poke(1.U) + c.io.newEntry.lanes(3).reqs(0).sizeEnum.poke(four) + c.io.newEntry.lanes(3).reqs(1).valid.poke(false.B) + + c.clock.step() + + c.io.coalReqValid.poke(false.B) + + c.clock.step() + + c.io.coalResp.valid.poke(true.B) + c.io.coalResp.bits.source.poke(sourceId) + val lit = (BigInt(0x0123456789abcdefL) << 64) | BigInt(0x5ca1ab1edeadbeefL) + c.io.coalResp.bits.data.poke(lit.U) + + // table lookup is combinational at the same cycle + // offset is counting from LSB + c.io.uncoalResps(0)(0).valid.expect(true.B) + c.io.uncoalResps(0)(0).bits.data.expect(0x5ca1ab1eL.U) + c.io.uncoalResps(0)(0).bits.source.expect(0.U) + c.io.uncoalResps(0)(1).valid.expect(false.B) + c.io.uncoalResps(1)(0).valid.expect(true.B) + c.io.uncoalResps(1)(0).bits.data.expect(0x5ca1ab1eL.U) + c.io.uncoalResps(1)(0).bits.source.expect(1.U) + c.io.uncoalResps(1)(1).valid.expect(false.B) + c.io.uncoalResps(2)(0).valid.expect(true.B) + c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U) + c.io.uncoalResps(2)(0).bits.source.expect(2.U) + c.io.uncoalResps(2)(1).valid.expect(false.B) + c.io.uncoalResps(3)(0).valid.expect(true.B) + c.io.uncoalResps(3)(0).bits.data.expect(0x5ca1ab1eL.U) + c.io.uncoalResps(3)(0).bits.source.expect(3.U) + c.io.uncoalResps(3)(1).valid.expect(false.B) + } + } } class CoalInflightTableUnitTest extends AnyFlatSpec with ChiselScalatestTester { From 16c8eb2ada5d5bf9f458a4455d394254fa31bc17 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 27 Apr 2023 21:54:02 -0700 Subject: [PATCH 09/17] Fix old request not being invalidated; end-to-end flow working --- src/main/scala/tilelink/Coalescing.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index fc87ad2..4fad0e8 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -221,7 +221,7 
@@ class CoalShiftQueue[T <: Data]( io.queue.enq.ready := !valid(entries - 1) // TODO: making this validAfterInv(0) might be useful for the arbiter - io.queue.deq.valid := valid(0) + io.queue.deq.valid := validAfterInv(0) io.queue.deq.bits := elts.head assert(!flow, "flow-through is not implemented") From ec5c6ce08b6892a82a39ee3327d64555f66c4e7a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 00:50:19 -0700 Subject: [PATCH 10/17] Properly copy over queue data to inflight entry --- src/main/scala/tilelink/Coalescing.scala | 44 ++++++++++-------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4fad0e8..067ff0d 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -220,7 +220,6 @@ class CoalShiftQueue[T <: Data]( } io.queue.enq.ready := !valid(entries - 1) - // TODO: making this validAfterInv(0) might be useful for the arbiter io.queue.deq.valid := validAfterInv(0) io.queue.deq.bits := elts.head @@ -338,9 +337,9 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // debug prints when (leadersValid.reduce(_ || _)) { - matchCounts.zipWithIndex.foreach { case (count, i) => - printf(s"lane[${i}] matchCount = %d\n", count); - } + // matchCounts.zipWithIndex.foreach { case (count, i) => + // printf(s"lane[${i}] matchCount = %d\n", count); + // } printf("chosenLeader = lane %d\n", chosenLeaderIdx) printf("chosenLeader matches = [ ") chosenMatches.foreach { m => @@ -547,6 +546,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends reqQueue.io.invalidate.bits := coalescer.io.invalidate.bits(lane) reqQueue.io.invalidate.valid := coalescer.io.invalidate.valid + // NOTE: this relies on CoalShiftQueue's behavior combinationally + // deasserting deq.valid in the same cycle that the head invalidate + // signal goes up. 
tlOut.a.valid := reqQueue.io.queue.deq.valid tlOut.a.bits := reqQueue.io.queue.deq.bits.toTLA(edgeOut) } @@ -703,31 +705,21 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends + s" (${(1 << config.dataBusWidth) * 8})" ) val reqQueueHeads = reqQueues.map(q => q.io.queue.deq.bits) - // newEntry.lanes.foreach { l => - // l.reqs.zipWithIndex.foreach { case (r, i) => - // // TODO: this part needs the actual coalescing logic to work - // r.valid := false.B - // r.source := origReqs(i).source - // r.offset := (origReqs(i).address % (1 << config.maxCoalLogSize).U) >> config.wordWidth - // r.sizeEnum := config.sizeEnum.logSizeToEnum(origReqs(i).size) - // } - // } - // newEntry.lanes(0).reqs(0).valid := true.B - // newEntry.lanes(1).reqs(0).valid := true.B - // newEntry.lanes(2).reqs(0).valid := true.B - // newEntry.lanes(3).reqs(0).valid := true.B + // Do a 2-D copy from every (numLanes * queueDepth) invalidate output of the + // coalescer to every (numLanes * queueDepth) entry in the inflight table. 
(newEntry.lanes zip coalescer.io.invalidate.bits).zipWithIndex .foreach { case ((laneEntry, laneInv), lane) => - (laneEntry.reqs zip laneInv.asBools).foreach { case (reqEntry, inv) => - reqEntry.valid := (coalescer.io.invalidate.valid && inv) - when ((coalescer.io.invalidate.valid && inv)) { - printf(s"entry for reqQueue(${lane}) got invalidated\n") + (laneEntry.reqs zip laneInv.asBools).zipWithIndex + .foreach { case ((reqEntry, inv), i) => + val req = reqQueues(lane).io.elts(i) + when ((coalescer.io.invalidate.valid && inv)) { + printf(s"coalescer: reqQueue(${lane})(${i}) got invalidated (source=%d)\n", req.source) + } + reqEntry.valid := (coalescer.io.invalidate.valid && inv) + reqEntry.source := req.source + reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordWidth) + reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size) } - // FIXME: copying over queue heads out of laziness - reqEntry.source := reqQueueHeads(lane).source - reqEntry.offset := (reqQueueHeads(lane).address % (1 << config.maxCoalLogSize).U) >> config.wordWidth - reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(reqQueueHeads(lane).size) - } } dontTouch(newEntry) From f7bf277e899609a2ff1d97c64df0731dfd3c5c26 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 00:50:34 -0700 Subject: [PATCH 11/17] Fix unittest for CoalShiftQueue deq.valid change --- src/test/scala/coalescing/CoalescingUnitTest.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index b97e5a5..7469e59 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -357,12 +357,11 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { c.io.queue.enq.valid.poke(false.B) // invalidate should work for the head just being dequeued at the same - // cycle. 
However, it should not change deq.valid right away to avoid - // combinational cycles (see definition). + // cycle c.io.invalidate.valid.poke(true.B) c.io.invalidate.bits.poke(0x1.U) c.io.queue.deq.ready.poke(true.B) - c.io.queue.deq.valid.expect(true.B) + c.io.queue.deq.valid.expect(false.B) c.clock.step() // 0x12 should have been dequeued c.io.invalidate.valid.poke(false.B) From f12211b9cc2f90a518bedc2f5b4f56644116025c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 01:15:37 -0700 Subject: [PATCH 12/17] Remove duplicate instantiation of table entryT --- src/main/scala/tilelink/Coalescing.scala | 34 +++++++++--------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 067ff0d..a5c4f24 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -59,7 +59,6 @@ case class CoalescerConfig( object defaultConfig extends CoalescerConfig( numLanes = 4, - // TODO: bigger size queueDepth = 1, waitTimeout = 8, addressWidth = 24, @@ -681,19 +680,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends // logic to generate the Inflight Entry into the uncoalescer, where it should be. // this also reduces top level clutter. 
- val offsetBits = 4 // FIXME hardcoded - // but the width of the size enum - val newEntry = Wire( - new InflightCoalReqTableEntry( - config.numLanes, - numPerLaneReqs, - sourceWidth, - offsetBits, - config.sizeEnum - ) - ) - println(s"=========== table sourceWidth: ${sourceWidth}") - // println(s"=========== table sizeEnumBits: ${newEntry.sizeEnumBits}") + val uncoalescer = Module(new Uncoalescer(config)) + + val newEntry = Wire(uncoalescer.inflightTable.entryT) newEntry.source := coalescer.io.coalReq.bits.source // TODO: richard to write table fill logic @@ -719,13 +708,11 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends reqEntry.source := req.source reqEntry.offset := ((req.address % (1 << config.maxCoalLogSize).U) >> config.wordWidth) reqEntry.sizeEnum := config.sizeEnum.logSizeToEnum(req.size) + // TODO: load/store op } } dontTouch(newEntry) - // Uncoalescer module uncoalesces responses back to each lane - val uncoalescer = Module(new Uncoalescer(config)) - uncoalescer.io.coalReqValid := coalescer.io.coalReq.valid uncoalescer.io.newEntry := newEntry // Cleanup: custom <>? @@ -745,9 +732,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends ) q.io.enq(respQueueUncoalPortOffset + i).valid := resp.valid q.io.enq(respQueueUncoalPortOffset + i).bits := resp.bits - when (resp.valid) { - printf(s"${i}-th uncoalesced response came back from lane ${lane}\n") - } + // debug + // when (resp.valid) { + // printf(s"${i}-th uncoalesced response came back from lane ${lane}\n") + // } // dontTouch(q.io.enq(respQueueCoalPortOffset)) } } @@ -873,8 +861,7 @@ class Uncoalescer(config: CoalescerConfig) extends Module { // split the coalesced response back to individual per-lane responses with the // right metadata. 
class InflightCoalReqTable(config: CoalescerConfig) extends Module { - val offsetBits = 4 // FIXME hardcoded - val sizeBits = 2 // FIXME hardcoded + val offsetBits = config.maxCoalLogSize - config.wordWidth // assumes word offset val entryT = new InflightCoalReqTableEntry( config.numLanes, config.queueDepth, @@ -886,6 +873,9 @@ class InflightCoalReqTable(config: CoalescerConfig) extends Module { val entries = config.numNewSrcIds val sourceWidth = log2Ceil(config.numOldSrcIds) + println(s"=========== table sourceWidth: ${sourceWidth}") + println(s"=========== table sizeEnumBits: ${entryT.sizeEnumT.getWidth}") + val io = IO(new Bundle { val enq = Flipped(Decoupled(entryT)) // TODO: return actual stuff From fc948b36d857f7d9a4af08be58935fe22b630da6 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 01:37:36 -0700 Subject: [PATCH 13/17] Only compare leader against followers >= leader idx --- src/main/scala/tilelink/Coalescing.scala | 28 +++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index a5c4f24..135dfa0 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -274,11 +274,9 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], leadersValid(i), head.source, head.address) } } - - // debug assertions and prints when (leadersValid.reduce(_ || _)) { assert(testNoQueueDrift, "unexpected drift between lane request queues") - printQueueHeads + // printQueueHeads } val size = coalLogSize @@ -291,15 +289,19 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // Gives a 2-D table of Bools representing match at every queue entry, // for each lane (so 3-D in total). 
- val matchTablePerLane = (leaders zip leadersValid).map { case (leader, leaderValid) => - // TODO: match leader to only lanes >= leader idx - io.window.map { followerLane => - // compare leader's head against follower's every queue entry - (followerLane.elts zip followerLane.mask.asBools).map { case (follower, followerValid) => - canMatch(follower, followerValid, leader, leaderValid) + val matchTablePerLane = (leaders zip leadersValid).zipWithIndex + .map { case ((leader, leaderValid), leaderIndex) => + io.window.zipWithIndex.map { case (followerQueue, followerIndex) => + // compare leader's head against follower's every queue entry + (followerQueue.elts zip followerQueue.mask.asBools) + .map { case (follower, followerValid) => + // match leader to only followers at lanes >= leader idx + // this halves the number of comparators + if (followerIndex < leaderIndex) false.B + else canMatch(follower, followerValid, leader, leaderValid) + } } } - } // TODO: potentially expensive: popcount & adder val matchCounts = matchTablePerLane.map(table => @@ -336,9 +338,9 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // debug prints when (leadersValid.reduce(_ || _)) { - // matchCounts.zipWithIndex.foreach { case (count, i) => - // printf(s"lane[${i}] matchCount = %d\n", count); - // } + matchCounts.zipWithIndex.foreach { case (count, i) => + printf(s"lane[${i}] matchCount = %d\n", count); + } printf("chosenLeader = lane %d\n", chosenLeaderIdx) printf("chosenLeader matches = [ ") chosenMatches.foreach { m => From 2622bf04d3567416cbf9dc6a0767f61fd04fb344 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 14:12:29 -0700 Subject: [PATCH 14/17] Add allowShift to CoalShiftQueue IO to synchronize shifting --- src/main/scala/tilelink/Coalescing.scala | 16 +++++++++++----- .../scala/coalescing/CoalescingUnitTest.scala | 6 ++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala 
b/src/main/scala/tilelink/Coalescing.scala index 135dfa0..0cb7c91 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -153,10 +153,14 @@ class ReqSourceGen(sourceWidth: Int) extends Module { // A shift-register queue implementation that supports invalidating entries // and exposing queue contents as output IO. (TODO: support deadline) // Initially copied from freechips.rocketchip.util.ShiftQueue. -// If `pipe` is true, support enqueueing to a full queue when also dequeueing. +// The queue only shifts down when `allowShift` is given true. Dequeueing +// works normally, but if allowShift was false, the queue head will stay +// invalid after dequeueing. This option is added in order to synchronize the +// shifting of the queues between lanes to model the SIMD behavior. +// If `pipe` is true, support enqueueing to a full queue when head is being +// dequeued at the next cycle. // Software model: window.py -class CoalShiftQueue[T <: Data]( - gen: T, +class CoalShiftQueue[T <: Data]( gen: T, val entries: Int, pipe: Boolean = true, flow: Boolean = false @@ -164,6 +168,7 @@ class CoalShiftQueue[T <: Data]( val io = IO(new Bundle { val queue = new QueueIO(gen, entries) val invalidate = Input(Valid(UInt(entries.W))) + val allowShift = Input(Bool()) val mask = Output(UInt(entries.W)) val elts = Output(Vec(entries, gen)) // 'QueueIO' provides io.count, but we might not want to use it in the @@ -192,7 +197,7 @@ class CoalShiftQueue[T <: Data]( def paddedUsed = pad({ i: Int => used(i) }) def validAfterInv(i: Int) = valid(i) && (!io.invalidate.valid || !io.invalidate.bits(i)) - val shift = (used =/= 0.U) && (io.queue.deq.ready || !validAfterInv(0)) + val shift = io.allowShift && (used =/= 0.U) && (io.queue.deq.fire || !validAfterInv(0)) for (i <- 0 until entries) { val wdata = if (i == entries - 1) io.queue.enq.bits else Mux(!used(i + 1), io.queue.enq.bits, elts(i + 1)) val wen = Mux( @@ -540,12 +545,13 @@ class 
CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends assert(reqQueue.io.queue.enq.ready, "reqQueue is supposed to be always ready") reqQueue.io.queue.enq.valid := tlIn.a.valid reqQueue.io.queue.enq.bits := req - // TODO: deq.ready should respect downstream ready + // TODO: deq.ready should respect downstream arbiter reqQueue.io.queue.deq.ready := true.B // invalidate queue entries that contain original core requests that got // coalesced into a wider one reqQueue.io.invalidate.bits := coalescer.io.invalidate.bits(lane) reqQueue.io.invalidate.valid := coalescer.io.invalidate.valid + reqQueue.io.allowShift := true.B // NOTE: this relies on CoalShiftQueue's behavior combinationally // deasserting deq.valid in the same cycle that the head invalidate diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 7469e59..66f6b9f 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -224,6 +224,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { it should "work like normal shiftqueue when no invalidate" in { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.queue.deq.ready.poke(false.B) + c.io.allowShift.poke(true.B) c.io.queue.enq.ready.expect(true.B) c.io.queue.enq.valid.poke(true.B) @@ -272,6 +273,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { it should "work when enqueing and dequeueing simultaneously" in { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.invalidate.valid.poke(false.B) + c.io.allowShift.poke(true.B) // prepare c.io.queue.deq.ready.poke(true.B) @@ -303,6 +305,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { it should "work when enqueing and dequeueing simultaneously to a full queue" in { test(new CoalShiftQueue(UInt(8.W), 1)) { c => c.io.invalidate.valid.poke(false.B) + c.io.allowShift.poke(true.B) // prepare 
c.io.queue.deq.ready.poke(true.B) @@ -342,6 +345,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { it should "invalidate head being dequeued" in { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.invalidate.valid.poke(false.B) + c.io.allowShift.poke(true.B) // prepare c.io.queue.deq.ready.poke(false.B) @@ -374,6 +378,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { it should "dequeue invalidated entries by itself" in { test(new CoalShiftQueue(gen = UInt(8.W), entries = 4)) { c => c.io.invalidate.valid.poke(false.B) + c.io.allowShift.poke(true.B) // prepare c.io.queue.deq.ready.poke(false.B) @@ -414,6 +419,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.invalidate.valid.poke(false.B) c.io.invalidate.bits.poke(0.U) + c.io.allowShift.poke(true.B) // prepare c.io.queue.deq.ready.poke(false.B) From 44d3c09b6d11c13f32a52350c83f8ea75dae5eed Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 14:58:47 -0700 Subject: [PATCH 15/17] Fix used bit logic when invalidating but not dequeueing --- src/main/scala/tilelink/Coalescing.scala | 5 +++-- .../scala/coalescing/CoalescingUnitTest.scala | 21 ++++++++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 0cb7c91..2590884 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -216,10 +216,10 @@ class CoalShiftQueue[T <: Data]( gen: T, } when(io.queue.enq.fire) { - when(!io.queue.deq.fire) { + when(!shift) { used := (used << 1.U) | 1.U } - }.elsewhen(io.queue.deq.fire) { + }.elsewhen(shift) { used := used >> 1.U } @@ -229,6 +229,7 @@ class CoalShiftQueue[T <: Data]( gen: T, assert(!flow, "flow-through is not implemented") if (flow) { + // FIXME old code when(io.queue.enq.valid) { io.queue.deq.valid := true.B } 
when(!valid(0)) { io.queue.deq.bits := io.queue.enq.bits } } diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 66f6b9f..41017e5 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -375,9 +375,10 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } - it should "dequeue invalidated entries by itself" in { + it should "dequeue invalidated head on its own when allowShift" in { test(new CoalShiftQueue(gen = UInt(8.W), entries = 4)) { c => c.io.invalidate.valid.poke(false.B) + c.io.allowShift.poke(true.B) // prepare @@ -399,19 +400,33 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { // invalidate two entries at head c.io.invalidate.valid.poke(true.B) c.io.invalidate.bits.poke(0x3.U) + c.io.queue.deq.ready.poke(false.B) // [ 0x56 | 0x34(inv) | 0x12(inv) ] c.clock.step() - // [ 0x56 | 0x34(inv) ] + // [ 0x56 | 0x34(inv) ] c.io.invalidate.valid.poke(false.B) c.io.queue.deq.ready.poke(false.B) c.clock.step() - // [ 0x56 ] + // [ 0x56 ] c.io.queue.deq.ready.poke(true.B) c.io.queue.deq.valid.expect(true.B) c.io.queue.deq.bits.expect(0x56.U) c.clock.step() c.io.queue.deq.ready.poke(true.B) c.io.queue.deq.valid.expect(false.B) + c.clock.step() + + // do one more enqueue-then-dequeue to see if used bit was properly cleared + c.io.queue.deq.ready.poke(false.B) + c.io.queue.enq.ready.expect(true.B) + c.io.queue.enq.valid.poke(true.B) + c.io.queue.enq.bits.poke(0x78.U) + c.clock.step() + // should dequeue right away + c.io.queue.enq.valid.poke(false.B) + c.io.queue.deq.ready.poke(true.B) + c.io.queue.deq.valid.expect(true.B) + c.io.queue.deq.bits.expect(0x78.U) } } From a49931ae60b3edc6be78d6828f0cfc1fc8c041e3 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 15:08:39 -0700 Subject: [PATCH 16/17] Add invalidate/enq test case for depth=1 CoalShiftQueue --- 
.../scala/coalescing/CoalescingUnitTest.scala | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 41017e5..20b95c1 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -302,7 +302,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } - it should "work when enqueing and dequeueing simultaneously to a full queue" in { + it should "work when enqueing and dequeueing simultaneously to a depth=1 queue" in { test(new CoalShiftQueue(UInt(8.W), 1)) { c => c.io.invalidate.valid.poke(false.B) c.io.allowShift.poke(true.B) @@ -342,6 +342,43 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } + it should "work when invalidating and enqueueing to a depth=1 queue" in { + test(new CoalShiftQueue(UInt(8.W), 1)) { c => + c.io.invalidate.valid.poke(false.B) + c.io.allowShift.poke(true.B) + // no dequeueing + c.io.queue.deq.ready.poke(false.B) + + // prepare + c.io.queue.enq.ready.expect(true.B) + c.io.queue.enq.valid.poke(true.B) + c.io.queue.enq.bits.poke(0x12.U) + c.clock.step() + // invalidate, but don't allow shift + c.io.allowShift.poke(false.B) + c.io.invalidate.valid.poke(true.B) + c.io.invalidate.bits.poke(0x1.U) + // TODO: we might be able to enqueue to a depth=1 queue whose entry + // just got invalidated, so that enq.ready is true.B here, but it is a + // niche case + c.io.queue.enq.ready.expect(false.B) + c.clock.step() + // now try enqueueing now that we have space + c.io.allowShift.poke(true.B) + c.io.invalidate.valid.poke(false.B) + c.io.queue.enq.ready.expect(true.B) + c.io.queue.enq.valid.poke(true.B) + c.io.queue.enq.bits.poke(0x34.U) + c.io.queue.deq.valid.expect(false.B) + c.clock.step() + // see if it comes out right next cycle + c.io.queue.enq.valid.poke(false.B) + 
c.io.queue.deq.ready.poke(true.B) + c.io.queue.deq.valid.expect(true.B) + c.io.queue.deq.bits.expect(0x34.U) + } + } + it should "invalidate head being dequeued" in { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.invalidate.valid.poke(false.B) From fec788d648d20fbd983e69066c6c82cdf062bb12 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 28 Apr 2023 15:46:45 -0700 Subject: [PATCH 17/17] Invalidate head when dequeued but allowShift was false --- src/main/scala/tilelink/Coalescing.scala | 5 +++ .../scala/coalescing/CoalescingUnitTest.scala | 45 +++++++++++++++++-- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 2590884..56c99aa 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -213,6 +213,11 @@ class CoalShiftQueue[T <: Data]( gen: T, (io.queue.enq.fire && !paddedUsed(i + 1) && used(i)) || pad(validAfterInv)(i + 1), (io.queue.enq.fire && paddedUsed(i - 1) && !used(i)) || validAfterInv(i) ) + // additionally, head entry should get invalidated when dequeue fired + // but queue didn't shift (e.g. 
because allowShift was false) + when (io.queue.deq.fire && !shift) { + valid(0) := false.B + } } when(io.queue.enq.fire) { diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 20b95c1..49a5639 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -302,6 +302,43 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } + it should "not shift entries when allowShift is false" in { + test(new CoalShiftQueue(UInt(8.W), 4)) { c => + c.io.invalidate.valid.poke(false.B) + c.io.queue.deq.ready.poke(false.B) + + c.io.allowShift.poke(false.B) + + // prepare + c.io.queue.enq.ready.expect(true.B) + c.io.queue.enq.valid.poke(true.B) + c.io.queue.enq.bits.poke(0x12.U) + c.clock.step() + c.io.queue.enq.ready.expect(true.B) + c.io.queue.enq.valid.poke(true.B) + c.io.queue.enq.bits.poke(0x34.U) + c.clock.step() + c.io.queue.enq.valid.poke(false.B) + + // dequeueing should work normally when allowShift is false... 
+ c.io.queue.deq.ready.poke(true.B) + c.io.queue.deq.valid.expect(true.B) + c.io.queue.deq.bits.expect(0x12.U) + c.clock.step() + // but should stop there and not dequeue the next entry + c.io.queue.deq.ready.poke(true.B) + c.io.queue.deq.valid.expect(false.B) + c.clock.step() + // when allowShift is back on, dequeueing should start working from next + // cycle + c.io.allowShift.poke(true.B) + c.clock.step() + c.io.queue.deq.ready.poke(true.B) + c.io.queue.deq.valid.expect(true.B) + c.io.queue.deq.bits.expect(0x34.U) + } + } + it should "work when enqueing and dequeueing simultaneously to a depth=1 queue" in { test(new CoalShiftQueue(UInt(8.W), 1)) { c => c.io.invalidate.valid.poke(false.B) @@ -358,9 +395,9 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { c.io.allowShift.poke(false.B) c.io.invalidate.valid.poke(true.B) c.io.invalidate.bits.poke(0x1.U) - // TODO: we might be able to enqueue to a depth=1 queue whose entry - // just got invalidated, so that enq.ready is true.B here, but it is a - // niche case + // TODO: we might be able to enqueue to a full depth=1 queue whose only + // entry just got invalidated, so that enq.ready is true here, but + // it is a niche case c.io.queue.enq.ready.expect(false.B) c.clock.step() // now try enqueueing now that we have space c.io.allowShift.poke(true.B) @@ -379,7 +416,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } - it should "invalidate head being dequeued" in { + it should "invalidate head that is also being dequeued" in { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.invalidate.valid.poke(false.B) c.io.allowShift.poke(true.B)