More comments & renames

2023-04-27 19:17:07 -07:00
parent 900f5adb20
commit 7780250c7a
2 changed files with 171 additions and 122 deletions
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -36,22 +36,22 @@ object DefaultInFlightTableSizeEnum extends InFlightTableSizeEnum {
 }

 case class CoalescerConfig(
-  numLanes: Int,        // number of lanes (or threads) in a warp
-  maxSize: Int,         // maximum burst size (64 bytes)
-  queueDepth: Int,      // request window per lane
-  waitTimeout: Int,     // max cycles to wait before forced fifo dequeue, per lane
-  addressWidth: Int,    // assume <= 32
-  dataBusWidth: Int,    // memory-side downstream TileLink data bus size
-                        // this has to be at least larger than the word size for
-                        // the coalescer to perform well
-  // watermark = 2,     // minimum buffer occupancy to start coalescing
-  wordSizeInBytes: Int, // 32-bit system
-  wordWidth: Int,       // log(WORD_SIZE)
-  numOldSrcIds: Int,    // num of outstanding requests per lane, from processor
-  numNewSrcIds: Int,    // num of outstanding coalesced requests
-  respQueueDepth: Int,  // depth of the response fifo queues
-  coalSizes: Seq[Int],  // list of coalescer sizes to try in the MonoCoalescers
-                        // must be power of 2's
+  numLanes: Int,          // number of lanes (or threads) in a warp
+  maxSize: Int,           // log2 of the maximum burst size in bytes (e.g. 6 for 64 bytes)
+  queueDepth: Int,        // request window per lane
+  waitTimeout: Int,       // max cycles to wait before forced fifo dequeue, per lane
+  addressWidth: Int,      // assume <= 32
+  dataBusWidth: Int,      // log2 of the memory-side downstream TileLink data bus size in bytes;
+                          // the bus has to be wider than the word size for
+                          // the coalescer to perform well
+  // watermark = 2,       // minimum buffer occupancy to start coalescing
+  wordSizeInBytes: Int,   // word size in bytes (4 for a 32-bit system)
+  wordWidth: Int,         // log2(wordSizeInBytes)
+  numOldSrcIds: Int,      // num of outstanding requests per lane, from processor
+  numNewSrcIds: Int,      // num of outstanding coalesced requests
+  respQueueDepth: Int,    // depth of the response fifo queues
+  coalLogSizes: Seq[Int], // list of coalescing sizes to try, one MonoCoalescer per entry;
+                          // each entry is log2 of the coalesced size in bytes
  sizeEnum: InFlightTableSizeEnum
 )

@@ -69,7 +69,7 @@ object defaultConfig extends CoalescerConfig(
  numOldSrcIds = 16,
  numNewSrcIds = 4,
  respQueueDepth = 4,
-  coalSizes = Seq(3),
+  coalLogSizes = Seq(3),
  sizeEnum = DefaultInFlightTableSizeEnum
 )

@@ -243,7 +243,7 @@ class CoalShiftQueue[T <: Data](
 }

 // Software model: coalescer.py
-class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
+class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
                    config: CoalescerConfig) extends Module {
  val io = IO(new Bundle {
    val window = Input(Vec(config.numLanes, windowT.io.cloneType))
@@ -251,6 +251,8 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
      val leaderIdx = Output(UInt(log2Ceil(config.numLanes).W))
      val baseAddr = Output(UInt(config.addressWidth.W))
      val matchOH = Output(Vec(config.numLanes, UInt(config.queueDepth.W)))
+      // number of queue entries that match the leader lane's head;
+      // at most numLanes * queueDepth (NOTE(review): needs log2Ceil(max + 1) bits -- confirm)
      val matchCount = Output(UInt(log2Ceil(config.numLanes * config.queueDepth).W))
      val coverageHits = Output(UInt((1 << config.maxSize).W))
    })
@@ -284,7 +286,7 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
    printQueueHeads
  }

-  val size = coalSize
+  val size = coalLogSize
  val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
  def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = {
    (req0.op === req1.op) &&
@@ -323,6 +325,18 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
  })(chosenLeaderIdx)
  val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)

+  // coverage calculation
+  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth)
+  // 2-D table flattened to 1-D
+  val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address)))
+  val valids = io.window.map(_.mask).flatMap(_.asBools)
+  val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target =>
+    // true if any valid queue entry accesses the given offset word of the
+    // coalesced chunk; if true for all offsets, we've reached 100% utilization
+    // of the coalesced data words
+    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+  }
+
  // debug prints
  when (leadersValid.reduce(_ || _)) {
    matchCounts.zipWithIndex.foreach { case (count, i) =>
@@ -334,14 +348,12 @@ class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
      printf("%d ", m)
    }
    printf("]\n")
-  }

-  // coverage calculation
-  def getOffsetSlice(addr: UInt) = addr(size - 1, config.wordWidth)
-  val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address)))
-  val valids = io.window.map(_.mask).flatMap(_.asBools)
-  val hits = Seq.tabulate(1 << (size - config.wordWidth)) { target =>
-    (offsets zip valids).map { case (offset, valid) => valid && (offset === target.U) }.reduce(_ || _)
+    printf("hits = [ ")
+    hits.foreach { m =>
+      printf("%d ", m)
+    }
+    printf("]\n")
  }

  io.results.leaderIdx := chosenLeaderIdx
@@ -356,16 +368,19 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
                     config: CoalescerConfig) extends Module {

  val io = IO(new Bundle {
+    // coalescing window, connected to the contents of the request queues
    val window = Input(Vec(config.numLanes, windowT.io.cloneType))
+    // newly generated coalesced request
    val outReq = DecoupledIO(coalReqT.cloneType)
+    // invalidate signals going into each request queue's head
    val invalidate = Output(Valid(Vec(config.numLanes, UInt(config.queueDepth.W))))
  })

-  val coalescers = config.coalSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  val coalescers = config.coalLogSizes.map(size => Module(new MonoCoalescer(size, windowT, config)))
  coalescers.foreach(_.io.window := io.window)

-  def normalize(x: Seq[UInt]): Seq[UInt] = {
-    x.zip(config.coalSizes).map { case (hits, size) =>
+  def normalize(valPerSize: Seq[UInt]): Seq[UInt] = {
+    (valPerSize zip config.coalLogSizes).map { case (hits, size) =>
      (hits << (config.maxSize - size).U).asUInt
    }
  }
@@ -378,27 +393,34 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
    }._2
  }

+  // normalize to maximum coalescing size so that we can do fair comparisons
+  // between coalescing results of different sizes
  val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount))
  val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits))

-  val chosenIdx = Wire(UInt(log2Ceil(config.coalSizes.size).W))
+  val chosenSizeIdx = Wire(UInt(log2Ceil(config.coalLogSizes.size).W))
  val chosenValid = Wire(Bool())
  // minimum 25% coverage
-  val minCoverage = 1.max(1 << (config.maxSize - 4))
+  val minCoverage = 1.max(1 << ((config.maxSize - 2) - 2))
+  printf("matchCount[0]=%d\n", coalescers(0).io.results.matchCount)
+  printf("normalizedMatches[0]=%d\n", normalizedMatches(0))
+  printf("coverageHits[0]=%d\n", coalescers(0).io.results.coverageHits)
+  printf("normalizedHits[0]=%d\n", normalizedHits(0))
+  printf("minCoverage=%d\n", minCoverage.U)
  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
-    chosenIdx := argMax(normalizedHits)
+    chosenSizeIdx := argMax(normalizedHits)
    chosenValid := true.B
  }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) {
-    chosenIdx := argMax(normalizedMatches)
+    chosenSizeIdx := argMax(normalizedMatches)
    chosenValid := true.B
  }.otherwise {
-    chosenIdx := DontCare
+    chosenSizeIdx := DontCare
    chosenValid := false.B
  }

  // create coalesced request
-  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenIdx)
-  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenIdx)
+  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenSizeIdx)
+  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenSizeIdx)

  // flatten requests and matches
  val flatReqs = io.window.flatMap(_.elts)
@@ -437,13 +459,18 @@ class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueE
  val sourceGen = Module(new ReqSourceGen(log2Ceil(config.numNewSrcIds)))
  sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created

+  val coalesceValid = chosenValid && sourceGen.io.id.valid
+  when (coalesceValid) {
+    printf("coalescing success!\n")
+  }
+
  io.outReq.bits.source := sourceGen.io.id.bits
  io.outReq.bits.mask := mask.asUInt
  io.outReq.bits.data := data.asUInt
  io.outReq.bits.size := chosenSize
  io.outReq.bits.address := chosenBundle.baseAddr
  io.outReq.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op
-  io.outReq.valid := chosenValid && sourceGen.io.id.valid
+  io.outReq.valid := coalesceValid

  io.invalidate.bits := chosenBundle.matchOH
  io.invalidate.valid := io.outReq.fire // invalidate only when fire
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -35,28 +35,47 @@ class MultiPortQueueUnitTest extends AnyFlatSpec with ChiselScalatestTester {

 class DummyCoalescingUnitTB(implicit p: Parameters) extends LazyModule {
  val cpuNodes = Seq.tabulate(testConfig.numLanes) { _ =>
-    TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters(
-      name = "processor-nodes",
-      sourceId = IdRange(0, testConfig.numOldSrcIds),
-      requestFifo = true,
-      visibility = Seq(AddressSet(0x0, 0xffffff))))))) // 24 bit address space (TODO probably use testConfig)
+    TLClientNode(
+      Seq(
+        TLMasterPortParameters.v1(
+          Seq(
+            TLClientParameters(
+              name = "processor-nodes",
+              sourceId = IdRange(0, testConfig.numOldSrcIds),
+              requestFifo = true,
+              visibility = Seq(AddressSet(0x0, 0xffffff))
+            )
+          )
+        )
+      )
+    ) // 24 bit address space (TODO probably use testConfig)
  }

  val device = new SimpleDevice("dummy", Seq("dummy"))
  val beatBytes = 1 << testConfig.dataBusWidth // 256 bit bus
  val l2Nodes = Seq.tabulate(5) { _ =>
-    TLManagerNode(Seq(TLSlavePortParameters.v1(Seq(TLManagerParameters(
-      address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode
-      resources = device.reg,
-      regionType = RegionType.UNCACHED,
-      executable = true,
-      supportsArithmetic = TransferSizes(1, beatBytes),
-      supportsLogical = TransferSizes(1, beatBytes),
-      supportsGet = TransferSizes(1, beatBytes),
-      supportsPutFull = TransferSizes(1, beatBytes),
-      supportsPutPartial = TransferSizes(1, beatBytes),
-      supportsHint = TransferSizes(1, beatBytes),
-      fifoId = Some(0))), beatBytes)))
+    TLManagerNode(
+      Seq(
+        TLSlavePortParameters.v1(
+          Seq(
+            TLManagerParameters(
+              address = Seq(AddressSet(0x0, 0xffffff)), // should be matching cpuNode
+              resources = device.reg,
+              regionType = RegionType.UNCACHED,
+              executable = true,
+              supportsArithmetic = TransferSizes(1, beatBytes),
+              supportsLogical = TransferSizes(1, beatBytes),
+              supportsGet = TransferSizes(1, beatBytes),
+              supportsPutFull = TransferSizes(1, beatBytes),
+              supportsPutPartial = TransferSizes(1, beatBytes),
+              supportsHint = TransferSizes(1, beatBytes),
+              fifoId = Some(0)
+            )
+          ),
+          beatBytes
+        )
+      )
+    )
  }

  val dut = LazyModule(new CoalescingUnit(testConfig))
@@ -80,83 +99,85 @@ class DummyCoalescingUnitTBImp(outer: DummyCoalescingUnitTB) extends LazyModuleI
 class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
  behavior of "multi- and mono-coalescers"

+  implicit val p: Parameters = Parameters.empty
+
+  val tb = LazyModule(new DummyCoalescingUnitTB())
+  // val outer = LazyModule(new CoalescingUnit(testConfig))
+
+  val coal = tb.dut
+  tb.cpuNodes.foreach(coal.node := _)
+  tb.l2Nodes.foreach(_ := coal.node)
+
+  def pokeA(
+      nodes: Seq[TLBundle],
+      idx: Int,
+      op: Int,
+      size: Int,
+      source: Int,
+      addr: Int,
+      mask: Int,
+      data: Int
+  ): Unit = {
+    val node = nodes(idx)
+//        node.a.ready.expect(true.B) // FIXME: this fails currently
+    node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get)
+    node.a.bits.param.poke(0.U)
+    node.a.bits.size.poke(size.U)
+    node.a.bits.source.poke(source.U)
+    node.a.bits.address.poke(addr.U)
+    node.a.bits.mask.poke(mask.U)
+    node.a.bits.data.poke(data.U)
+    node.a.bits.corrupt.poke(false.B)
+    node.a.valid.poke(true.B)
+  }
+
+  def unsetA(nodes: Seq[TLBundle]): Unit = {
+    nodes.foreach { node =>
+      node.a.valid.poke(false.B)
+    }
+  }
+
  it should "coalesce fully consecutive accesses at size 4, only once" in {
-    implicit val p: Parameters = Parameters.empty
-
-    val tb = LazyModule(new DummyCoalescingUnitTB())
-//    val outer = LazyModule(new CoalescingUnit(testConfig))
-
-    val coal = tb.dut
-    tb.cpuNodes.foreach(coal.node := _)
-    tb.l2Nodes.foreach(_ := coal.node)
-
-    test(tb.module).withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c =>
+    test(tb.module)
+    // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation))
+    { c =>
+      println(s"coalIO length = ${c.coalIOs(0).length}")
      val nodes = c.coalIOs.map(_.head)
 //      val nodes = c.cpuNodesImp.map(_.out.head._1)
 //      val nodes = c.coal.node.in.map(_._1)
 //      val nodes = c.mitmNodesImp.map(_.in.head._1)

-      def pokeA(nodes: Seq[TLBundle], idx: Int, op: Int, size: Int, source: Int, addr: Int, mask: Int, data: Int): Unit = {
-        val node = nodes(idx)
-//        node.a.ready.expect(true.B) // FIXME: this fails currently
-        node.a.bits.opcode.poke(if (op == 1) TLMessages.PutFullData else TLMessages.Get)
-        node.a.bits.param.poke(0.U)
-        node.a.bits.size.poke(size.U)
-        node.a.bits.source.poke(source.U)
-        node.a.bits.address.poke(addr.U)
-        node.a.bits.mask.poke(mask.U)
-        node.a.bits.data.poke(data.U)
-        node.a.bits.corrupt.poke(false.B)
-        node.a.valid.poke(true.B)
-      }
-
-      def unsetA(): Unit = {
-        nodes.foreach { node =>
-          node.a.valid.poke(false.B)
-        }
-      }
-
      // always ready to take coalesced requests
 //      c.coalMasterNode.head.a.ready.poke(true.B)
 //      c.coal.module.coalescer.io.outReq.ready.poke(true.B)

-      pokeA(nodes, idx=0, op=1, size=2, source=0, addr=0x10, mask=0xf, data=0x1111)
-      pokeA(nodes, idx=1, op=1, size=2, source=0, addr=0x14, mask=0xf, data=0x2222)
-      pokeA(nodes, idx=2, op=1, size=2, source=0, addr=0x18, mask=0xf, data=0x3333)
-      pokeA(nodes, idx=3, op=1, size=2, source=0, addr=0x1c, mask=0xf, data=0x4444)
+      pokeA(nodes, idx = 0, op = 1, size = 2, source = 0, addr = 0x10, mask = 0xf, data = 0x1111)
+      pokeA(nodes, idx = 1, op = 1, size = 2, source = 0, addr = 0x14, mask = 0xf, data = 0x2222)
+      pokeA(nodes, idx = 2, op = 1, size = 2, source = 0, addr = 0x18, mask = 0xf, data = 0x3333)
+      pokeA(nodes, idx = 3, op = 1, size = 2, source = 0, addr = 0x1c, mask = 0xf, data = 0x4444)

      c.clock.step()

-      unsetA()
+      unsetA(nodes)

      c.clock.step()
      c.clock.step()
    }
  }

-  it should "coalesce strided accesses at size 6" in {
+  it should "coalesce identical addresses (stride of 0)" in {}

-  }
+  it should "coalesce strided accesses at size 6" in {}

-  it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in {
+  it should "coalesce the coalescable chunk and leave 2 uncoalescable requests" in {}

-  }
+  it should "not touch uncoalescable requests" in {}

-  it should "not touch uncoalescable requests" in {
+  it should "allow temporal coalescing when depth >=2" in {}

-  }
+  it should "select the most coverage mono-coalescer" in {}

-  it should "allow temporal coalescing when depth >=2" in {
-
-  }
-
-  it should "select the most coverage mono-coalescer" in {
-
-  }
-
-  it should "resort to the backup policy when coverage is below average" in {
-
-  }
+  it should "resort to the backup policy when coverage is below average" in {}
 }

 class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
@@ -381,22 +402,23 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester {
  }
 }

-object testConfig extends CoalescerConfig(
-  maxSize = 5,
-  queueDepth = 2,
-  waitTimeout = 8,
-  addressWidth = 24,
-  dataBusWidth = 5,
-  numLanes = 4,
-  // watermark = 2,
-  wordSizeInBytes = 4,
-  wordWidth = 2,
-  numOldSrcIds = 16,
-  numNewSrcIds = 4,
-  respQueueDepth = 4,
-  coalSizes = Seq(4, 5),
-  sizeEnum = DefaultInFlightTableSizeEnum
-)
+object testConfig
+    extends CoalescerConfig(
+      maxSize = 5,
+      queueDepth = 2,
+      waitTimeout = 8,
+      addressWidth = 24,
+      dataBusWidth = 5,
+      numLanes = 4,
+      // watermark = 2,
+      wordSizeInBytes = 4,
+      wordWidth = 2,
+      numOldSrcIds = 16,
+      numNewSrcIds = 4,
+      respQueueDepth = 4,
+      coalLogSizes = Seq(4, 5),
+      sizeEnum = DefaultInFlightTableSizeEnum
+    )

 class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester {
  behavior of "uncoalescer"