Tie tile smem ports together using Xbars; comment out Gemmini spad

TODO: pull Gemmini out to the cluster as well
2024-03-04 13:45:54 -08:00
parent 805651a11b
commit b3a9207136
2 changed files with 83 additions and 77 deletions
--- a/src/main/scala/radiance/tile/RadianceCluster.scala
+++ b/src/main/scala/radiance/tile/RadianceCluster.scala
@@ -34,26 +34,52 @@ class RadianceCluster (

  clbus.clockGroupNode := allClockGroupsNode

-  val numLsuLanes = 4
+  // Instantiate cluster-local shared memory scratchpad
+  //
+  // Instantiate the same number of banks as there are lanes.
+  val numLsuLanes = 4 // FIXME: hardcoded
  val wordSize = 4
  val smemBanks = Seq.tabulate(numLsuLanes) { bankId =>
    // Banked-by-word (4 bytes)
    // base for bank 1: ff...000000|01|00
    // mask for bank 1; 00...111111|00|11
-    // val base = 0xff000000L | (bankId * 4 /*wordSize*/ )
-    // val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ )
    val base = 0xff000000L | (bankId * wordSize)
-    val mask = 0x00ffffffL ^ ((numLsuLanes - 1) * wordSize)
+    val mask = 0x00001fffL ^ ((numLsuLanes - 1) * wordSize)
    LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = wordSize))
  }
  smemBanks.foreach(_.node := clbus.outwardNode)

-  println(s"===== Cluster: nTotalTiles = ${nTotalTiles}")
-  println(s"===== Cluster: nLeafTiles = ${nLeafTiles}")
+  // HACK: This works around the normal bus-connecting API by downcasting the
+  // tile and directly accessing a node inside it that is not exposed as a
+  // master in HierarchicalElementCrossingParamsLike.
+  val tile = leafTiles(0).asInstanceOf[RadianceTile]
+  val perSmemPortXbars = Seq.fill(tile.smemNodes.size) { LazyModule(new TLXbar) }

-  leafTiles.map { case (id, tile: RadianceTile) =>
-    println(s"======= RadianceCluster: connecting cluster ${id} to clbus")
-    clbus.inwardNode :=* tile.smemXbar.node
-    // clbus.inwardNode :=* tile.smemNodes(0)
+  // Tie corresponding smem ports from every tile into a single port using
+  // Xbars so that the number of ports going into the sharedmem does not scale
+  // with the number of tiles.
+  leafTiles.foreach { case (id, tile: RadianceTile) =>
+    (perSmemPortXbars zip tile.smemNodes).foreach {
+      case (xbar, node) => xbar.node := node
+    }
+    // tile.smemNodes.foreach (clbus.inwardNode := _)
+  }
+  perSmemPortXbars.foreach { clbus.inwardNode := _.node }
+
+  override lazy val module = new RadianceClusterModuleImp(this)
+}
+
+class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(outer) {
+  outer.leafTiles.foreach { case (id, tile: RadianceTile) =>
+    // println(s"======= RadianceCluster: tile.smemXbar.node.edge = ${tile.smemXbar.node.out.size}")
+    println(s"======= RadianceCluster: clbus inward edges = ${outer.clbus.inwardNode.inward.inputs.size}")
+    println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}")
+  }
+
+  outer.perSmemPortXbars(0).node.out(0)._2.slave.slaves(0).address.foreach { addrSet =>
+    println(s"====== perSmemPortXbars(0).slaves(0).addr: ${addrSet.toString()}")
+  }
+  outer.perSmemPortXbars(0).node.out(0)._2.master.masters(0).visibility.foreach { addrSet =>
+    println(s"====== perSmemPortXbars(0).masters(0).addr: ${addrSet.toString()}")
  }
 }
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -143,8 +143,8 @@ class RadianceTile private (
    "SIMTCoreKey not defined; make sure to use WithSimtConfig when using RadianceTile"
  )

-  // NOTE: when changing these, remember to change +define+NUM_THREADS/WARPS in
-  // EXTRA_SIM_PREPROC_DEFINES as well!
+  // NOTE: when changing these, remember to change +define+NUM_CORES/THREADS/WARPS in
+  // radiance.mk as well!
  val numWarps = p(SIMTCoreKey) match {
    case Some(simtParam) => simtParam.nWarps
    case None            => 4
@@ -303,9 +303,6 @@ class RadianceTile private (
  // Conditionally instantiate L1 cache
  val (icacheNode, dcacheNode): (TLNode, TLNode) = p(VortexL1Key) match {
    case Some(vortexL1Config) => {
-      println(
-        s"============ Using Vortex L1 cache ================="
-      )
      // require(
      //   p(CoalescerKey).isDefined,
      //   "Vortex L1 configuration currently only works when coalescer is also enabled."
@@ -328,23 +325,6 @@ class RadianceTile private (
    }
  }

-  // Instantiate sharedmem banks
-  //
-  // Instantiate the same number of banks as there are lanes.
-  // TODO: parametrize
-  // val smemBanks = Seq.tabulate(numLsuLanes) { bankId =>
-  //   // Banked-by-word (4 bytes)
-  //   // base for bank 1: ff...000000|01|00
-  //   // mask for bank 1; 00...111111|00|11
-  //   val base = 0xff000000L | (bankId * 4 /*wordSize*/ )
-  //   val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ )
-  //   LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = 4 /*wordSize*/ ))
-  // }
-  // smem lanes-to-banks crossbar
-  val smemXbar = LazyModule(new TLXbar)
-  smemNodes.foreach(smemXbar.node := _)
-  // smemBanks.foreach(_.node := smemXbar.node)
-
  val base = p(GPUMemory()) match {
    case Some(GPUMemParams(baseAddr, _)) => baseAddr
    case _ => BigInt(0)
@@ -361,32 +341,32 @@ class RadianceTile private (

  // ROCC
  // TODO: parametrize
-  val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy(
-    has_training_convs = false,
-    has_max_pool = false,
-    use_tl_ext_mem = true,
-    tl_ext_mem_base = x"ff000000",
-    sp_singleported = false,
-    spad_read_delay = 8,
-    use_shared_ext_mem = true,
-    acc_sub_banks = 1,
-    has_normalizations = false,
-    sp_capacity = CapacityInKilobytes(16),
-    acc_capacity = CapacityInKilobytes(8),
-  )))
-  val roccs: Seq[LazyRoCC] = Seq(gemmini)
-  tlMasterXbar.node :=* AddressOrNode(base) :=* gemmini.atlNode
-  tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode
+  // val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy(
+  //   has_training_convs = false,
+  //   has_max_pool = false,
+  //   use_tl_ext_mem = true,
+  //   tl_ext_mem_base = x"ff000000",
+  //   sp_singleported = false,
+  //   spad_read_delay = 8,
+  //   use_shared_ext_mem = true,
+  //   acc_sub_banks = 1,
+  //   has_normalizations = false,
+  //   sp_capacity = CapacityInKilobytes(16),
+  //   acc_capacity = CapacityInKilobytes(8),
+  // )))
+  // val roccs: Seq[LazyRoCC] = Seq(gemmini)
+  // tlMasterXbar.node :=* AddressOrNode(base) :=* gemmini.atlNode
+  // tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode

  // MMIO
-  gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node
+  // gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node
  // sharedmem access
  //
  // FIXME: gemmini spad has 16B data width; core smem interface has 4B. Need
  // to consolidate by either coalescing, or changing gemmini spad to
  // strided-by-word
-  gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node
-  TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node
+  // gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node
+  // TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node

  /* below are copied from rocket */

@@ -700,33 +680,33 @@ class RadianceTileModuleImp(outer: RadianceTile)
  // TODO: generalize for useVxCache
  if (!outer.radianceParams.useVxCache) {}

-  // RoCC
-  if (outer.roccs.size > 0) {
-    val (respArb, cmdRouter) = {
-      val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size))
-      val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p))
-      outer.roccs.zipWithIndex.foreach { case (rocc, i) =>
-        // ptwPorts ++= rocc.module.io.ptw
-        rocc.module.io.ptw <> DontCare
-        rocc.module.io.mem <> DontCare
-        rocc.module.io.cmd <> cmdRouter.io.out(i)
-        respArb.io.in(i) <> Queue(rocc.module.io.resp)
-      }
-      // Create this FPU just for RoCC
-      // val nFPUPorts = outer.roccs.filter(_.usesFPU).size
-      val fp_rocc_ios = outer.roccs.map(_.module.io)
-      fp_rocc_ios.map { io =>
-        io.fpu_req.ready := false.B
-        io.fpu_resp.valid := false.B
-        io.fpu_resp.bits := DontCare
-      }
-      (respArb, cmdRouter)
-    }
+  // // RoCC
+  // if (outer.roccs.size > 0) {
+  //   val (respArb, cmdRouter) = {
+  //     val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size))
+  //     val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p))
+  //     outer.roccs.zipWithIndex.foreach { case (rocc, i) =>
+  //       // ptwPorts ++= rocc.module.io.ptw
+  //       rocc.module.io.ptw <> DontCare
+  //       rocc.module.io.mem <> DontCare
+  //       rocc.module.io.cmd <> cmdRouter.io.out(i)
+  //       respArb.io.in(i) <> Queue(rocc.module.io.resp)
+  //     }
+  //     // Create this FPU just for RoCC
+  //     // val nFPUPorts = outer.roccs.filter(_.usesFPU).size
+  //     val fp_rocc_ios = outer.roccs.map(_.module.io)
+  //     fp_rocc_ios.map { io =>
+  //       io.fpu_req.ready := false.B
+  //       io.fpu_resp.valid := false.B
+  //       io.fpu_resp.bits := DontCare
+  //     }
+  //     (respArb, cmdRouter)
+  //   }

-    cmdRouter.io.in <> DontCare
-    outer.roccs.foreach(_.module.io.exception := DontCare)
-    respArb.io.out <> DontCare
-  }
+  //   cmdRouter.io.in <> DontCare
+  //   outer.roccs.foreach(_.module.io.exception := DontCare)
+  //   respArb.io.out <> DontCare
+  // }
 }

 // Some @copypaste from CoalescerSourceGen.