From b3a92071364b8948f65c076dd8f6b6667eb74434 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 4 Mar 2024 13:45:54 -0800 Subject: [PATCH] Tie tile smem ports together using Xbars; comment out Gemmini spad. TODO: pull Gemmini out to the cluster as well --- .../scala/radiance/tile/RadianceCluster.scala | 46 +++++-- .../scala/radiance/tile/RadianceTile.scala | 114 ++++++++---------- 2 files changed, 83 insertions(+), 77 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 0931e37..69499ec 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -34,26 +34,52 @@ class RadianceCluster ( clbus.clockGroupNode := allClockGroupsNode - val numLsuLanes = 4 + // Instantiate cluster-local shared memory scratchpad + // + // Instantiate the same number of banks as there are lanes. + val numLsuLanes = 4 // FIXME: hardcoded val wordSize = 4 val smemBanks = Seq.tabulate(numLsuLanes) { bankId => // Banked-by-word (4 bytes) // base for bank 1: ff...000000|01|00 // mask for bank 1; 00...111111|00|11 - // val base = 0xff000000L | (bankId * 4 /*wordSize*/ ) - // val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ ) val base = 0xff000000L | (bankId * wordSize) - val mask = 0x00ffffffL ^ ((numLsuLanes - 1) * wordSize) + val mask = 0x00001fffL ^ ((numLsuLanes - 1) * wordSize) LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = wordSize)) } smemBanks.foreach(_.node := clbus.outwardNode) - println(s"===== Cluster: nTotalTiles = ${nTotalTiles}") - println(s"===== Cluster: nLeafTiles = ${nLeafTiles}") + // HACK: This works around the normal bus-connecting API by downcasting the + // tile and directly accessing the node inside it that is not exposed as a + // master in HierarchicalElementCrossingParamsLike. 
+ val tile = leafTiles(0).asInstanceOf[RadianceTile] + val perSmemPortXbars = Seq.fill(tile.smemNodes.size) { LazyModule(new TLXbar) } - leafTiles.map { case (id, tile: RadianceTile) => - println(s"======= RadianceCluster: connecting cluster ${id} to clbus") - clbus.inwardNode :=* tile.smemXbar.node - // clbus.inwardNode :=* tile.smemNodes(0) + // Tie corresponding smem ports from every tile into a single port using + // Xbars so that the number of ports going into the sharedmem does not scale + // with the number of tiles. + leafTiles.foreach { case (id, tile: RadianceTile) => + (perSmemPortXbars zip tile.smemNodes).foreach { + case (xbar, node) => xbar.node := node + } + // tile.smemNodes.foreach (clbus.inwardNode := _) + } + perSmemPortXbars.foreach { clbus.inwardNode := _.node } + + override lazy val module = new RadianceClusterModuleImp(this) +} + +class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(outer) { + outer.leafTiles.foreach { case (id, tile: RadianceTile) => + // println(s"======= RadianceCluster: tile.smemXbar.node.edge = ${tile.smemXbar.node.out.size}") + println(s"======= RadianceCluster: clbus inward edges = ${outer.clbus.inwardNode.inward.inputs.size}") + println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}") + } + + outer.perSmemPortXbars(0).node.out(0)._2.slave.slaves(0).address.foreach { addrSet => + println(s"====== perSmemPortXbars(0).slaves(0).addr: ${addrSet.toString()}") + } + outer.perSmemPortXbars(0).node.out(0)._2.master.masters(0).visibility.foreach { addrSet => + println(s"====== perSmemPortXbars(0).masters(0).addr: ${addrSet.toString()}") } } diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 4b6a9be..570efb8 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -143,8 +143,8 @@ class RadianceTile private ( "SIMTCoreKey not defined; make sure to use 
WithSimtConfig when using RadianceTile" ) - // NOTE: when changing these, remember to change +define+NUM_THREADS/WARPS in - // EXTRA_SIM_PREPROC_DEFINES as well! + // NOTE: when changing these, remember to change +define+NUM_CORES/THREADS/WARPS in + // radiance.mk as well! val numWarps = p(SIMTCoreKey) match { case Some(simtParam) => simtParam.nWarps case None => 4 @@ -303,9 +303,6 @@ class RadianceTile private ( // Conditionally instantiate L1 cache val (icacheNode, dcacheNode): (TLNode, TLNode) = p(VortexL1Key) match { case Some(vortexL1Config) => { - println( - s"============ Using Vortex L1 cache =================" - ) // require( // p(CoalescerKey).isDefined, // "Vortex L1 configuration currently only works when coalescer is also enabled." @@ -328,23 +325,6 @@ class RadianceTile private ( } } - // Instantiate sharedmem banks - // - // Instantiate the same number of banks as there are lanes. - // TODO: parametrize - // val smemBanks = Seq.tabulate(numLsuLanes) { bankId => - // // Banked-by-word (4 bytes) - // // base for bank 1: ff...000000|01|00 - // // mask for bank 1; 00...111111|00|11 - // val base = 0xff000000L | (bankId * 4 /*wordSize*/ ) - // val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ ) - // LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = 4 /*wordSize*/ )) - // } - // smem lanes-to-banks crossbar - val smemXbar = LazyModule(new TLXbar) - smemNodes.foreach(smemXbar.node := _) - // smemBanks.foreach(_.node := smemXbar.node) - val base = p(GPUMemory()) match { case Some(GPUMemParams(baseAddr, _)) => baseAddr case _ => BigInt(0) @@ -361,32 +341,32 @@ class RadianceTile private ( // ROCC // TODO: parametrize - val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy( - has_training_convs = false, - has_max_pool = false, - use_tl_ext_mem = true, - tl_ext_mem_base = x"ff000000", - sp_singleported = false, - spad_read_delay = 8, - use_shared_ext_mem = true, - acc_sub_banks = 1, - has_normalizations = false, - 
sp_capacity = CapacityInKilobytes(16), - acc_capacity = CapacityInKilobytes(8), - ))) - val roccs: Seq[LazyRoCC] = Seq(gemmini) - tlMasterXbar.node :=* AddressOrNode(base) :=* gemmini.atlNode - tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode + // val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy( + // has_training_convs = false, + // has_max_pool = false, + // use_tl_ext_mem = true, + // tl_ext_mem_base = x"ff000000", + // sp_singleported = false, + // spad_read_delay = 8, + // use_shared_ext_mem = true, + // acc_sub_banks = 1, + // has_normalizations = false, + // sp_capacity = CapacityInKilobytes(16), + // acc_capacity = CapacityInKilobytes(8), + // ))) + // val roccs: Seq[LazyRoCC] = Seq(gemmini) + // tlMasterXbar.node :=* AddressOrNode(base) :=* gemmini.atlNode + // tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode // MMIO - gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node + // gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node // sharedmem access // // FIXME: gemmini spad has 16B data width; core smem interface has 4B. 
Need // to consolidate by either coalescing, or changing gemmini spad to // strided-by-word - gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node - TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node + // gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node + // TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node /* below are copied from rocket */ @@ -700,33 +680,33 @@ class RadianceTileModuleImp(outer: RadianceTile) // TODO: generalize for useVxCache if (!outer.radianceParams.useVxCache) {} - // RoCC - if (outer.roccs.size > 0) { - val (respArb, cmdRouter) = { - val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size)) - val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p)) - outer.roccs.zipWithIndex.foreach { case (rocc, i) => - // ptwPorts ++= rocc.module.io.ptw - rocc.module.io.ptw <> DontCare - rocc.module.io.mem <> DontCare - rocc.module.io.cmd <> cmdRouter.io.out(i) - respArb.io.in(i) <> Queue(rocc.module.io.resp) - } - // Create this FPU just for RoCC - // val nFPUPorts = outer.roccs.filter(_.usesFPU).size - val fp_rocc_ios = outer.roccs.map(_.module.io) - fp_rocc_ios.map { io => - io.fpu_req.ready := false.B - io.fpu_resp.valid := false.B - io.fpu_resp.bits := DontCare - } - (respArb, cmdRouter) - } + // // RoCC + // if (outer.roccs.size > 0) { + // val (respArb, cmdRouter) = { + // val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size)) + // val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p)) + // outer.roccs.zipWithIndex.foreach { case (rocc, i) => + // // ptwPorts ++= rocc.module.io.ptw + // rocc.module.io.ptw <> DontCare + // rocc.module.io.mem <> DontCare + // rocc.module.io.cmd <> cmdRouter.io.out(i) + // respArb.io.in(i) <> Queue(rocc.module.io.resp) + // } + // // Create this FPU just for RoCC + // // val nFPUPorts = outer.roccs.filter(_.usesFPU).size + // val 
fp_rocc_ios = outer.roccs.map(_.module.io) + // fp_rocc_ios.map { io => + // io.fpu_req.ready := false.B + // io.fpu_resp.valid := false.B + // io.fpu_resp.bits := DontCare + // } + // (respArb, cmdRouter) + // } - cmdRouter.io.in <> DontCare - outer.roccs.foreach(_.module.io.exception := DontCare) - respArb.io.out <> DontCare - } + // cmdRouter.io.in <> DontCare + // outer.roccs.foreach(_.module.io.exception := DontCare) + // respArb.io.out <> DontCare + // } } // Some @copypaste from CoalescerSourceGen.