From b5074f55176d8a583c62d588f600e309f3ccc1b4 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 17 Mar 2024 14:12:10 -0700 Subject: [PATCH] Connect core gbar signals to cluster via Diplomacy --- .../scala/radiance/tile/RadianceCluster.scala | 71 ++++++++++++++++++- .../scala/radiance/tile/RadianceTile.scala | 38 +++++++++- src/main/scala/radiance/tile/VortexCore.scala | 30 ++++---- 3 files changed, 123 insertions(+), 16 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index b0c175e..2d67a24 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -4,12 +4,13 @@ package radiance.tile import chisel3._ +import chisel3.experimental.SourceInfo import chisel3.util._ import org.chipsalliance.cde.config.{Field, Parameters} import freechips.rocketchip.subsystem._ import freechips.rocketchip.tilelink._ -import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, SimpleDevice, ClockCrossingType} +import freechips.rocketchip.diplomacy._ import freechips.rocketchip.regmapper.RegField import freechips.rocketchip.prci.ClockSinkParameters @@ -50,6 +51,11 @@ class RadianceCluster ( } smemBanks.foreach(_.node := clbus.outwardNode) + val numCores = leafTiles.size + + // Diplomacy sink nodes for cluster-wide barrier sync signal + val barrierSlaveNode = BarrierSlaveNode(numCores) + // HACK: This is a workaround of the CanAttachTile bus connecting API that // works by downcasting tile and directly accessing the node inside that is // not exposed as a master in HierarchicalElementCrossingParamsLike. @@ -63,11 +69,12 @@ class RadianceCluster ( // (perSmemPortXbars zip tile.smemNodes).foreach { // case (xbar, node) => xbar.node := node // } - tile.smemNodes.foreach (clbus.inwardNode := _) + tile.smemNodes.foreach(clbus.inwardNode := _) + barrierSlaveNode := tile.barrierMasterNode } // perSmemPortXbars.foreach { clbus.inwardNode := _.node } - // Memory-mapped register for barrier synchronization + // Memory-mapped register for barrier sync val regDevice = new SimpleDevice("radiance-cluster-barrier-reg", Seq(s"radiance-cluster-barrier-reg${clusterId}")) val regNode = TLRegisterNode( @@ -77,6 +84,10 @@ class RadianceCluster ( concurrency = 1) regNode := clbus.outwardNode + nodes.foreach({ node => + println(s"======= RadianceCluster node.name: ${node.name}") + }) + override lazy val module = new RadianceClusterModuleImp(this) } @@ -87,6 +98,16 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}") } + outer.barrierSlaveNode.in.foreach { case (b, e) => + b.req.ready := true.B // barrier module is always ready + b.resp.valid := 0.U + b.resp.bits.barrierId := 0.U + } + + auto.elements.foreach({case (name, _) => + println(s"======= RadianceCluster.elements.name: ${name}") + }) + val numCores = outer.leafTiles.size val numBarriers = 4 // FIXME: hardcoded val allSyncedRegs = Seq.fill(numBarriers)(Wire(UInt(32.W))) @@ -116,4 +137,48 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( 0x34 -> Seq(RegField(32, perCoreSyncedRegs(3)(0))), 0x38 -> Seq(RegField(32, perCoreSyncedRegs(3)(1))), ) + + println(s"======== barrierSlaveNode: ${outer.barrierSlaveNode.in(0)._2.barrierIdBits}") } + +case class EmptyParams() + +case class BarrierParams( + barrierIdBits: Int, + numCoreBits: Int +) + +class BarrierRequestBits( + param: BarrierParams +) extends Bundle { + val barrierId = UInt(param.barrierIdBits.W) + val sizeMinusOne = UInt(param.numCoreBits.W) + val coreId = UInt(param.numCoreBits.W) +} + +class BarrierResponseBits( + param: BarrierParams +) extends Bundle { + val barrierId = UInt(param.barrierIdBits.W) +} + +class BarrierBundle(param: BarrierParams) extends Bundle { + val req = Decoupled(new BarrierRequestBits(param)) + val resp = Flipped(Decoupled(new BarrierResponseBits(param))) +} + +// FIXME Separate BarrierEdgeParams from BarrierParams +object BarrierNodeImp extends SimpleNodeImp[BarrierParams, EmptyParams, BarrierParams, BarrierBundle] { + def edge(pd: BarrierParams, pu: EmptyParams, p: Parameters, sourceInfo: SourceInfo) = { + // barrier parameters flow strictly downward from the master node + pd + } + def bundle(e: BarrierParams) = new BarrierBundle(e) + // FIXME render + def render(e: BarrierParams) = RenderedEdge(colour = "ffffff", label = "X") +} + +case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: ValName) + extends SourceNode(BarrierNodeImp)(Seq(srcParams)) +case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName) + extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams())) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 9f8424b..c9fe609 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -326,6 +326,11 @@ class RadianceTile private ( } } + // Barrier synchronization node + // FIXME: hardcoded params + val barrierParams = BarrierParams(barrierIdBits = 2, numCoreBits = 1) + val barrierMasterNode = BarrierMasterNode(barrierParams) + val base = p(GPUMemory()) match { case Some(GPUMemParams(baseAddr, _)) => baseAddr case _ => BigInt(0) @@ -339,7 +344,6 @@ class RadianceTile private ( tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode } - // ROCC // TODO: parametrize // val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy( @@ -451,6 +455,10 @@ class RadianceTileModuleImp(outer: RadianceTile) extends BaseTileModuleImp(outer) { Annotated.params(this, outer.radianceParams) + auto.elements.foreach({case (name, _) => + println(s"======= RadianceTile.elements.name: ${name}") + }) + val core = Module(new Vortex(outer)(outer.p)) core.io.clock := clock @@ -686,6 +694,17 @@ class RadianceTileModuleImp(outer: RadianceTile) } } + def connectBarrier = { + require(outer.barrierMasterNode.out.length == 1) + // FIXME: bits not flattened + outer.barrierMasterNode.out(0)._1.req.valid := core.io.gbar_req_valid + outer.barrierMasterNode.out(0)._1.req.bits.barrierId := core.io.gbar_req_id + outer.barrierMasterNode.out(0)._1.req.bits.coreId := core.io.gbar_req_core_id + core.io.gbar_req_ready := outer.barrierMasterNode.out(0)._1.req.ready + core.io.gbar_rsp_valid := outer.barrierMasterNode.out(0)._1.resp.valid + core.io.gbar_rsp_id := outer.barrierMasterNode.out(0)._1.resp.bits.barrierId + } + def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]], respBundles: Seq[DecoupledIO[VortexBundleD]], desc: String) = { @@ -721,6 +740,7 @@ class RadianceTileModuleImp(outer: RadianceTile) connectImem connectDmem connectSmem + connectBarrier } // TODO: generalize for useVxCache @@ -755,6 +775,22 @@ class RadianceTileModuleImp(outer: RadianceTile) // } } +class ClusterSynchronizer( + barrierIdWidth: Int, + numCoreWidth: Int, +) extends Module { + val io = IO(new Bundle { + val req = Flipped(Decoupled(new Bundle { + val barrierId = UInt(barrierIdWidth.W) + val sizeMinusOne = UInt(numCoreWidth.W) + val coreId = UInt(numCoreWidth.W) + })) + val resp = Decoupled(new Bundle { + val barrierId = UInt(barrierIdWidth.W) + }) + }) +} + // Some @copypaste from CoalescerSourceGen. class VortexTLAdapter( newSourceWidth: Int, diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index be4c956..7a37c24 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -41,18 +41,11 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val interrupts = Input(new freechips.rocketchip.rocket.CoreInterrupts(false/*hasBeu*/)) // conditionally instantiate ports depending on whether we want to use VX_cache or not + // TODO: flatten this like dmem and smem val imem = if (!tile.radianceParams.useVxCache) Some(Vec(1, new Bundle { val a = Decoupled(new VortexBundleA(tagWidth = tile.imemTagWidth, dataWidth = 32)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.imemTagWidth, dataWidth = 32))) })) else None - val dmem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle { - // val a = Decoupled(new VortexBundleA(tagWidth = tile.dmemTagWidth, dataWidth = 32)) - // val d = Flipped(Decoupled(new VortexBundleD(tagWidth = dmemTagWidth, dataWidth = 32))) - })) else None - val smem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle { - // val a = Decoupled(new VortexBundleA(tagWidth = tile.smemTagWidth, dataWidth = 32)) - // val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.smemTagWidth, dataWidth = 32))) - })) else None val mem = if (tile.radianceParams.useVxCache) Some(new Bundle { val a = Decoupled(new VortexBundleA(tagWidth = 15, dataWidth = 128)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = 15, dataWidth = 128))) @@ -96,6 +89,18 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val smem_d_bits_data = Input(UInt((tile.numLsuLanes * 32).W)) val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W)) + // FIXME: hardcoded + val numCoresPerCluster = 2 + val NB_WIDTH = 2 + val NC_WIDTH = 1 + val gbar_req_valid = Output(UInt((numCoresPerCluster * 1).W)) + val gbar_req_id = Output(UInt((numCoresPerCluster * NB_WIDTH).W)) + val gbar_req_size_m1 = Output(UInt((numCoresPerCluster * NC_WIDTH).W)) + val gbar_req_core_id = Output(UInt((numCoresPerCluster * NC_WIDTH).W)) + val gbar_req_ready = Input(UInt((numCoresPerCluster * 1).W)) + val gbar_rsp_valid = Input(UInt((numCoresPerCluster * 1).W)) + val gbar_rsp_id = Input(UInt((numCoresPerCluster * NB_WIDTH).W)) + // val fpu = Flipped(new FPUCoreIO()) //val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs)) //val trace = Output(new TraceBundle) @@ -112,6 +117,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // see VX_csr_data that implements the read logic for CSR_MHARTID/GWID. Map( "CORE_ID" -> tile.tileParams.tileId, + "CORES_PER_CLUSTER" -> 2, // FIXME: hardcoded // TODO: can we get this as a parameter? "BOOTROM_HANG100" -> 0x10100, "NUM_THREADS" -> tile.numLsuLanes @@ -194,10 +200,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_tags.sv") // addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_wrap.sv") - // gbar is only used in the socket/cluster hierarchy - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv") - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") // mem_arb is used in VX_socket or VX_cache_cluster // addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_arb.sv") addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_bus_if.sv") @@ -220,6 +222,10 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // used when PERF_ENABLE is defined addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_perf_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_pipeline_perf_if.sv") + // used when GBAR_ENABLE is defined + addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv") + // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") + // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv")