From 55d00d25bbdb417a69d2a476828ca5ed907e5145 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 18:51:33 -0800 Subject: [PATCH 01/23] Parametrize numWarps / numCoreLanes / numMemLanes --- .../radiance/memory/CanHaveMemtraceCore.scala | 4 +-- .../scala/radiance/memory/Coalescing.scala | 15 +++++++---- src/main/scala/radiance/memory/UnitTest.scala | 10 +++---- .../scala/radiance/subsystem/Configs.scala | 11 +++++--- src/main/scala/radiance/tile/FuzzerTile.scala | 2 +- .../scala/radiance/tile/RadianceTile.scala | 27 +++++++++++++------ 6 files changed, 44 insertions(+), 25 deletions(-) diff --git a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala index ae803d5..701a274 100644 --- a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala +++ b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala @@ -17,10 +17,10 @@ trait CanHaveMemtraceCore { this: BaseSubsystem => // Safe to use get as WithMemtraceCore requires WithNLanes to be defined val simtParam = p(SIMTCoreKey).get val config = DefaultCoalescerConfig.copy( - numLanes = simtParam.nLanes, + numLanes = simtParam.nMemLanes, numOldSrcIds = simtParam.nSrcIds ) - val numLanes = simtParam.nLanes + val numLanes = simtParam.nMemLanes val filename = param.tracefilename // Need to explicitly generate clock domain; see rocket-chip 8881ccd diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index cc5c40e..338c36c 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -12,8 +12,13 @@ import freechips.rocketchip.tilelink._ // TODO: find better place for these -// Note: numNewSrcId is not a part of CoreParam, because the SIMT core should be agnostic to how inflight coalesced request can be genertated -case class SIMTCoreParams(nLanes: Int = 4, nSrcIds: Int = 8) +case class SIMTCoreParams( + nWarps: Int = 4, // # of warps in 
the core + nCoreLanes: Int = 4, // # of SIMT threads in the core + nMemLanes: Int = 4, // # of memory lanes in the memory interface to the + // cache; relates to the LSU lanes + nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes +) case class MemtraceCoreParams( tracefilename: String = "undefined", traceHasSource: Boolean = false @@ -2325,7 +2330,7 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig) // A dummy harness around the coalescer for use in VLSI flow. // Should not instantiate any memtrace modules. class DummyCoalescer(implicit p: Parameters) extends LazyModule { - val numLanes = p(SIMTCoreKey).get.nLanes + val numLanes = p(SIMTCoreKey).get.nMemLanes val config = DefaultCoalescerConfig.copy(numLanes = numLanes) val driver = LazyModule(new DummyDriver(config)) @@ -2362,7 +2367,7 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) // tracedriver --> coalescer --> tracelogger --> tlram class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { - val numLanes = p(SIMTCoreKey).get.nLanes + val numLanes = p(SIMTCoreKey).get.nMemLanes val config = DefaultCoalescerConfig.copy(numLanes = numLanes) val driver = LazyModule(new MemTraceDriver(config, filename)) @@ -2454,7 +2459,7 @@ class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit // tracedriver --> coalescer --> tlram class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { - val numLanes = p(SIMTCoreKey).get.nLanes + val numLanes = p(SIMTCoreKey).get.nMemLanes val config = DefaultCoalescerConfig.copy(numLanes = numLanes) val filename = "vecadd.core1.thread4.trace" diff --git a/src/main/scala/radiance/memory/UnitTest.scala b/src/main/scala/radiance/memory/UnitTest.scala index 24ea69d..c070ef4 100644 --- a/src/main/scala/radiance/memory/UnitTest.scala +++ b/src/main/scala/radiance/memory/UnitTest.scala @@ -8,7 +8,7 @@ import freechips.rocketchip.subsystem.{BaseSubsystemConfig} import 
freechips.rocketchip.devices.tilelink._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.util._ -import radiance.subsystem.WithSimtLanes +import radiance.subsystem.WithSimtConfig import freechips.rocketchip.unittest._ //import rocket.VortexFatBankTest @@ -27,7 +27,7 @@ class WithCoalescingUnitTests extends Config((site, _, _) => { // Module(new TLRAMCoalescerLoggerTest(filename="sfilter.core1.thread4.trace", timeout=timeout)), // Module(new TLRAMCoalescerLoggerTest(filename="nearn.core1.thread4.trace", timeout=50000000 * site(TestDurationMultiplier))), // Module(new TLRAMCoalescerLoggerTest(filename="psort.core1.thread4.trace", timeout=timeout)), - // Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.trace", timeout=timeout)(new WithSimtLanes(32))), + // Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.trace", timeout=timeout)(new WithSimtConfig(32))), // Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.lane4.trace", timeout=timeout)), ) } }) @@ -48,12 +48,12 @@ class WithCoalescingUnitSynthesisDummy(nLanes: Int) extends Config((site, _, _) implicit val p = q val timeout = 50000 * site(TestDurationMultiplier) Seq( - Module(new DummyCoalescerTest(timeout=timeout)(new WithSimtLanes(nLanes=4))), + Module(new DummyCoalescerTest(timeout=timeout)(new WithSimtConfig(nMemLanes=4))), ) } }) -class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtLanes(nLanes=4) ++ new BaseSubsystemConfig) -//class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtLanes(nLanes=4) ++ new BaseSubsystemConfig) +class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nMemLanes=4) ++ new BaseSubsystemConfig) +//class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new 
WithTestDuration(10) ++ new WithSimtConfig(nLanes=4) ++ new BaseSubsystemConfig) // Dummy configs of various sizes for synthesis class CoalescingSynthesisDummyLane4Config extends Config(new WithCoalescingUnitSynthesisDummy(4) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig) diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 77b0711..daaa9f3 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -66,10 +66,13 @@ class WithFuzzerCores( }) // `nSrcIds`: number of source IDs for dmem requests on each SIMT lane -class WithSimtLanes(nLanes: Int, nSrcIds: Int = 8) extends Config((site, _, up) => { +class WithSimtConfig(nWarps: Int = 4, nCoreLanes: Int = 4, nMemLanes: Int = 4, nSrcIds: Int = 8) +extends Config((site, _, up) => { case SIMTCoreKey => { Some(up(SIMTCoreKey, site).getOrElse(SIMTCoreParams()).copy( - nLanes = nLanes, + nWarps = nWarps, + nCoreLanes = nCoreLanes, + nMemLanes = nMemLanes, nSrcIds = nSrcIds )) } @@ -105,7 +108,7 @@ class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => { class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config((site, _, up) => { case CoalescerKey => { val (nLanes, numOldSrcIds) = up(SIMTCoreKey, site) match { - case Some(param) => (param.nLanes, param.nSrcIds) + case Some(param) => (param.nMemLanes, param.nSrcIds) case None => (1,1) } @@ -182,4 +185,4 @@ class WithExtGPUMem(address: BigInt = BigInt("0x100000000", 16), }) }) case class GPUMemParams(address: BigInt = BigInt("0x100000000", 16), size: BigInt = 0x80000000) -case class GPUMemory() extends Field[Option[GPUMemParams]](None) \ No newline at end of file +case class GPUMemory() extends Field[Option[GPUMemParams]](None) diff --git a/src/main/scala/radiance/tile/FuzzerTile.scala b/src/main/scala/radiance/tile/FuzzerTile.scala index e76342e..c139744 100644 --- 
a/src/main/scala/radiance/tile/FuzzerTile.scala +++ b/src/main/scala/radiance/tile/FuzzerTile.scala @@ -60,7 +60,7 @@ class FuzzerTile private ( // val statusNode = BundleBridgeSource(() => new GroundTestStatus) val (numLanes, numSrcIds) = p(SIMTCoreKey) match { - case Some(param) => (param.nLanes, param.nSrcIds) + case Some(param) => (param.nMemLanes, param.nSrcIds) case None => { require(false, "fuzzer requires SIMTCoreKey to be defined") (0, 0) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index f069aaf..4b6a9be 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -140,10 +140,21 @@ class RadianceTile private ( require( p(SIMTCoreKey).isDefined, - "SIMTCoreKey not defined; make sure to use WithSimtLanes when using RadianceTile" + "SIMTCoreKey not defined; make sure to use WithSimtConfig when using RadianceTile" ) - val numLanes = p(SIMTCoreKey) match { - case Some(simtParam) => simtParam.nLanes + + // NOTE: when changing these, remember to change +define+NUM_THREADS/WARPS in + // EXTRA_SIM_PREPROC_DEFINES as well! + val numWarps = p(SIMTCoreKey) match { + case Some(simtParam) => simtParam.nWarps + case None => 4 + } + val numCoreLanes = p(SIMTCoreKey) match { + case Some(simtParam) => simtParam.nCoreLanes + case None => 4 + } + val numLsuLanes = p(SIMTCoreKey) match { + case Some(simtParam) => simtParam.nMemLanes case None => 4 } @@ -170,13 +181,14 @@ class RadianceTile private ( val smemSourceWidth = 4 // FIXME: hardcoded - val numWarps = 4 // TODO: parametrize + // Replicates some of the logic of how Vortex determines the tag width of + // memory requests so that Chisel and Verilog are in agreement on bitwidths. 
+ // See VX_gpu_pkg.sv val NW_WIDTH = (if (numWarps == 1) 1 else log2Ceil(numWarps)) val UUID_WIDTH = 44 val imemTagWidth = UUID_WIDTH + NW_WIDTH - val numLsuLanes = 4 - // see VX_gpu_pkg.sv - val LSUQ_SIZE = 8 * (numLanes / numLsuLanes) + + val LSUQ_SIZE = 8 * (numCoreLanes / numLsuLanes) val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/ val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS // dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH @@ -764,7 +776,6 @@ class VortexTLAdapter( io.outReq.bits.corrupt := 0.U io.inReq.ready := io.outReq.ready // VortexBundleD <> TLBundleD - // Filtering out write requests is handled inside the wrapper Verilog io.inResp.valid := io.outResp.valid io.inResp.bits.opcode := io.outResp.bits.opcode io.inResp.bits.size := io.outResp.bits.size From c05897abfcf7f513c03dffbef4f8ebc0d69cb616 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 29 Feb 2024 17:37:54 -0800 Subject: [PATCH 02/23] Add rocket-chip cluster support in WithRadianceCores --- .../scala/radiance/subsystem/Configs.scala | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index daaa9f3..9bc8cfa 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -13,10 +13,12 @@ import radiance.memory._ class WithRadianceCores( n: Int, + location: HierarchicalLocation, + crossing: RocketCrossingParams, useVxCache: Boolean ) extends Config((site, _, up) => { - case TilesLocated(InSubsystem) => { - val prev = up(TilesLocated(InSubsystem), site) + case TilesLocated(`location`) => { + val prev = up(TilesLocated(`location`), site) val idOffset = prev.size val vortex = RadianceTileParams( core = VortexCoreParams(fpu = None), @@ -43,10 +45,19 @@ class WithRadianceCores( blockBytes = site(CacheBlockBytes)))) List.tabulate(n)(i => RadianceTileAttachParams( 
vortex.copy(tileId = i + idOffset), - RocketCrossingParams() + crossing )) ++ prev } -}) +}) { + def this(n: Int, location: HierarchicalLocation = InSubsystem, useVxCache: Boolean = false) = this(n, location, RocketCrossingParams( + master = HierarchicalElementMasterPortParams.locationDefault(location), + slave = HierarchicalElementSlavePortParams.locationDefault(location), + mmioBaseAddressPrefixWhere = location match { + case InSubsystem => CBUS + case InCluster(clusterId) => CCBUS(clusterId) + } + ), useVxCache) +} class WithFuzzerCores( n: Int, From 339f559b4357904b446fb2fa81278f13612b2717 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 29 Feb 2024 17:52:59 -0800 Subject: [PATCH 03/23] Update NUM_THREADS/WARPS in accordance to RadianceConfig --- radiance.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/radiance.mk b/radiance.mk index b2b61ed..1a37e22 100644 --- a/radiance.mk +++ b/radiance.mk @@ -16,7 +16,8 @@ EXTRA_SIM_PREPROC_DEFINES += \ +define+SV_DPI \ +define+GPR_RESET \ +define+LSU_DUP_DISABLE \ - +define+DBG_TRACE_CORE_PIPELINE_VCS + +define+DBG_TRACE_CORE_PIPELINE_VCS \ + +define+NUM_THREADS=8 +define+NUM_WARPS=8 # cargo handles building of Rust files all on its own, so make this a PHONY # target to run cargo unconditionally From 805651a11be20da512028592183d1de84e04160f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 2 Mar 2024 15:46:11 -0800 Subject: [PATCH 04/23] Add RadianceCluster --- .../scala/radiance/subsystem/Configs.scala | 19 ++++++ .../subsystem/RadianceSubsystem.scala | 7 +++ .../scala/radiance/tile/RadianceCluster.scala | 59 +++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 src/main/scala/radiance/tile/RadianceCluster.scala diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 9bc8cfa..95e6318 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -76,6 +76,25 
@@ class WithFuzzerCores( } }) +class WithRadianceCluster( + clusterId: Int, + location: HierarchicalLocation = InSubsystem, + crossing: RocketCrossingParams = RocketCrossingParams() // TODO make this not rocket +) extends Config((site, here, up) => { + case ClustersLocated(`location`) => up(ClustersLocated(location)) :+ RadianceClusterAttachParams( + RadianceClusterParams(clusterId = clusterId), + crossing) + case TLNetworkTopologyLocated(InCluster(`clusterId`)) => List( + ClusterBusTopologyParams( + clusterId = clusterId, + csbus = site(SystemBusKey), + ccbus = site(ControlBusKey).copy(errorDevice = None), + coherence = site(ClusterBankedCoherenceKey(clusterId)) + ) + ) + case PossibleTileLocations => up(PossibleTileLocations) :+ InCluster(clusterId) +}) + // `nSrcIds`: number of source IDs for dmem requests on each SIMT lane class WithSimtConfig(nWarps: Int = 4, nCoreLanes: Int = 4, nMemLanes: Int = 4, nSrcIds: Int = 8) extends Config((site, _, up) => { diff --git a/src/main/scala/radiance/subsystem/RadianceSubsystem.scala b/src/main/scala/radiance/subsystem/RadianceSubsystem.scala index f9fb0bf..8979fb5 100644 --- a/src/main/scala/radiance/subsystem/RadianceSubsystem.scala +++ b/src/main/scala/radiance/subsystem/RadianceSubsystem.scala @@ -9,3 +9,10 @@ case class RadianceTileAttachParams( tileParams: RadianceTileParams, crossingParams: RocketCrossingParams ) extends CanAttachTile { type TileType = RadianceTile } + +case class RadianceClusterAttachParams ( + clusterParams: RadianceClusterParams, + crossingParams: HierarchicalElementCrossingParamsLike +) extends CanAttachCluster { + type ClusterType = RadianceCluster +} diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala new file mode 100644 index 0000000..0931e37 --- /dev/null +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -0,0 +1,59 @@ +// See LICENSE.SiFive for license details. +// See LICENSE.Berkeley for license details. 
+ +package radiance.tile + +import chisel3._ +import chisel3.util._ + +import org.chipsalliance.cde.config.{Field, Parameters} +import freechips.rocketchip.subsystem._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, ClockCrossingType} +import freechips.rocketchip.prci.ClockSinkParameters + +case class RadianceClusterParams( + val clusterId: Int, + val clockSinkParams: ClockSinkParameters = ClockSinkParameters() +) extends InstantiableClusterParams[RadianceCluster] { + val baseName = "radiance_cluster" + val uniqueName = s"${baseName}_$clusterId" + def instantiate(crossing: HierarchicalElementCrossingParamsLike, lookup: LookupByClusterIdImpl)(implicit p: Parameters): RadianceCluster = { + new RadianceCluster(this, crossing.crossingType, lookup) + } +} + +class RadianceCluster ( + thisClusterParams: RadianceClusterParams, + crossing: ClockCrossingType, + lookup: LookupByClusterIdImpl +)(implicit p: Parameters) extends Cluster(thisClusterParams, crossing, lookup) { + // cluster-local bus, used for shared memory traffic that never leaves the + // confines of a cluster + val clbus = tlBusWrapperLocationMap(CLBUS(clusterId)) + + clbus.clockGroupNode := allClockGroupsNode + + val numLsuLanes = 4 + val wordSize = 4 + val smemBanks = Seq.tabulate(numLsuLanes) { bankId => + // Banked-by-word (4 bytes) + // base for bank 1: ff...000000|01|00 + // mask for bank 1; 00...111111|00|11 + // val base = 0xff000000L | (bankId * 4 /*wordSize*/ ) + // val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ ) + val base = 0xff000000L | (bankId * wordSize) + val mask = 0x00ffffffL ^ ((numLsuLanes - 1) * wordSize) + LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = wordSize)) + } + smemBanks.foreach(_.node := clbus.outwardNode) + + println(s"===== Cluster: nTotalTiles = ${nTotalTiles}") + println(s"===== Cluster: nLeafTiles = ${nLeafTiles}") + + leafTiles.map { case (id, tile: RadianceTile) => + println(s"======= 
RadianceCluster: connecting cluster ${id} to clbus") + clbus.inwardNode :=* tile.smemXbar.node + // clbus.inwardNode :=* tile.smemNodes(0) + } +} From b3a92071364b8948f65c076dd8f6b6667eb74434 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 4 Mar 2024 13:45:54 -0800 Subject: [PATCH 05/23] Tie tile smem ports together using Xbars; comment-out Gemmini spad TODO pull Gemmini out to the cluster as well --- .../scala/radiance/tile/RadianceCluster.scala | 46 +++++-- .../scala/radiance/tile/RadianceTile.scala | 114 ++++++++---------- 2 files changed, 83 insertions(+), 77 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 0931e37..69499ec 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -34,26 +34,52 @@ class RadianceCluster ( clbus.clockGroupNode := allClockGroupsNode - val numLsuLanes = 4 + // Instantiate cluster-local shared memory scratchpad + // + // Instantiate the same number of banks as there are lanes. 
+ val numLsuLanes = 4 // FIXME: hardcoded val wordSize = 4 val smemBanks = Seq.tabulate(numLsuLanes) { bankId => // Banked-by-word (4 bytes) // base for bank 1: ff...000000|01|00 // mask for bank 1; 00...111111|00|11 - // val base = 0xff000000L | (bankId * 4 /*wordSize*/ ) - // val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ ) val base = 0xff000000L | (bankId * wordSize) - val mask = 0x00ffffffL ^ ((numLsuLanes - 1) * wordSize) + val mask = 0x00001fffL ^ ((numLsuLanes - 1) * wordSize) LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = wordSize)) } smemBanks.foreach(_.node := clbus.outwardNode) - println(s"===== Cluster: nTotalTiles = ${nTotalTiles}") - println(s"===== Cluster: nLeafTiles = ${nLeafTiles}") + // HACK: This is a work around the normal bus connecting API by downcasting + // tile and directly accessing the node inside that is not exposed as a + // master in HierarchicalElementCrossingParamsLike. + val tile = leafTiles(0).asInstanceOf[RadianceTile] + val perSmemPortXbars = Seq.fill(tile.smemNodes.size) { LazyModule(new TLXbar) } - leafTiles.map { case (id, tile: RadianceTile) => - println(s"======= RadianceCluster: connecting cluster ${id} to clbus") - clbus.inwardNode :=* tile.smemXbar.node - // clbus.inwardNode :=* tile.smemNodes(0) + // Tie corresponding smem ports from every tile into a single port using + // Xbars so that the number of ports going into the sharedmem do not scale + // with the number of tiles. 
+ leafTiles.foreach { case (id, tile: RadianceTile) => + (perSmemPortXbars zip tile.smemNodes).foreach { + case (xbar, node) => xbar.node := node + } + // tile.smemNodes.foreach (clbus.inwardNode := _) + } + perSmemPortXbars.foreach { clbus.inwardNode := _.node } + + override lazy val module = new RadianceClusterModuleImp(this) +} + +class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(outer) { + outer.leafTiles.foreach { case (id, tile: RadianceTile) => + // println(s"======= RadianceCluster: tile.smemXbar.node.edge = ${tile.smemXbar.node.out.size}") + println(s"======= RadianceCluster: clbus inward edges = ${outer.clbus.inwardNode.inward.inputs.size}") + println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}") + } + + outer.perSmemPortXbars(0).node.out(0)._2.slave.slaves(0).address.foreach { addrSet => + println(s"====== perSmemPortXbars(0).slaves(0).addr: ${addrSet.toString()}") + } + outer.perSmemPortXbars(0).node.out(0)._2.master.masters(0).visibility.foreach { addrSet => + println(s"====== perSmemPortXbars(0).masters(0).addr: ${addrSet.toString()}") } } diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 4b6a9be..570efb8 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -143,8 +143,8 @@ class RadianceTile private ( "SIMTCoreKey not defined; make sure to use WithSimtConfig when using RadianceTile" ) - // NOTE: when changing these, remember to change +define+NUM_THREADS/WARPS in - // EXTRA_SIM_PREPROC_DEFINES as well! + // NOTE: when changing these, remember to change +define+NUM_CORES/THREADS/WARPS in + // radiance.mk as well! 
val numWarps = p(SIMTCoreKey) match { case Some(simtParam) => simtParam.nWarps case None => 4 @@ -303,9 +303,6 @@ class RadianceTile private ( // Conditionally instantiate L1 cache val (icacheNode, dcacheNode): (TLNode, TLNode) = p(VortexL1Key) match { case Some(vortexL1Config) => { - println( - s"============ Using Vortex L1 cache =================" - ) // require( // p(CoalescerKey).isDefined, // "Vortex L1 configuration currently only works when coalescer is also enabled." @@ -328,23 +325,6 @@ class RadianceTile private ( } } - // Instantiate sharedmem banks - // - // Instantiate the same number of banks as there are lanes. - // TODO: parametrize - // val smemBanks = Seq.tabulate(numLsuLanes) { bankId => - // // Banked-by-word (4 bytes) - // // base for bank 1: ff...000000|01|00 - // // mask for bank 1; 00...111111|00|11 - // val base = 0xff000000L | (bankId * 4 /*wordSize*/ ) - // val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ ) - // LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = 4 /*wordSize*/ )) - // } - // smem lanes-to-banks crossbar - val smemXbar = LazyModule(new TLXbar) - smemNodes.foreach(smemXbar.node := _) - // smemBanks.foreach(_.node := smemXbar.node) - val base = p(GPUMemory()) match { case Some(GPUMemParams(baseAddr, _)) => baseAddr case _ => BigInt(0) @@ -361,32 +341,32 @@ class RadianceTile private ( // ROCC // TODO: parametrize - val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy( - has_training_convs = false, - has_max_pool = false, - use_tl_ext_mem = true, - tl_ext_mem_base = x"ff000000", - sp_singleported = false, - spad_read_delay = 8, - use_shared_ext_mem = true, - acc_sub_banks = 1, - has_normalizations = false, - sp_capacity = CapacityInKilobytes(16), - acc_capacity = CapacityInKilobytes(8), - ))) - val roccs: Seq[LazyRoCC] = Seq(gemmini) - tlMasterXbar.node :=* AddressOrNode(base) :=* gemmini.atlNode - tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode + // val gemmini = 
LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy( + // has_training_convs = false, + // has_max_pool = false, + // use_tl_ext_mem = true, + // tl_ext_mem_base = x"ff000000", + // sp_singleported = false, + // spad_read_delay = 8, + // use_shared_ext_mem = true, + // acc_sub_banks = 1, + // has_normalizations = false, + // sp_capacity = CapacityInKilobytes(16), + // acc_capacity = CapacityInKilobytes(8), + // ))) + // val roccs: Seq[LazyRoCC] = Seq(gemmini) + // tlMasterXbar.node :=* AddressOrNode(base) :=* gemmini.atlNode + // tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode // MMIO - gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node + // gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node // sharedmem access // // FIXME: gemmini spad has 16B data width; core smem interface has 4B. Need // to consolidate by either coalescing, or changing gemmini spad to // strided-by-word - gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node - TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node + // gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node + // TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node /* below are copied from rocket */ @@ -700,33 +680,33 @@ class RadianceTileModuleImp(outer: RadianceTile) // TODO: generalize for useVxCache if (!outer.radianceParams.useVxCache) {} - // RoCC - if (outer.roccs.size > 0) { - val (respArb, cmdRouter) = { - val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size)) - val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p)) - outer.roccs.zipWithIndex.foreach { case (rocc, i) => - // ptwPorts ++= rocc.module.io.ptw - rocc.module.io.ptw <> DontCare - rocc.module.io.mem <> DontCare - rocc.module.io.cmd <> cmdRouter.io.out(i) - respArb.io.in(i) <> Queue(rocc.module.io.resp) - } - // Create this FPU just for RoCC - // val nFPUPorts = outer.roccs.filter(_.usesFPU).size - val 
fp_rocc_ios = outer.roccs.map(_.module.io) - fp_rocc_ios.map { io => - io.fpu_req.ready := false.B - io.fpu_resp.valid := false.B - io.fpu_resp.bits := DontCare - } - (respArb, cmdRouter) - } + // // RoCC + // if (outer.roccs.size > 0) { + // val (respArb, cmdRouter) = { + // val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size)) + // val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p)) + // outer.roccs.zipWithIndex.foreach { case (rocc, i) => + // // ptwPorts ++= rocc.module.io.ptw + // rocc.module.io.ptw <> DontCare + // rocc.module.io.mem <> DontCare + // rocc.module.io.cmd <> cmdRouter.io.out(i) + // respArb.io.in(i) <> Queue(rocc.module.io.resp) + // } + // // Create this FPU just for RoCC + // // val nFPUPorts = outer.roccs.filter(_.usesFPU).size + // val fp_rocc_ios = outer.roccs.map(_.module.io) + // fp_rocc_ios.map { io => + // io.fpu_req.ready := false.B + // io.fpu_resp.valid := false.B + // io.fpu_resp.bits := DontCare + // } + // (respArb, cmdRouter) + // } - cmdRouter.io.in <> DontCare - outer.roccs.foreach(_.module.io.exception := DontCare) - respArb.io.out <> DontCare - } + // cmdRouter.io.in <> DontCare + // outer.roccs.foreach(_.module.io.exception := DontCare) + // respArb.io.out <> DontCare + // } } // Some @copypaste from CoalescerSourceGen. 
From 0fa2712897d61bc8840a9faf41358aca41933e92 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 4 Mar 2024 21:10:19 -0800 Subject: [PATCH 06/23] Add perf counters for smem/dmem latency --- .../scala/radiance/tile/RadianceTile.scala | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 570efb8..1685288 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -617,6 +617,28 @@ class RadianceTileModuleImp(outer: RadianceTile) } core.io.dmem_d_valid := dmem_d_valid_vec.asUInt + // performance counters + val pendingReqsCumulative = RegInit(UInt(32.W), 0.U) + val totalReqs = RegInit(UInt(32.W), 0.U) + + val reqFireCountPerCycle = PopCount(dmemTLAdapters.map(_.io.inReq.fire)) + val respFireCountPerCycle = PopCount(dmemTLAdapters.map(_.io.inResp.fire)) + totalReqs := totalReqs + reqFireCountPerCycle + + val pendingReqsPerCycle = reqFireCountPerCycle - respFireCountPerCycle + pendingReqsCumulative := pendingReqsCumulative + pendingReqsPerCycle + + val prevFinished = RegNext(core.io.finished) + val justFinished = !prevFinished && core.io.finished + when (justFinished) { + printf("PERF: dmem: pending requests cumulative: %d\n", pendingReqsCumulative) + printf("PERF: dmem: total requests: %d\n", totalReqs) + } + + dontTouch(totalReqs) + dontTouch(pendingReqsCumulative) + + // now connect TL adapter downstream ports to the tile egress ports (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) => tlOut.a <> tlAdapter.io.outReq tlAdapter.io.outResp <> tlOut.d @@ -666,6 +688,28 @@ class RadianceTileModuleImp(outer: RadianceTile) tlAdapter.io.inResp.ready := core.io.smem_d_ready(i) } + // performance counters + val pendingReqsCumulative = RegInit(UInt(32.W), 0.U) + val totalReqs = RegInit(UInt(32.W), 0.U) + + val reqFireCountPerCycle = PopCount(smemTLAdapters.map(_.io.inReq.fire)) + 
val respFireCountPerCycle = PopCount(smemTLAdapters.map(_.io.inResp.fire)) + totalReqs := totalReqs + reqFireCountPerCycle + + val pendingReqsPerCycle = reqFireCountPerCycle - respFireCountPerCycle + pendingReqsCumulative := pendingReqsCumulative + pendingReqsPerCycle + + val prevFinished = RegNext(core.io.finished) + val justFinished = !prevFinished && core.io.finished + when (justFinished) { + printf("PERF: smem: pending requests cumulative: %d\n", pendingReqsCumulative) + printf("PERF: smem: total requests: %d\n", totalReqs) + } + + dontTouch(totalReqs) + dontTouch(pendingReqsCumulative) + + // now connect TL adapter downstream ports to the tile egress ports (smemTLAdapters zip smemTLBundles) foreach { case (tlAdapter, tlOut) => tlOut.a <> tlAdapter.io.outReq tlAdapter.io.outResp <> tlOut.d From 7aacd21b42130b035e7565524e7d04c62ae385a9 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 15:01:08 -0800 Subject: [PATCH 07/23] Bump vortex with upstream merge --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index eb63767..010c467 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit eb63767051779ddb0827746ac03287b009af2a5c +Subproject commit 010c4675ceebb6cf5be9172d8e0916a626762e24 From 34de55ee133b5a17d68016467471e68cf42353b0 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 15:01:39 -0800 Subject: [PATCH 08/23] Fix missing Vortex modules for upstream merge; add perf modules --- src/main/scala/radiance/tile/VortexCore.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index e468ee3..be4c956 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -217,6 +217,10 @@ class Vortex(tile: RadianceTile)(implicit 
p: Parameters) // addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_define.vh") // addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_wrap.sv") + // used when PERF_ENABLE is defined + addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_perf_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_pipeline_perf_if.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv") @@ -245,6 +249,9 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_mux.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_pending_size.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_register.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_toggle_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_buffer.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_popcount.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_arbiter.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_encoder.sv") From 56ef9012eece1f1c99af6545e93da3d32336293b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 15:02:43 -0800 Subject: [PATCH 09/23] Streamline perf counter code --- .../scala/radiance/tile/RadianceTile.scala | 76 +++++++++---------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 1685288..e442382 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -524,6 +524,38 @@ class RadianceTileModuleImp(outer: RadianceTile) imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d } + def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]], + respBundles: Seq[DecoupledIO[VortexBundleD]], + desc: String) = { + val currentPendingReqs = RegInit(SInt(32.W), 0.S) + val 
pendingReqsCumulative = RegInit(SInt(32.W), 0.S) + val totalReqs = RegInit(UInt(32.W), 0.U) + + val reqFireCountPerCycle = Wire(UInt(32.W)) + val respFireCountPerCycle = Wire(UInt(32.W)) + val reqReadFires = reqBundles.map { b => b.fire && b.bits.opcode === 4.U /* Get */ } + val respReadFires = respBundles.map { b => b.fire && b.bits.opcode === 1.U /* AccessAckData */} + reqFireCountPerCycle := PopCount(reqReadFires) + respFireCountPerCycle := PopCount(respReadFires) + totalReqs := totalReqs + reqFireCountPerCycle + + val diffPendingReqs = reqFireCountPerCycle.asSInt - respFireCountPerCycle.asSInt + currentPendingReqs := currentPendingReqs + diffPendingReqs + pendingReqsCumulative := pendingReqsCumulative + currentPendingReqs + + val prevFinished = RegNext(core.io.finished) + val justFinished = !prevFinished && core.io.finished + when (justFinished) { + printf(s"PERF: ${desc}: pending requests cumulative: %d\n", pendingReqsCumulative) + printf(s"PERF: ${desc}: total requests: %d\n", totalReqs) + } + + dontTouch(totalReqs) + dontTouch(diffPendingReqs) + dontTouch(currentPendingReqs) + dontTouch(pendingReqsCumulative) + } + def connectDmem = { // @perf: this would duplicate SourceGenerator table for every lane and eat // up some area @@ -617,26 +649,8 @@ class RadianceTileModuleImp(outer: RadianceTile) } core.io.dmem_d_valid := dmem_d_valid_vec.asUInt - // performance counters - val pendingReqsCumulative = RegInit(UInt(32.W), 0.U) - val totalReqs = RegInit(UInt(32.W), 0.U) - - val reqFireCountPerCycle = PopCount(dmemTLAdapters.map(_.io.inReq.fire)) - val respFireCountPerCycle = PopCount(dmemTLAdapters.map(_.io.inResp.fire)) - totalReqs := totalReqs + reqFireCountPerCycle - - val pendingReqsPerCycle = reqFireCountPerCycle - respFireCountPerCycle - pendingReqsCumulative := pendingReqsCumulative + pendingReqsPerCycle - - val prevFinished = RegNext(core.io.finished) - val justFinished = !prevFinished && core.io.finished - when (justFinished) { - printf("PERF: dmem: 
pending requests cumulative: %d\n", pendingReqsCumulative) - printf("PERF: dmem: total requests: %d\n", totalReqs) - } - - dontTouch(totalReqs) - dontTouch(pendingReqsCumulative) + performanceCounters(dmemTLAdapters.map(_.io.inReq), dmemTLAdapters.map(_.io.inResp), + desc = s"core${outer.tileId}-dmem") // now connect TL adapter downstream ports to the tile egress ports (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) => @@ -688,26 +702,8 @@ class RadianceTileModuleImp(outer: RadianceTile) tlAdapter.io.inResp.ready := core.io.smem_d_ready(i) } - // performance counters - val pendingReqsCumulative = RegInit(UInt(32.W), 0.U) - val totalReqs = RegInit(UInt(32.W), 0.U) - - val reqFireCountPerCycle = PopCount(smemTLAdapters.map(_.io.inReq.fire)) - val respFireCountPerCycle = PopCount(smemTLAdapters.map(_.io.inResp.fire)) - totalReqs := totalReqs + reqFireCountPerCycle - - val pendingReqsPerCycle = reqFireCountPerCycle - respFireCountPerCycle - pendingReqsCumulative := pendingReqsCumulative + pendingReqsPerCycle - - val prevFinished = RegNext(core.io.finished) - val justFinished = !prevFinished && core.io.finished - when (justFinished) { - printf("PERF: smem: pending requests cumulative: %d\n", pendingReqsCumulative) - printf("PERF: smem: total requests: %d\n", totalReqs) - } - - dontTouch(totalReqs) - dontTouch(pendingReqsCumulative) + performanceCounters(smemTLAdapters.map(_.io.inReq), smemTLAdapters.map(_.io.inResp), + desc = s"core${outer.tileId}-smem") // now connect TL adapter downstream ports to the tile egress ports (smemTLAdapters zip smemTLBundles) foreach { case (tlAdapter, tlOut) => From d0ba68852e3684e89a7652da96193d7add3b5687 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 15:58:24 -0800 Subject: [PATCH 10/23] Relax timeout for stale srcId in NewSourceGenerator --- src/main/scala/radiance/memory/VortexCache.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/main/scala/radiance/memory/VortexCache.scala b/src/main/scala/radiance/memory/VortexCache.scala index e3015ce..480b0d4 100644 --- a/src/main/scala/radiance/memory/VortexCache.scala +++ b/src/main/scala/radiance/memory/VortexCache.scala @@ -606,7 +606,7 @@ class NewSourceGenerator[T <: Data]( oldestMetadata := occupancyTable(oldestIndex).meta oldestAge := occupancyTable(oldestIndex).age assert( - oldestAge <= 2000.U, + oldestAge <= 10000.U, "One id in the SourceGen is not released for long time, potential bug !" ) From c2d4adb70cb67f59183cb57c7eccc401d933fe10 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 16:01:03 -0800 Subject: [PATCH 11/23] Define PERF_ENABLE --- radiance.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/radiance.mk b/radiance.mk index 1a37e22..4702e58 100644 --- a/radiance.mk +++ b/radiance.mk @@ -17,6 +17,7 @@ EXTRA_SIM_PREPROC_DEFINES += \ +define+GPR_RESET \ +define+LSU_DUP_DISABLE \ +define+DBG_TRACE_CORE_PIPELINE_VCS \ + +define+PERF_ENABLE \ +define+NUM_THREADS=8 +define+NUM_WARPS=8 # cargo handles building of Rust files all on its own, so make this a PHONY From 759b96bcef6f410ed3e43e16115304aa70659bb0 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 16:01:42 -0800 Subject: [PATCH 12/23] Define ICACHE_DISABLE/DCACHE_DISABLE to keep with upstream merge Upstream got a change that defines L1_ENABLE when {I,D}CACHE_ENABLE is defined, which they are by default. It also define-gates some of the ibuffer code with L1_ENABLE which breaks the sim. Explicitly defining these flags prevents this from happening.
--- radiance.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/radiance.mk b/radiance.mk index 4702e58..0f0e29a 100644 --- a/radiance.mk +++ b/radiance.mk @@ -18,6 +18,7 @@ EXTRA_SIM_PREPROC_DEFINES += \ +define+LSU_DUP_DISABLE \ +define+DBG_TRACE_CORE_PIPELINE_VCS \ +define+PERF_ENABLE \ + +define+ICACHE_DISABLE +define+DCACHE_DISABLE \ +define+NUM_THREADS=8 +define+NUM_WARPS=8 # cargo handles building of Rust files all on its own, so make this a PHONY From 469c0fe962144e59d1b9fadae8ae54788926a76f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 17:18:02 -0800 Subject: [PATCH 13/23] Parameterize cache size in VortexL1Cache --- src/main/scala/radiance/memory/VortexCache.scala | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main/scala/radiance/memory/VortexCache.scala b/src/main/scala/radiance/memory/VortexCache.scala index 480b0d4..75fa9f1 100644 --- a/src/main/scala/radiance/memory/VortexCache.scala +++ b/src/main/scala/radiance/memory/VortexCache.scala @@ -10,6 +10,7 @@ import org.chipsalliance.cde.config.{Parameters, Field} case object VortexL1Key extends Field[Option[VortexL1Config]](None /*default*/ ) case class VortexL1Config( + cacheSize: Int, // total cache size in bytes numBanks: Int, wordSize: Int, // This is the read/write granularity of the L1 cache cacheLineSize: Int, @@ -34,6 +35,7 @@ case class VortexL1Config( object defaultVortexL1Config extends VortexL1Config( + cacheSize = 16384, numBanks = 4, wordSize = 16, cacheLineSize = 16, @@ -203,6 +205,7 @@ class VortexBankImp( val vxCache = Module( new VX_cache_top( WORD_SIZE = config.wordSize, + CACHE_SIZE = config.cacheSize / config.numBanks, CACHE_LINE_SIZE = config.cacheLineSize, CORE_TAG_WIDTH = config.coreTagPlusSizeWidth, MSHR_SIZE = config.mshrSize @@ -389,7 +392,7 @@ class VortexBankImp( class VX_cache_top( // these values should match the default settings in Verilog // TODO: INSTANCE_ID - CACHE_SIZE: Int = 16384 / 4, // 1, "CACHE_SIZE" -> 
CACHE_SIZE, "LINE_SIZE" -> CACHE_LINE_SIZE, From 396fddf437bb68535908cf32764f029e79fdbcac Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 17:18:40 -0800 Subject: [PATCH 14/23] Add latency perf counter to imem --- .../scala/radiance/tile/RadianceTile.scala | 69 ++++++++++--------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index e442382..99f89f7 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -520,42 +520,15 @@ class RadianceTileModuleImp(outer: RadianceTile) // TODO: make imemNodes not a vector imemTLAdapter.io.inReq <> core.io.imem.get(0).a core.io.imem.get(0).d <> imemTLAdapter.io.inResp + + performanceCounters(Seq(imemTLAdapter.io.inReq), Seq(imemTLAdapter.io.inResp), + desc = s"core${outer.tileId}-imem") + + // now connect TL adapter downstream ports to the tile egress ports outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d } - def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]], - respBundles: Seq[DecoupledIO[VortexBundleD]], - desc: String) = { - val currentPendingReqs = RegInit(SInt(32.W), 0.S) - val pendingReqsCumulative = RegInit(SInt(32.W), 0.S) - val totalReqs = RegInit(UInt(32.W), 0.U) - - val reqFireCountPerCycle = Wire(UInt(32.W)) - val respFireCountPerCycle = Wire(UInt(32.W)) - val reqReadFires = reqBundles.map { b => b.fire && b.bits.opcode === 4.U /* Get */ } - val respReadFires = respBundles.map { b => b.fire && b.bits.opcode === 1.U /* AccessAckData */} - reqFireCountPerCycle := PopCount(reqReadFires) - respFireCountPerCycle := PopCount(respReadFires) - totalReqs := totalReqs + reqFireCountPerCycle - - val diffPendingReqs = reqFireCountPerCycle.asSInt - respFireCountPerCycle.asSInt - currentPendingReqs := currentPendingReqs + diffPendingReqs - 
pendingReqsCumulative := pendingReqsCumulative + currentPendingReqs - - val prevFinished = RegNext(core.io.finished) - val justFinished = !prevFinished && core.io.finished - when (justFinished) { - printf(s"PERF: ${desc}: pending requests cumulative: %d\n", pendingReqsCumulative) - printf(s"PERF: ${desc}: total requests: %d\n", totalReqs) - } - - dontTouch(totalReqs) - dontTouch(diffPendingReqs) - dontTouch(currentPendingReqs) - dontTouch(pendingReqsCumulative) - } - def connectDmem = { // @perf: this would duplicate SourceGenerator table for every lane and eat // up some area @@ -712,6 +685,38 @@ class RadianceTileModuleImp(outer: RadianceTile) } } + def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]], + respBundles: Seq[DecoupledIO[VortexBundleD]], + desc: String) = { + val currentPendingReqs = RegInit(SInt(32.W), 0.S) + val pendingReqsCumulative = RegInit(SInt(32.W), 0.S) + val totalReqs = RegInit(UInt(32.W), 0.U) + + val reqFireCountPerCycle = Wire(UInt(32.W)) + val respFireCountPerCycle = Wire(UInt(32.W)) + val reqReadFires = reqBundles.map { b => b.fire && b.bits.opcode === 4.U /* Get */ } + val respReadFires = respBundles.map { b => b.fire && b.bits.opcode === 1.U /* AccessAckData */} + reqFireCountPerCycle := PopCount(reqReadFires) + respFireCountPerCycle := PopCount(respReadFires) + totalReqs := totalReqs + reqFireCountPerCycle + + val diffPendingReqs = reqFireCountPerCycle.asSInt - respFireCountPerCycle.asSInt + currentPendingReqs := currentPendingReqs + diffPendingReqs + pendingReqsCumulative := pendingReqsCumulative + currentPendingReqs + + val prevFinished = RegNext(core.io.finished) + val justFinished = !prevFinished && core.io.finished + when (justFinished) { + printf(s"PERF: ${desc}: pending requests cumulative: %d\n", pendingReqsCumulative) + printf(s"PERF: ${desc}: total requests: %d\n", totalReqs) + } + + dontTouch(totalReqs) + dontTouch(diffPendingReqs) + dontTouch(currentPendingReqs) + dontTouch(pendingReqsCumulative) + } + 
connectImem connectDmem connectSmem From 8680c64e5b502864843182f300984f519fc42d58 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 17:18:59 -0800 Subject: [PATCH 15/23] Force numBanks = 1 for icache Given that the number of imem ports is 1 and imem access has good spatial locality, force single bank config for icache. --- src/main/scala/radiance/tile/RadianceTile.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 99f89f7..f79071f 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -303,12 +303,13 @@ class RadianceTile private ( // Conditionally instantiate L1 cache val (icacheNode, dcacheNode): (TLNode, TLNode) = p(VortexL1Key) match { case Some(vortexL1Config) => { + println("VortexL1Cache instantiated") // require( // p(CoalescerKey).isDefined, // "Vortex L1 configuration currently only works when coalescer is also enabled."
// ) - val icache = LazyModule(new VortexL1Cache(vortexL1Config)) + val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(numBanks = 1))) val dcache = LazyModule(new VortexL1Cache(vortexL1Config)) // imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ } assert(imemNodes.length == 1) // FIXME From eca36193803eae7ac0ced3d0cd5612b0edef1974 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 7 Mar 2024 17:44:21 -0800 Subject: [PATCH 16/23] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 010c467..8317a3f 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 010c4675ceebb6cf5be9172d8e0916a626762e24 +Subproject commit 8317a3fbe57e917653f6982b3db3d9a256313f2d From 49dfb5b97a5574ed8683ca869b27504c6fb60405 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 14 Mar 2024 16:45:11 -0700 Subject: [PATCH 17/23] Reformat perfcounter report --- src/main/scala/radiance/tile/RadianceTile.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index f79071f..9f8424b 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -708,8 +708,8 @@ class RadianceTileModuleImp(outer: RadianceTile) val prevFinished = RegNext(core.io.finished) val justFinished = !prevFinished && core.io.finished when (justFinished) { - printf(s"PERF: ${desc}: pending requests cumulative: %d\n", pendingReqsCumulative) - printf(s"PERF: ${desc}: total requests: %d\n", totalReqs) + printf(s"PERF: ${desc}: average request latency (cum_pending / total): %d / %d\n", + pendingReqsCumulative, totalReqs) } dontTouch(totalReqs) From afc209a28b8c9ac769cca03e14f081eae3ec9c41 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 14 Mar 
2024 16:46:41 -0700 Subject: [PATCH 18/23] Create MMIO regs for software-implemented cluster-wide barrier --- .../scala/radiance/memory/VortexCache.scala | 5 +- .../scala/radiance/tile/RadianceCluster.scala | 68 ++++++++++++++----- 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/src/main/scala/radiance/memory/VortexCache.scala b/src/main/scala/radiance/memory/VortexCache.scala index 75fa9f1..61a841c 100644 --- a/src/main/scala/radiance/memory/VortexCache.scala +++ b/src/main/scala/radiance/memory/VortexCache.scala @@ -100,7 +100,7 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters) TLMasterPortParameters.v1( clients = Seq( TLMasterParameters.v1( - name = "VortexBank", + name = "VortexBankPassthrough", sourceId = IdRange( 0, 1 << (log2Ceil( @@ -175,7 +175,7 @@ class VortexBank( TLMasterPortParameters.v1( clients = Seq( TLMasterParameters.v1( - name = "VortexBank", + name = s"VortexBank${bankId}", sourceId = IdRange(0, config.memSideSourceIds), supportsProbe = TransferSizes(1, config.wordSize), supportsGet = TransferSizes(1, config.wordSize), @@ -205,6 +205,7 @@ class VortexBankImp( val vxCache = Module( new VX_cache_top( WORD_SIZE = config.wordSize, + // distribute total size across numBanks CACHE_SIZE = config.cacheSize / config.numBanks, CACHE_LINE_SIZE = config.cacheLineSize, CORE_TAG_WIDTH = config.coreTagPlusSizeWidth, diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 69499ec..b0c175e 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -9,7 +9,8 @@ import chisel3.util._ import org.chipsalliance.cde.config.{Field, Parameters} import freechips.rocketchip.subsystem._ import freechips.rocketchip.tilelink._ -import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, ClockCrossingType} +import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, SimpleDevice, 
ClockCrossingType} +import freechips.rocketchip.regmapper.RegField import freechips.rocketchip.prci.ClockSinkParameters case class RadianceClusterParams( @@ -49,22 +50,32 @@ class RadianceCluster ( } smemBanks.foreach(_.node := clbus.outwardNode) - // HACK: This is a work around the normal bus connecting API by downcasting - // tile and directly accessing the node inside that is not exposed as a - // master in HierarchicalElementCrossingParamsLike. - val tile = leafTiles(0).asInstanceOf[RadianceTile] - val perSmemPortXbars = Seq.fill(tile.smemNodes.size) { LazyModule(new TLXbar) } + // HACK: This is a workaround of the CanAttachTile bus connecting API that + // works by downcasting tile and directly accessing the node inside that is + // not exposed as a master in HierarchicalElementCrossingParamsLike. + // val tile = leafTiles(0).asInstanceOf[RadianceTile] + // val perSmemPortXbars = Seq.fill(tile.smemNodes.size) { LazyModule(new TLXbar) } // Tie corresponding smem ports from every tile into a single port using // Xbars so that the number of ports going into the sharedmem do not scale // with the number of tiles. 
leafTiles.foreach { case (id, tile: RadianceTile) => - (perSmemPortXbars zip tile.smemNodes).foreach { - case (xbar, node) => xbar.node := node - } - // tile.smemNodes.foreach (clbus.inwardNode := _) + // (perSmemPortXbars zip tile.smemNodes).foreach { + // case (xbar, node) => xbar.node := node + // } + tile.smemNodes.foreach (clbus.inwardNode := _) } - perSmemPortXbars.foreach { clbus.inwardNode := _.node } + // perSmemPortXbars.foreach { clbus.inwardNode := _.node } + + // Memory-mapped register for barrier synchronization + val regDevice = new SimpleDevice("radiance-cluster-barrier-reg", + Seq(s"radiance-cluster-barrier-reg${clusterId}")) + val regNode = TLRegisterNode( + address = Seq(AddressSet(0xff003f00L, 0xff)), + device = regDevice, + beatBytes = wordSize, + concurrency = 1) + regNode := clbus.outwardNode override lazy val module = new RadianceClusterModuleImp(this) } @@ -72,14 +83,37 @@ class RadianceCluster ( class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(outer) { outer.leafTiles.foreach { case (id, tile: RadianceTile) => // println(s"======= RadianceCluster: tile.smemXbar.node.edge = ${tile.smemXbar.node.out.size}") - println(s"======= RadianceCluster: clbus inward edges = ${outer.clbus.inwardNode.inward.inputs.size}") + println(s"======= RadianceCluster: clbus inward edges = ${outer.clbus.inwardNode.inward.inputs.length}") println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}") } - outer.perSmemPortXbars(0).node.out(0)._2.slave.slaves(0).address.foreach { addrSet => - println(s"====== perSmemPortXbars(0).slaves(0).addr: ${addrSet.toString()}") - } - outer.perSmemPortXbars(0).node.out(0)._2.master.masters(0).visibility.foreach { addrSet => - println(s"====== perSmemPortXbars(0).masters(0).addr: ${addrSet.toString()}") + val numCores = outer.leafTiles.size + val numBarriers = 4 // FIXME: hardcoded + val allSyncedRegs = Seq.fill(numBarriers)(Wire(UInt(32.W))) + val perCoreSyncedRegs = 
Seq.fill(numBarriers)(Seq.fill(numCores)(RegInit(0.U(32.W)))) + (allSyncedRegs zip perCoreSyncedRegs).foreach{ case (all, per) => + all := per.reduce((x0, x1) => (x0 =/= 0.U) && (x1 =/= 0.U)) + + val allPassed = per.map(_ === 2.U).reduce(_ && _) + when(allPassed) { + per.foreach(_ := 0.U) + } + + dontTouch(all) } + // FIXME: 4 cores per cluster hardcoded + outer.regNode.regmap( + 0x00 -> Seq(RegField.r(32, allSyncedRegs(0))), + 0x04 -> Seq(RegField(32, perCoreSyncedRegs(0)(0))), + 0x08 -> Seq(RegField(32, perCoreSyncedRegs(0)(1))), + 0x10 -> Seq(RegField.r(32, allSyncedRegs(1))), + 0x14 -> Seq(RegField(32, perCoreSyncedRegs(1)(0))), + 0x18 -> Seq(RegField(32, perCoreSyncedRegs(1)(1))), + 0x20 -> Seq(RegField.r(32, allSyncedRegs(2))), + 0x24 -> Seq(RegField(32, perCoreSyncedRegs(2)(0))), + 0x28 -> Seq(RegField(32, perCoreSyncedRegs(2)(1))), + 0x30 -> Seq(RegField.r(32, allSyncedRegs(3))), + 0x34 -> Seq(RegField(32, perCoreSyncedRegs(3)(0))), + 0x38 -> Seq(RegField(32, perCoreSyncedRegs(3)(1))), + ) } From 92069099a2be8f87b80065ceca23dba8f718cdba Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 14 Mar 2024 16:47:39 -0700 Subject: [PATCH 19/23] Bump vortex with LSU fix --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 8317a3f..28f54bd 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 8317a3fbe57e917653f6982b3db3d9a256313f2d +Subproject commit 28f54bde7fd5b895e9ccea6dffab35c87aa73efc From b5074f55176d8a583c62d588f600e309f3ccc1b4 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 17 Mar 2024 14:12:10 -0700 Subject: [PATCH 20/23] Connect core gbar signals to cluster via Diplomacy --- .../scala/radiance/tile/RadianceCluster.scala | 71 ++++++++++++++++++- .../scala/radiance/tile/RadianceTile.scala | 38 +++++++++- src/main/scala/radiance/tile/VortexCore.scala | 30 ++++---- 3 files 
changed, 123 insertions(+), 16 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index b0c175e..2d67a24 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -4,12 +4,13 @@ package radiance.tile import chisel3._ +import chisel3.experimental.SourceInfo import chisel3.util._ import org.chipsalliance.cde.config.{Field, Parameters} import freechips.rocketchip.subsystem._ import freechips.rocketchip.tilelink._ -import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, SimpleDevice, ClockCrossingType} +import freechips.rocketchip.diplomacy._ import freechips.rocketchip.regmapper.RegField import freechips.rocketchip.prci.ClockSinkParameters @@ -50,6 +51,11 @@ class RadianceCluster ( } smemBanks.foreach(_.node := clbus.outwardNode) + val numCores = leafTiles.size + + // Diplomacy sink nodes for cluster-wide barrier sync signal + val barrierSlaveNode = BarrierSlaveNode(numCores) + // HACK: This is a workaround of the CanAttachTile bus connecting API that // works by downcasting tile and directly accessing the node inside that is // not exposed as a master in HierarchicalElementCrossingParamsLike. 
@@ -63,11 +69,12 @@ class RadianceCluster ( // (perSmemPortXbars zip tile.smemNodes).foreach { // case (xbar, node) => xbar.node := node // } - tile.smemNodes.foreach (clbus.inwardNode := _) + tile.smemNodes.foreach(clbus.inwardNode := _) + barrierSlaveNode := tile.barrierMasterNode } // perSmemPortXbars.foreach { clbus.inwardNode := _.node } - // Memory-mapped register for barrier synchronization + // Memory-mapped register for barrier sync val regDevice = new SimpleDevice("radiance-cluster-barrier-reg", Seq(s"radiance-cluster-barrier-reg${clusterId}")) val regNode = TLRegisterNode( @@ -77,6 +84,10 @@ class RadianceCluster ( concurrency = 1) regNode := clbus.outwardNode + nodes.foreach({ node => + println(s"======= RadianceCluster node.name: ${node.name}") + }) + override lazy val module = new RadianceClusterModuleImp(this) } @@ -87,6 +98,16 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}") } + outer.barrierSlaveNode.in.foreach { case (b, e) => + b.req.ready := true.B // barrier module is always ready + b.resp.valid := 0.U + b.resp.bits.barrierId := 0.U + } + + auto.elements.foreach({case (name, _) => + println(s"======= RadianceCluster.elements.name: ${name}") + }) + val numCores = outer.leafTiles.size val numBarriers = 4 // FIXME: hardcoded val allSyncedRegs = Seq.fill(numBarriers)(Wire(UInt(32.W))) @@ -116,4 +137,48 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( 0x34 -> Seq(RegField(32, perCoreSyncedRegs(3)(0))), 0x38 -> Seq(RegField(32, perCoreSyncedRegs(3)(1))), ) + + println(s"======== barrierSlaveNode: ${outer.barrierSlaveNode.in(0)._2.barrierIdBits}") } + +case class EmptyParams() + +case class BarrierParams( + barrierIdBits: Int, + numCoreBits: Int +) + +class BarrierRequestBits( + param: BarrierParams +) extends Bundle { + val barrierId = UInt(param.barrierIdBits.W) + val sizeMinusOne = UInt(param.numCoreBits.W) 
+ val coreId = UInt(param.numCoreBits.W) +} + +class BarrierResponseBits( + param: BarrierParams +) extends Bundle { + val barrierId = UInt(param.barrierIdBits.W) +} + +class BarrierBundle(param: BarrierParams) extends Bundle { + val req = Decoupled(new BarrierRequestBits(param)) + val resp = Flipped(Decoupled(new BarrierResponseBits(param))) +} + +// FIXME Separate BarrierEdgeParams from BarrierParams +object BarrierNodeImp extends SimpleNodeImp[BarrierParams, EmptyParams, BarrierParams, BarrierBundle] { + def edge(pd: BarrierParams, pu: EmptyParams, p: Parameters, sourceInfo: SourceInfo) = { + // barrier parameters flow strictly downward from the master node + pd + } + def bundle(e: BarrierParams) = new BarrierBundle(e) + // FIXME render + def render(e: BarrierParams) = RenderedEdge(colour = "ffffff", label = "X") +} + +case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: ValName) + extends SourceNode(BarrierNodeImp)(Seq(srcParams)) +case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName) + extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams())) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 9f8424b..c9fe609 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -326,6 +326,11 @@ class RadianceTile private ( } } + // Barrier synchronization node + // FIXME: hardcoded params + val barrierParams = BarrierParams(barrierIdBits = 2, numCoreBits = 1) + val barrierMasterNode = BarrierMasterNode(barrierParams) + val base = p(GPUMemory()) match { case Some(GPUMemParams(baseAddr, _)) => baseAddr case _ => BigInt(0) @@ -339,7 +344,6 @@ class RadianceTile private ( tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode } - // ROCC // TODO: parametrize // val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy( @@ -451,6 +455,10 @@ class RadianceTileModuleImp(outer: 
RadianceTile) extends BaseTileModuleImp(outer) { Annotated.params(this, outer.radianceParams) + auto.elements.foreach({case (name, _) => + println(s"======= RadianceTile.elements.name: ${name}") + }) + val core = Module(new Vortex(outer)(outer.p)) core.io.clock := clock @@ -686,6 +694,17 @@ class RadianceTileModuleImp(outer: RadianceTile) } } + def connectBarrier = { + require(outer.barrierMasterNode.out.length == 1) + // FIXME: bits not flattened + outer.barrierMasterNode.out(0)._1.req.valid := core.io.gbar_req_valid + outer.barrierMasterNode.out(0)._1.req.bits.barrierId := core.io.gbar_req_id + outer.barrierMasterNode.out(0)._1.req.bits.coreId := core.io.gbar_req_core_id + core.io.gbar_req_ready := outer.barrierMasterNode.out(0)._1.req.ready + core.io.gbar_rsp_valid := outer.barrierMasterNode.out(0)._1.resp.valid + core.io.gbar_rsp_id := outer.barrierMasterNode.out(0)._1.resp.bits.barrierId + } + def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]], respBundles: Seq[DecoupledIO[VortexBundleD]], desc: String) = { @@ -721,6 +740,7 @@ class RadianceTileModuleImp(outer: RadianceTile) connectImem connectDmem connectSmem + connectBarrier } // TODO: generalize for useVxCache @@ -755,6 +775,22 @@ class RadianceTileModuleImp(outer: RadianceTile) // } } +class ClusterSynchronizer( + barrierIdWidth: Int, + numCoreWidth: Int, +) extends Module { + val io = IO(new Bundle { + val req = Flipped(Decoupled(new Bundle { + val barrierId = UInt(barrierIdWidth.W) + val sizeMinusOne = UInt(numCoreWidth.W) + val coreId = UInt(numCoreWidth.W) + })) + val resp = Decoupled(new Bundle { + val barrierId = UInt(barrierIdWidth.W) + }) + }) +} + // Some @copypaste from CoalescerSourceGen. 
class VortexTLAdapter( newSourceWidth: Int, diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index be4c956..7a37c24 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -41,18 +41,11 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val interrupts = Input(new freechips.rocketchip.rocket.CoreInterrupts(false/*hasBeu*/)) // conditionally instantiate ports depending on whether we want to use VX_cache or not + // TODO: flatten this like dmem and smem val imem = if (!tile.radianceParams.useVxCache) Some(Vec(1, new Bundle { val a = Decoupled(new VortexBundleA(tagWidth = tile.imemTagWidth, dataWidth = 32)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.imemTagWidth, dataWidth = 32))) })) else None - val dmem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle { - // val a = Decoupled(new VortexBundleA(tagWidth = tile.dmemTagWidth, dataWidth = 32)) - // val d = Flipped(Decoupled(new VortexBundleD(tagWidth = dmemTagWidth, dataWidth = 32))) - })) else None - val smem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle { - // val a = Decoupled(new VortexBundleA(tagWidth = tile.smemTagWidth, dataWidth = 32)) - // val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.smemTagWidth, dataWidth = 32))) - })) else None val mem = if (tile.radianceParams.useVxCache) Some(new Bundle { val a = Decoupled(new VortexBundleA(tagWidth = 15, dataWidth = 128)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = 15, dataWidth = 128))) @@ -96,6 +89,18 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val smem_d_bits_data = Input(UInt((tile.numLsuLanes * 32).W)) val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W)) + // FIXME: hardcoded + val numCoresPerCluster = 2 + val NB_WIDTH = 2 + val NC_WIDTH = 1 + val gbar_req_valid = 
Output(UInt((numCoresPerCluster * 1).W)) + val gbar_req_id = Output(UInt((numCoresPerCluster * NB_WIDTH).W)) + val gbar_req_size_m1 = Output(UInt((numCoresPerCluster * NC_WIDTH).W)) + val gbar_req_core_id = Output(UInt((numCoresPerCluster * NC_WIDTH).W)) + val gbar_req_ready = Input(UInt((numCoresPerCluster * 1).W)) + val gbar_rsp_valid = Input(UInt((numCoresPerCluster * 1).W)) + val gbar_rsp_id = Input(UInt((numCoresPerCluster * NB_WIDTH).W)) + // val fpu = Flipped(new FPUCoreIO()) //val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs)) //val trace = Output(new TraceBundle) @@ -112,6 +117,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // see VX_csr_data that implements the read logic for CSR_MHARTID/GWID. Map( "CORE_ID" -> tile.tileParams.tileId, + "CORES_PER_CLUSTER" -> 2, // FIXME: hardcoded // TODO: can we get this as a parameter? "BOOTROM_HANG100" -> 0x10100, "NUM_THREADS" -> tile.numLsuLanes @@ -194,10 +200,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_tags.sv") // addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_wrap.sv") - // gbar is only used in the socket/cluster hierarchy - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv") - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") // mem_arb is used in VX_socket or VX_cache_cluster // addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_arb.sv") addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_bus_if.sv") @@ -220,6 +222,10 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // used when PERF_ENABLE is defined addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_perf_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_pipeline_perf_if.sv") + // used when GBAR_ENABLE is defined + addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv") + // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") + // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") 
addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv") From 9024048a52decee6daf6ca4d329f1e5900cb1a2b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 17 Mar 2024 14:13:33 -0700 Subject: [PATCH 21/23] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 28f54bd..40e2888 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 28f54bde7fd5b895e9ccea6dffab35c87aa73efc +Subproject commit 40e288873376c325d6680b94c7f2a184e8fcb8ea From 0a4151e3cb224b5b477730b076760b1c63115fd8 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 18 Mar 2024 14:10:38 -0700 Subject: [PATCH 22/23] Add BarrierSynchronizer module --- src/main/scala/radiance/tile/Barrier.scala | 98 +++++++++++++++++++ .../scala/radiance/tile/RadianceCluster.scala | 74 ++++---------- .../scala/radiance/tile/RadianceTile.scala | 3 + src/main/scala/radiance/tile/VortexCore.scala | 15 ++- 4 files changed, 128 insertions(+), 62 deletions(-) create mode 100644 src/main/scala/radiance/tile/Barrier.scala diff --git a/src/main/scala/radiance/tile/Barrier.scala b/src/main/scala/radiance/tile/Barrier.scala new file mode 100644 index 0000000..e3e965b --- /dev/null +++ b/src/main/scala/radiance/tile/Barrier.scala @@ -0,0 +1,98 @@ +// See LICENSE.SiFive for license details. +// See LICENSE.Berkeley for license details. 
+ +package radiance.tile + +import chisel3._ +import chisel3.experimental.SourceInfo +import chisel3.util._ + +import org.chipsalliance.cde.config.{Field, Parameters} +import freechips.rocketchip.subsystem._ +import freechips.rocketchip.diplomacy._ + +case class EmptyParams() + +case class BarrierParams( + barrierIdBits: Int, + numCoreBits: Int +) + +class BarrierRequestBits( + param: BarrierParams +) extends Bundle { + val barrierId = UInt(param.barrierIdBits.W) + val sizeMinusOne = UInt(param.numCoreBits.W) + val coreId = UInt(param.numCoreBits.W) +} + +class BarrierResponseBits( + param: BarrierParams +) extends Bundle { + val barrierId = UInt(param.barrierIdBits.W) +} + +class BarrierBundle(param: BarrierParams) extends Bundle { + val req = Decoupled(new BarrierRequestBits(param)) + val resp = Flipped(Decoupled(new BarrierResponseBits(param))) +} + +// FIXME Separate BarrierEdgeParams from BarrierParams +object BarrierNodeImp extends SimpleNodeImp[BarrierParams, EmptyParams, BarrierParams, BarrierBundle] { + def edge(pd: BarrierParams, pu: EmptyParams, p: Parameters, sourceInfo: SourceInfo) = { + // barrier parameters flow strictly downward from the master node + pd + } + def bundle(e: BarrierParams) = new BarrierBundle(e) + // FIXME render + def render(e: BarrierParams) = RenderedEdge(colour = "ffffff", label = "X") +} + +case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: ValName) + extends SourceNode(BarrierNodeImp)(Seq(srcParams)) +case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName) + extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams())) + +class BarrierSynchronizer(param: BarrierParams) extends Module { + val numBarrierIds = 1 << param.barrierIdBits + val numCores = 1 << param.numCoreBits + println(s"numBarrierIds: ${numBarrierIds}, numCores: ${numCores}") + + val io = IO(new Bundle { + val reqs = Vec(numCores, Flipped(Decoupled(new BarrierRequestBits(param)))) + val resp = Decoupled(new 
BarrierResponseBits(param)) + }) + + // 2-dimensional table of per-id, per-core "done" signals + val table = RegInit(VecInit(Seq.fill(numBarrierIds)(VecInit(Seq.fill(numCores)(false.B))))) + val done = Wire(Vec(numBarrierIds, Bool())) + table.zipWithIndex.foreach { case (row, i) => + done(i) := row.reduce(_ && _) + } + dontTouch(done) + + io.reqs.zipWithIndex.foreach { case (req, coreId) => + // always ready; all this module does is latch to boolean regs + req.ready := true.B + when(req.fire) { + assert(coreId.U === req.bits.coreId) + // FIXME: don't need coreId to be hardware here + table(req.bits.barrierId)(coreId.U) := true.B + } + } + + val doneArbiter = Module(new RRArbiter(Bool(), numBarrierIds)) + (doneArbiter.io.in zip done).zipWithIndex.foreach { case ((in, d), i) => + in.valid := d + in.bits := d + when(in.fire) { + table(i).foreach(_ := false.B) + } + } + io.resp.valid := doneArbiter.io.out.valid + io.resp.bits.barrierId := doneArbiter.io.chosen + when(io.resp.fire) { + table(io.resp.bits.barrierId).foreach(_ := false.B) + } + doneArbiter.io.out.ready := io.resp.ready +} diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 2d67a24..7d6752c 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -7,10 +7,10 @@ import chisel3._ import chisel3.experimental.SourceInfo import chisel3.util._ -import org.chipsalliance.cde.config.{Field, Parameters} +import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.subsystem._ import freechips.rocketchip.tilelink._ -import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, SimpleDevice, ClockCrossingType} import freechips.rocketchip.regmapper.RegField import freechips.rocketchip.prci.ClockSinkParameters @@ -98,20 +98,28 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( println(s"======= 
RadianceCluster: clbus name = ${outer.clbus.busName}") } - outer.barrierSlaveNode.in.foreach { case (b, e) => - b.req.ready := true.B // barrier module is always ready - b.resp.valid := 0.U - b.resp.bits.barrierId := 0.U + val numBarriers = 4 // FIXME: hardcoded + + // @cleanup: This assumes barrier params on all edges are the same, i.e. all + // cores are configured to have the same barrier id range. While true, might + // be better to actually assert this + val barrierParam = outer.barrierSlaveNode.in(0)._2 + val synchronizer = Module(new BarrierSynchronizer(barrierParam)) + (synchronizer.io.reqs zip outer.barrierSlaveNode.in).foreach { case (req, (b, _)) => + req <> b.req + b.resp <> synchronizer.io.resp // broadcast } - auto.elements.foreach({case (name, _) => - println(s"======= RadianceCluster.elements.name: ${name}") - }) + // outer.barrierSlaveNode.in.foreach { case (b, e) => + // val fakeBarrierRespId = RegNext(b.req.bits.barrierId) + // val fakeBarrierRespValid = RegNext(b.req.fire) + // b.req.ready := true.B // barrier module is always ready + // b.resp.valid := fakeBarrierRespValid + // b.resp.bits.barrierId := fakeBarrierRespId + // } - val numCores = outer.leafTiles.size - val numBarriers = 4 // FIXME: hardcoded val allSyncedRegs = Seq.fill(numBarriers)(Wire(UInt(32.W))) - val perCoreSyncedRegs = Seq.fill(numBarriers)(Seq.fill(numCores)(RegInit(0.U(32.W)))) + val perCoreSyncedRegs = Seq.fill(numBarriers)(Seq.fill(outer.numCores)(RegInit(0.U(32.W)))) (allSyncedRegs zip perCoreSyncedRegs).foreach{ case (all, per) => all := per.reduce((x0, x1) => (x0 =/= 0.U) && (x1 =/= 0.U)) @@ -140,45 +148,3 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( println(s"======== barrierSlaveNode: ${outer.barrierSlaveNode.in(0)._2.barrierIdBits}") } - -case class EmptyParams() - -case class BarrierParams( - barrierIdBits: Int, - numCoreBits: Int -) - -class BarrierRequestBits( - param: BarrierParams -) extends Bundle { - val barrierId = 
UInt(param.barrierIdBits.W) - val sizeMinusOne = UInt(param.numCoreBits.W) - val coreId = UInt(param.numCoreBits.W) -} - -class BarrierResponseBits( - param: BarrierParams -) extends Bundle { - val barrierId = UInt(param.barrierIdBits.W) -} - -class BarrierBundle(param: BarrierParams) extends Bundle { - val req = Decoupled(new BarrierRequestBits(param)) - val resp = Flipped(Decoupled(new BarrierResponseBits(param))) -} - -// FIXME Separate BarrierEdgeParams from BarrierParams -object BarrierNodeImp extends SimpleNodeImp[BarrierParams, EmptyParams, BarrierParams, BarrierBundle] { - def edge(pd: BarrierParams, pu: EmptyParams, p: Parameters, sourceInfo: SourceInfo) = { - // barrier parameters flow strictly downward from the master node - pd - } - def bundle(e: BarrierParams) = new BarrierBundle(e) - // FIXME render - def render(e: BarrierParams) = RenderedEdge(colour = "ffffff", label = "X") -} - -case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: ValName) - extends SourceNode(BarrierNodeImp)(Seq(srcParams)) -case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName) - extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams())) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index c9fe609..bf4dfa3 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -701,8 +701,11 @@ class RadianceTileModuleImp(outer: RadianceTile) outer.barrierMasterNode.out(0)._1.req.bits.barrierId := core.io.gbar_req_id outer.barrierMasterNode.out(0)._1.req.bits.coreId := core.io.gbar_req_core_id core.io.gbar_req_ready := outer.barrierMasterNode.out(0)._1.req.ready + core.io.gbar_rsp_valid := outer.barrierMasterNode.out(0)._1.resp.valid core.io.gbar_rsp_id := outer.barrierMasterNode.out(0)._1.resp.bits.barrierId + // core doesn't have a resp.ready port + outer.barrierMasterNode.out(0)._1.resp.ready := true.B } def 
performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]], diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index 7a37c24..7a06f7d 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -90,16 +90,15 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W)) // FIXME: hardcoded - val numCoresPerCluster = 2 val NB_WIDTH = 2 val NC_WIDTH = 1 - val gbar_req_valid = Output(UInt((numCoresPerCluster * 1).W)) - val gbar_req_id = Output(UInt((numCoresPerCluster * NB_WIDTH).W)) - val gbar_req_size_m1 = Output(UInt((numCoresPerCluster * NC_WIDTH).W)) - val gbar_req_core_id = Output(UInt((numCoresPerCluster * NC_WIDTH).W)) - val gbar_req_ready = Input(UInt((numCoresPerCluster * 1).W)) - val gbar_rsp_valid = Input(UInt((numCoresPerCluster * 1).W)) - val gbar_rsp_id = Input(UInt((numCoresPerCluster * NB_WIDTH).W)) + val gbar_req_valid = Output(Bool()) + val gbar_req_id = Output(UInt(NB_WIDTH.W)) + val gbar_req_size_m1 = Output(UInt(NC_WIDTH.W)) + val gbar_req_core_id = Output(UInt(NC_WIDTH.W)) + val gbar_req_ready = Input(Bool()) + val gbar_rsp_valid = Input(Bool()) + val gbar_rsp_id = Input(UInt(NB_WIDTH.W)) // val fpu = Flipped(new FPUCoreIO()) //val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs)) From 785dcc9df72bb85d2d001ddc9eff5d2c1e991dbd Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 18 Mar 2024 14:32:06 -0700 Subject: [PATCH 23/23] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 40e2888..df4b215 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 40e288873376c325d6680b94c7f2a184e8fcb8ea +Subproject commit df4b21507eae6fe6ee2003e7a6b79e0a7826eac4