diff --git a/radiance.mk b/radiance.mk index b2b61ed..0f0e29a 100644 --- a/radiance.mk +++ b/radiance.mk @@ -16,7 +16,10 @@ EXTRA_SIM_PREPROC_DEFINES += \ +define+SV_DPI \ +define+GPR_RESET \ +define+LSU_DUP_DISABLE \ - +define+DBG_TRACE_CORE_PIPELINE_VCS + +define+DBG_TRACE_CORE_PIPELINE_VCS \ + +define+PERF_ENABLE \ + +define+ICACHE_DISABLE +define+DCACHE_DISABLE \ + +define+NUM_THREADS=8 +define+NUM_WARPS=8 # cargo handles building of Rust files all on its own, so make this a PHONY # target to run cargo unconditionally diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index eb63767..df4b215 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit eb63767051779ddb0827746ac03287b009af2a5c +Subproject commit df4b21507eae6fe6ee2003e7a6b79e0a7826eac4 diff --git a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala index ae803d5..701a274 100644 --- a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala +++ b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala @@ -17,10 +17,10 @@ trait CanHaveMemtraceCore { this: BaseSubsystem => // Safe to use get as WithMemtraceCore requires WithNLanes to be defined val simtParam = p(SIMTCoreKey).get val config = DefaultCoalescerConfig.copy( - numLanes = simtParam.nLanes, + numLanes = simtParam.nMemLanes, numOldSrcIds = simtParam.nSrcIds ) - val numLanes = simtParam.nLanes + val numLanes = simtParam.nMemLanes val filename = param.tracefilename // Need to explicitly generate clock domain; see rocket-chip 8881ccd diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index cc5c40e..338c36c 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -12,8 +12,13 @@ import freechips.rocketchip.tilelink._ // TODO: find better place for these -// Note: numNewSrcId is not a 
part of CoreParam, because the SIMT core should be agnostic to how inflight coalesced request can be genertated -case class SIMTCoreParams(nLanes: Int = 4, nSrcIds: Int = 8) +case class SIMTCoreParams( + nWarps: Int = 4, // # of warps in the core + nCoreLanes: Int = 4, // # of SIMT threads in the core + nMemLanes: Int = 4, // # of memory lanes in the memory interface to the + // cache; relates to the LSU lanes + nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes +) case class MemtraceCoreParams( tracefilename: String = "undefined", traceHasSource: Boolean = false @@ -2325,7 +2330,7 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig) // A dummy harness around the coalescer for use in VLSI flow. // Should not instantiate any memtrace modules. class DummyCoalescer(implicit p: Parameters) extends LazyModule { - val numLanes = p(SIMTCoreKey).get.nLanes + val numLanes = p(SIMTCoreKey).get.nMemLanes val config = DefaultCoalescerConfig.copy(numLanes = numLanes) val driver = LazyModule(new DummyDriver(config)) @@ -2362,7 +2367,7 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) // tracedriver --> coalescer --> tracelogger --> tlram class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { - val numLanes = p(SIMTCoreKey).get.nLanes + val numLanes = p(SIMTCoreKey).get.nMemLanes val config = DefaultCoalescerConfig.copy(numLanes = numLanes) val driver = LazyModule(new MemTraceDriver(config, filename)) @@ -2454,7 +2459,7 @@ class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit // tracedriver --> coalescer --> tlram class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { - val numLanes = p(SIMTCoreKey).get.nLanes + val numLanes = p(SIMTCoreKey).get.nMemLanes val config = DefaultCoalescerConfig.copy(numLanes = numLanes) val filename = "vecadd.core1.thread4.trace" diff --git a/src/main/scala/radiance/memory/UnitTest.scala 
b/src/main/scala/radiance/memory/UnitTest.scala
index 24ea69d..c070ef4 100644
--- a/src/main/scala/radiance/memory/UnitTest.scala
+++ b/src/main/scala/radiance/memory/UnitTest.scala
@@ -8,7 +8,7 @@ import freechips.rocketchip.subsystem.{BaseSubsystemConfig}
 import freechips.rocketchip.devices.tilelink._
 import freechips.rocketchip.tilelink._
 import freechips.rocketchip.util._
-import radiance.subsystem.WithSimtLanes
+import radiance.subsystem.WithSimtConfig
 import freechips.rocketchip.unittest._
 
 //import rocket.VortexFatBankTest
@@ -27,7 +27,7 @@ class WithCoalescingUnitTests extends Config((site, _, _) => {
       // Module(new TLRAMCoalescerLoggerTest(filename="sfilter.core1.thread4.trace", timeout=timeout)),
       // Module(new TLRAMCoalescerLoggerTest(filename="nearn.core1.thread4.trace", timeout=50000000 * site(TestDurationMultiplier))),
       // Module(new TLRAMCoalescerLoggerTest(filename="psort.core1.thread4.trace", timeout=timeout)),
-      // Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.trace", timeout=timeout)(new WithSimtLanes(32))),
+      // Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.trace", timeout=timeout)(new WithSimtConfig(nMemLanes=32))),
       // Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.lane4.trace", timeout=timeout)),
     ) }
 })
@@ -48,12 +48,12 @@ class WithCoalescingUnitSynthesisDummy(nLanes: Int) extends Config((site, _, _)
     implicit val p = q
     val timeout = 50000 * site(TestDurationMultiplier)
     Seq(
-      Module(new DummyCoalescerTest(timeout=timeout)(new WithSimtLanes(nLanes=4))),
+      Module(new DummyCoalescerTest(timeout=timeout)(new WithSimtConfig(nMemLanes=nLanes))), // honor the requested lane count instead of a fixed 4
     ) }
 })
 
-class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtLanes(nLanes=4) ++ new BaseSubsystemConfig)
-//class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtLanes(nLanes=4) ++ new
BaseSubsystemConfig) +class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nMemLanes=4) ++ new BaseSubsystemConfig) +//class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nLanes=4) ++ new BaseSubsystemConfig) // Dummy configs of various sizes for synthesis class CoalescingSynthesisDummyLane4Config extends Config(new WithCoalescingUnitSynthesisDummy(4) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig) diff --git a/src/main/scala/radiance/memory/VortexCache.scala b/src/main/scala/radiance/memory/VortexCache.scala index e3015ce..61a841c 100644 --- a/src/main/scala/radiance/memory/VortexCache.scala +++ b/src/main/scala/radiance/memory/VortexCache.scala @@ -10,6 +10,7 @@ import org.chipsalliance.cde.config.{Parameters, Field} case object VortexL1Key extends Field[Option[VortexL1Config]](None /*default*/ ) case class VortexL1Config( + cacheSize: Int, // total cache size in bytes numBanks: Int, wordSize: Int, // This is the read/write granularity of the L1 cache cacheLineSize: Int, @@ -34,6 +35,7 @@ case class VortexL1Config( object defaultVortexL1Config extends VortexL1Config( + cacheSize = 16384, numBanks = 4, wordSize = 16, cacheLineSize = 16, @@ -98,7 +100,7 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters) TLMasterPortParameters.v1( clients = Seq( TLMasterParameters.v1( - name = "VortexBank", + name = "VortexBankPassthrough", sourceId = IdRange( 0, 1 << (log2Ceil( @@ -173,7 +175,7 @@ class VortexBank( TLMasterPortParameters.v1( clients = Seq( TLMasterParameters.v1( - name = "VortexBank", + name = s"VortexBank${bankId}", sourceId = IdRange(0, config.memSideSourceIds), supportsProbe = TransferSizes(1, config.wordSize), supportsGet = TransferSizes(1, config.wordSize), @@ -203,6 +205,8 @@ class VortexBankImp( val vxCache = Module( new VX_cache_top( WORD_SIZE = config.wordSize, + // 
distribute total size across numBanks + CACHE_SIZE = config.cacheSize / config.numBanks, CACHE_LINE_SIZE = config.cacheLineSize, CORE_TAG_WIDTH = config.coreTagPlusSizeWidth, MSHR_SIZE = config.mshrSize @@ -389,7 +393,7 @@ class VortexBankImp( class VX_cache_top( // these values should match the default settings in Verilog // TODO: INSTANCE_ID - CACHE_SIZE: Int = 16384 / 4, // 1, "CACHE_SIZE" -> CACHE_SIZE, "LINE_SIZE" -> CACHE_LINE_SIZE, @@ -606,7 +610,7 @@ class NewSourceGenerator[T <: Data]( oldestMetadata := occupancyTable(oldestIndex).meta oldestAge := occupancyTable(oldestIndex).age assert( - oldestAge <= 2000.U, + oldestAge <= 10000.U, "One id in the SourceGen is not released for long time, potential bug !" ) diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 77b0711..95e6318 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -13,10 +13,12 @@ import radiance.memory._ class WithRadianceCores( n: Int, + location: HierarchicalLocation, + crossing: RocketCrossingParams, useVxCache: Boolean ) extends Config((site, _, up) => { - case TilesLocated(InSubsystem) => { - val prev = up(TilesLocated(InSubsystem), site) + case TilesLocated(`location`) => { + val prev = up(TilesLocated(`location`), site) val idOffset = prev.size val vortex = RadianceTileParams( core = VortexCoreParams(fpu = None), @@ -43,10 +45,19 @@ class WithRadianceCores( blockBytes = site(CacheBlockBytes)))) List.tabulate(n)(i => RadianceTileAttachParams( vortex.copy(tileId = i + idOffset), - RocketCrossingParams() + crossing )) ++ prev } -}) +}) { + def this(n: Int, location: HierarchicalLocation = InSubsystem, useVxCache: Boolean = false) = this(n, location, RocketCrossingParams( + master = HierarchicalElementMasterPortParams.locationDefault(location), + slave = HierarchicalElementSlavePortParams.locationDefault(location), + mmioBaseAddressPrefixWhere = location match { 
+ case InSubsystem => CBUS + case InCluster(clusterId) => CCBUS(clusterId) + } + ), useVxCache) +} class WithFuzzerCores( n: Int, @@ -65,11 +76,33 @@ class WithFuzzerCores( } }) +class WithRadianceCluster( + clusterId: Int, + location: HierarchicalLocation = InSubsystem, + crossing: RocketCrossingParams = RocketCrossingParams() // TODO make this not rocket +) extends Config((site, here, up) => { + case ClustersLocated(`location`) => up(ClustersLocated(location)) :+ RadianceClusterAttachParams( + RadianceClusterParams(clusterId = clusterId), + crossing) + case TLNetworkTopologyLocated(InCluster(`clusterId`)) => List( + ClusterBusTopologyParams( + clusterId = clusterId, + csbus = site(SystemBusKey), + ccbus = site(ControlBusKey).copy(errorDevice = None), + coherence = site(ClusterBankedCoherenceKey(clusterId)) + ) + ) + case PossibleTileLocations => up(PossibleTileLocations) :+ InCluster(clusterId) +}) + // `nSrcIds`: number of source IDs for dmem requests on each SIMT lane -class WithSimtLanes(nLanes: Int, nSrcIds: Int = 8) extends Config((site, _, up) => { +class WithSimtConfig(nWarps: Int = 4, nCoreLanes: Int = 4, nMemLanes: Int = 4, nSrcIds: Int = 8) +extends Config((site, _, up) => { case SIMTCoreKey => { Some(up(SIMTCoreKey, site).getOrElse(SIMTCoreParams()).copy( - nLanes = nLanes, + nWarps = nWarps, + nCoreLanes = nCoreLanes, + nMemLanes = nMemLanes, nSrcIds = nSrcIds )) } @@ -105,7 +138,7 @@ class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => { class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config((site, _, up) => { case CoalescerKey => { val (nLanes, numOldSrcIds) = up(SIMTCoreKey, site) match { - case Some(param) => (param.nLanes, param.nSrcIds) + case Some(param) => (param.nMemLanes, param.nSrcIds) case None => (1,1) } @@ -182,4 +215,4 @@ class WithExtGPUMem(address: BigInt = BigInt("0x100000000", 16), }) }) case class GPUMemParams(address: BigInt = BigInt("0x100000000", 16), size: BigInt = 0x80000000) 
-case class GPUMemory() extends Field[Option[GPUMemParams]](None) \ No newline at end of file +case class GPUMemory() extends Field[Option[GPUMemParams]](None) diff --git a/src/main/scala/radiance/subsystem/RadianceSubsystem.scala b/src/main/scala/radiance/subsystem/RadianceSubsystem.scala index f9fb0bf..8979fb5 100644 --- a/src/main/scala/radiance/subsystem/RadianceSubsystem.scala +++ b/src/main/scala/radiance/subsystem/RadianceSubsystem.scala @@ -9,3 +9,10 @@ case class RadianceTileAttachParams( tileParams: RadianceTileParams, crossingParams: RocketCrossingParams ) extends CanAttachTile { type TileType = RadianceTile } + +case class RadianceClusterAttachParams ( + clusterParams: RadianceClusterParams, + crossingParams: HierarchicalElementCrossingParamsLike +) extends CanAttachCluster { + type ClusterType = RadianceCluster +} diff --git a/src/main/scala/radiance/tile/Barrier.scala b/src/main/scala/radiance/tile/Barrier.scala new file mode 100644 index 0000000..e3e965b --- /dev/null +++ b/src/main/scala/radiance/tile/Barrier.scala @@ -0,0 +1,98 @@ +// See LICENSE.SiFive for license details. +// See LICENSE.Berkeley for license details. 
+
+package radiance.tile
+
+import chisel3._
+import chisel3.experimental.SourceInfo
+import chisel3.util._
+
+import org.chipsalliance.cde.config.{Field, Parameters}
+import freechips.rocketchip.subsystem._
+import freechips.rocketchip.diplomacy._
+
+case class EmptyParams() // placeholder: the sink (slave) side of a barrier edge contributes no parameters
+
+case class BarrierParams(
+  barrierIdBits: Int, // width of the barrier ID field; BarrierSynchronizer tracks (1 << barrierIdBits) barriers
+  numCoreBits: Int // width of the core ID field; BarrierSynchronizer exposes (1 << numCoreBits) request ports
+)
+
+class BarrierRequestBits(
+  param: BarrierParams
+) extends Bundle {
+  val barrierId = UInt(param.barrierIdBits.W)
+  val sizeMinusOne = UInt(param.numCoreBits.W) // carried in the request, but not read by BarrierSynchronizer
+  val coreId = UInt(param.numCoreBits.W)
+}
+
+class BarrierResponseBits(
+  param: BarrierParams
+) extends Bundle {
+  val barrierId = UInt(param.barrierIdBits.W)
+}
+
+class BarrierBundle(param: BarrierParams) extends Bundle {
+  val req = Decoupled(new BarrierRequestBits(param)) // master -> slave: "this core reached barrier barrierId"
+  val resp = Flipped(Decoupled(new BarrierResponseBits(param))) // slave -> master: barrier barrierId is released
+}
+
+// Diplomacy node implementation for barrier edges. FIXME Separate BarrierEdgeParams from BarrierParams
+object BarrierNodeImp extends SimpleNodeImp[BarrierParams, EmptyParams, BarrierParams, BarrierBundle] {
+  def edge(pd: BarrierParams, pu: EmptyParams, p: Parameters, sourceInfo: SourceInfo) = {
+    // barrier parameters flow strictly downward from the master node, so the edge is just the master's params
+    pd
+  }
+  def bundle(e: BarrierParams) = new BarrierBundle(e)
+  // FIXME render (placeholder colour/label for the diplomacy graph)
+  def render(e: BarrierParams) = RenderedEdge(colour = "ffffff", label = "X")
+}
+
+case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: ValName)
+  extends SourceNode(BarrierNodeImp)(Seq(srcParams))
+case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName)
+  extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams()))
+
+class BarrierSynchronizer(param: BarrierParams) extends Module {
+  val numBarrierIds = 1 << param.barrierIdBits
+  val numCores = 1 << param.numCoreBits
+  println(s"numBarrierIds: ${numBarrierIds}, numCores: ${numCores}") // elaboration-time debug print
+
+  val io = IO(new Bundle {
+    val reqs = Vec(numCores, Flipped(Decoupled(new BarrierRequestBits(param))))
+    val resp = Decoupled(new BarrierResponseBits(param))
+  })
+
+  // 2-dimensional table of per-id, per-core "done" signals; table(id)(core) is set when that core's request for id fires
+  val table = RegInit(VecInit(Seq.fill(numBarrierIds)(VecInit(Seq.fill(numCores)(false.B)))))
+  val done = Wire(Vec(numBarrierIds, Bool()))
+  table.zipWithIndex.foreach { case (row, i) =>
+    done(i) := row.reduce(_ && _) // barrier i is complete once every core's bit in its row is set
+  }
+  dontTouch(done)
+
+  io.reqs.zipWithIndex.foreach { case (req, coreId) =>
+    // always ready; all this module does is latch to boolean regs
+    req.ready := true.B
+    when(req.fire) {
+      assert(coreId.U === req.bits.coreId) // the port index must match the coreId carried in the request
+      // FIXME: don't need coreId to be hardware here
+      table(req.bits.barrierId)(coreId.U) := true.B
+    }
+  }
+
+  val doneArbiter = Module(new RRArbiter(Bool(), numBarrierIds)) // picks one completed barrier at a time
+  (doneArbiter.io.in zip done).zipWithIndex.foreach { case ((in, d), i) =>
+    in.valid := d
+    in.bits := d
+    when(in.fire) {
+      table(i).foreach(_ := false.B) // response accepted: clear the row so barrier i can be reused
+    }
+  }
+  io.resp.valid := doneArbiter.io.out.valid
+  io.resp.bits.barrierId := doneArbiter.io.chosen // index of the granted arbiter input is the released barrier ID
+  when(io.resp.fire) {
+    table(io.resp.bits.barrierId).foreach(_ := false.B) // NOTE(review): appears to duplicate the per-input clear above — confirm
+  }
+  doneArbiter.io.out.ready := io.resp.ready
+}
diff --git a/src/main/scala/radiance/tile/FuzzerTile.scala b/src/main/scala/radiance/tile/FuzzerTile.scala
index e76342e..c139744 100644
--- a/src/main/scala/radiance/tile/FuzzerTile.scala
+++ b/src/main/scala/radiance/tile/FuzzerTile.scala
@@ -60,7 +60,7 @@ class FuzzerTile private (
   // val statusNode = BundleBridgeSource(() => new GroundTestStatus)
 
   val (numLanes, numSrcIds) = p(SIMTCoreKey) match {
-    case Some(param) => (param.nLanes, param.nSrcIds)
+    case Some(param) => (param.nMemLanes, param.nSrcIds)
     case None => {
       require(false, "fuzzer requires SIMTCoreKey to be defined")
       (0, 0)
diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala
new file mode 100644
index 0000000..7d6752c
--- /dev/null
+++ b/src/main/scala/radiance/tile/RadianceCluster.scala
@@ -0,0 +1,150 @@
+// See LICENSE.SiFive for license details.
+// See LICENSE.Berkeley for license details.
+
+package radiance.tile
+
+import chisel3._
+import chisel3.experimental.SourceInfo
+import chisel3.util._
+
+import org.chipsalliance.cde.config.Parameters
+import freechips.rocketchip.subsystem._
+import freechips.rocketchip.tilelink._
+import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, SimpleDevice, ClockCrossingType}
+import freechips.rocketchip.regmapper.RegField
+import freechips.rocketchip.prci.ClockSinkParameters
+
+case class RadianceClusterParams(
+  val clusterId: Int,
+  val clockSinkParams: ClockSinkParameters = ClockSinkParameters()
+) extends InstantiableClusterParams[RadianceCluster] {
+  val baseName = "radiance_cluster"
+  val uniqueName = s"${baseName}_$clusterId" // e.g. "radiance_cluster_0"
+  def instantiate(crossing: HierarchicalElementCrossingParamsLike, lookup: LookupByClusterIdImpl)(implicit p: Parameters): RadianceCluster = {
+    new RadianceCluster(this, crossing.crossingType, lookup)
+  }
+}
+
+class RadianceCluster (
+  thisClusterParams: RadianceClusterParams,
+  crossing: ClockCrossingType,
+  lookup: LookupByClusterIdImpl
+)(implicit p: Parameters) extends Cluster(thisClusterParams, crossing, lookup) {
+  // cluster-local bus, used for shared memory traffic that never leaves the
+  // confines of a cluster
+  val clbus = tlBusWrapperLocationMap(CLBUS(clusterId))
+
+  clbus.clockGroupNode := allClockGroupsNode
+
+  // Instantiate cluster-local shared memory scratchpad
+  //
+  // Instantiate the same number of banks as there are lanes.
+  val numLsuLanes = 4 // FIXME: hardcoded
+  val wordSize = 4 // bytes per word; also used as beatBytes for the smem banks and the barrier register node
+  val smemBanks = Seq.tabulate(numLsuLanes) { bankId =>
+    // Banked-by-word (4 bytes)
+    // base for bank 1: ff...000000|01|00
+    // mask for bank 1; 00...111111|00|11
+    val base = 0xff000000L | (bankId * wordSize)
+    val mask = 0x00001fffL ^ ((numLsuLanes - 1) * wordSize) // 8 KiB window with the bank-select bits cleared
+    LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = wordSize))
+  }
+  smemBanks.foreach(_.node := clbus.outwardNode)
+
+  val numCores = leafTiles.size
+
+  // Diplomacy sink nodes for cluster-wide barrier sync signal; one sink edge per entry in leafTiles
+  val barrierSlaveNode = BarrierSlaveNode(numCores)
+
+  // HACK: This is a workaround of the CanAttachTile bus connecting API that
+  // works by downcasting tile and directly accessing the node inside that is
+  // not exposed as a master in HierarchicalElementCrossingParamsLike.
+  // val tile = leafTiles(0).asInstanceOf[RadianceTile]
+  // val perSmemPortXbars = Seq.fill(tile.smemNodes.size) { LazyModule(new TLXbar) }
+
+  // Tie corresponding smem ports from every tile into a single port using
+  // Xbars so that the number of ports going into the sharedmem do not scale
+  // with the number of tiles.
+  leafTiles.foreach { case (id, tile: RadianceTile) => // NOTE: an entry that is not a RadianceTile would throw a MatchError here
+    // (perSmemPortXbars zip tile.smemNodes).foreach {
+    //   case (xbar, node) => xbar.node := node
+    // }
+    tile.smemNodes.foreach(clbus.inwardNode := _)
+    barrierSlaveNode := tile.barrierMasterNode
+  }
+  // perSmemPortXbars.foreach { clbus.inwardNode := _.node }
+
+  // Memory-mapped register for barrier sync
+  val regDevice = new SimpleDevice("radiance-cluster-barrier-reg",
+    Seq(s"radiance-cluster-barrier-reg${clusterId}"))
+  val regNode = TLRegisterNode(
+    address = Seq(AddressSet(0xff003f00L, 0xff)), // 256-byte register window
+    device = regDevice,
+    beatBytes = wordSize,
+    concurrency = 1)
+  regNode := clbus.outwardNode
+
+  nodes.foreach({ node =>
+    println(s"======= RadianceCluster node.name: ${node.name}") // elaboration-time debug print
+  })
+
+  override lazy val module = new RadianceClusterModuleImp(this)
+}
+
+class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(outer) {
+  outer.leafTiles.foreach { case (id, tile: RadianceTile) =>
+    // println(s"======= RadianceCluster: tile.smemXbar.node.edge = ${tile.smemXbar.node.out.size}")
+    println(s"======= RadianceCluster: clbus inward edges = ${outer.clbus.inwardNode.inward.inputs.length}")
+    println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}")
+  }
+
+  val numBarriers = 4 // FIXME: hardcoded; number of memory-mapped barrier register groups created below
+
+  // @cleanup: This assumes barrier params on all edges are the same, i.e. all
+  // cores are configured to have the same barrier id range.
While true, might
+  // be better to actually assert this
+  val barrierParam = outer.barrierSlaveNode.in(0)._2
+  val synchronizer = Module(new BarrierSynchronizer(barrierParam))
+  (synchronizer.io.reqs zip outer.barrierSlaveNode.in).foreach { case (req, (b, _)) =>
+    req <> b.req
+    b.resp <> synchronizer.io.resp // broadcast
+  }
+
+  // outer.barrierSlaveNode.in.foreach { case (b, e) =>
+  //   val fakeBarrierRespId = RegNext(b.req.bits.barrierId)
+  //   val fakeBarrierRespValid = RegNext(b.req.fire)
+  //   b.req.ready := true.B // barrier module is always ready
+  //   b.resp.valid := fakeBarrierRespValid
+  //   b.resp.bits.barrierId := fakeBarrierRespId
+  // }
+
+  val allSyncedRegs = Seq.fill(numBarriers)(Wire(UInt(32.W)))
+  val perCoreSyncedRegs = Seq.fill(numBarriers)(Seq.fill(outer.numCores)(RegInit(0.U(32.W))))
+  (allSyncedRegs zip perCoreSyncedRegs).foreach{ case (all, per) =>
+    all := per.map(_ =/= 0.U).reduce(_ && _) // 1 iff every core's register is non-zero (also a proper flag for a single core)
+
+    val allPassed = per.map(_ === 2.U).reduce(_ && _)
+    when(allPassed) {
+      per.foreach(_ := 0.U)
+    }
+
+    dontTouch(all)
+  }
+  // Register layout (identical to the previous hand-written map for 2 cores):
+  //   barrier b lives at b * groupBytes; word 0 is the read-only "all cores
+  //   synced" flag and word (1 + c) is core c's read/write sync register.
+  // Generated from numBarriers/numCores so that every core gets a register.
+  val groupBytes = 0x10 max (1 << log2Ceil(outer.wordSize * (outer.numCores + 1)))
+  require(numBarriers * groupBytes <= 0x100,
+    "barrier registers do not fit in the 256-byte regNode window")
+  outer.regNode.regmap(
+    (allSyncedRegs zip perCoreSyncedRegs).zipWithIndex.flatMap { case ((all, per), b) =>
+      val base = b * groupBytes
+      val perCore = per.zipWithIndex.map { case (reg, c) =>
+        (base + outer.wordSize * (c + 1)) -> Seq(RegField(32, reg))
+      }
+      (base -> Seq(RegField.r(32, all))) +: perCore
+    }: _*)
+
+  println(s"======== barrierSlaveNode: ${outer.barrierSlaveNode.in(0)._2.barrierIdBits}")
+}
diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala
index cbcf904..59c6fed
100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -140,10 +140,21 @@ class RadianceTile private ( require( p(SIMTCoreKey).isDefined, - "SIMTCoreKey not defined; make sure to use WithSimtLanes when using RadianceTile" + "SIMTCoreKey not defined; make sure to use WithSimtConfig when using RadianceTile" ) - val numLanes = p(SIMTCoreKey) match { - case Some(simtParam) => simtParam.nLanes + + // NOTE: when changing these, remember to change +define+NUM_CORES/THREADS/WARPS in + // radiance.mk as well! + val numWarps = p(SIMTCoreKey) match { + case Some(simtParam) => simtParam.nWarps + case None => 4 + } + val numCoreLanes = p(SIMTCoreKey) match { + case Some(simtParam) => simtParam.nCoreLanes + case None => 4 + } + val numLsuLanes = p(SIMTCoreKey) match { + case Some(simtParam) => simtParam.nMemLanes case None => 4 } @@ -170,13 +181,14 @@ class RadianceTile private ( val smemSourceWidth = 4 // FIXME: hardcoded - val numWarps = 4 // TODO: parametrize + // Replicates some of the logic of how Vortex determines the tag width of + // memory requests so that Chisel and Verilog are in agreement on bitwidths. 
+ // See VX_gpu_pkg.sv val NW_WIDTH = (if (numWarps == 1) 1 else log2Ceil(numWarps)) val UUID_WIDTH = 44 val imemTagWidth = UUID_WIDTH + NW_WIDTH - val numLsuLanes = 4 - // see VX_gpu_pkg.sv - val LSUQ_SIZE = 8 * (numLanes / numLsuLanes) + + val LSUQ_SIZE = 8 * (numCoreLanes / numLsuLanes) val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/ val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS // dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH @@ -291,15 +303,13 @@ class RadianceTile private ( // Conditionally instantiate L1 cache val (icacheNode, dcacheNode): (TLNode, TLNode) = p(VortexL1Key) match { case Some(vortexL1Config) => { - println( - s"============ Using Vortex L1 cache =================" - ) + println("VortexL1Cache instantiated") // require( // p(CoalescerKey).isDefined, // "Vortex L1 configuration currently only works when coalescer is also enabled." // ) - val icache = LazyModule(new VortexL1Cache(vortexL1Config)) + val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(numBanks = 1))) val dcache = LazyModule(new VortexL1Cache(vortexL1Config)) // imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ } assert(imemNodes.length == 1) // FIXME @@ -316,22 +326,10 @@ class RadianceTile private ( } } - // Instantiate sharedmem banks - // - // Instantiate the same number of banks as there are lanes. 
- // TODO: parametrize - // val smemBanks = Seq.tabulate(numLsuLanes) { bankId => - // // Banked-by-word (4 bytes) - // // base for bank 1: ff...000000|01|00 - // // mask for bank 1; 00...111111|00|11 - // val base = 0xff000000L | (bankId * 4 /*wordSize*/ ) - // val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ ) - // LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = 4 /*wordSize*/ )) - // } - // smem lanes-to-banks crossbar - val smemXbar = LazyModule(new TLXbar) - smemNodes.foreach(smemXbar.node := _) - // smemBanks.foreach(_.node := smemXbar.node) + // Barrier synchronization node + // FIXME: hardcoded params + val barrierParams = BarrierParams(barrierIdBits = 2, numCoreBits = 1) + val barrierMasterNode = BarrierMasterNode(barrierParams) val base = p(GPUMemory()) match { case Some(GPUMemParams(baseAddr, _)) => baseAddr @@ -346,7 +344,6 @@ class RadianceTile private ( tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode } - // ROCC // TODO: parametrize val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy( @@ -371,14 +368,14 @@ class RadianceTile private ( tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode // MMIO - gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node + // gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node // sharedmem access // // FIXME: gemmini spad has 16B data width; core smem interface has 4B. 
Need // to consolidate by either coalescing, or changing gemmini spad to // strided-by-word - gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node - TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node + // gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node + // TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node /* below are copied from rocket */ @@ -462,6 +459,10 @@ class RadianceTileModuleImp(outer: RadianceTile) extends BaseTileModuleImp(outer) { Annotated.params(this, outer.radianceParams) + auto.elements.foreach({case (name, _) => + println(s"======= RadianceTile.elements.name: ${name}") + }) + val core = Module(new Vortex(outer)(outer.p)) core.io.clock := clock @@ -532,6 +533,11 @@ class RadianceTileModuleImp(outer: RadianceTile) // TODO: make imemNodes not a vector imemTLAdapter.io.inReq <> core.io.imem.get(0).a core.io.imem.get(0).d <> imemTLAdapter.io.inResp + + performanceCounters(Seq(imemTLAdapter.io.inReq), Seq(imemTLAdapter.io.inResp), + desc = s"core${outer.tileId}-imem") + + // now connect TL adapter downstream ports to the tile egress ports outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d } @@ -629,6 +635,10 @@ class RadianceTileModuleImp(outer: RadianceTile) } core.io.dmem_d_valid := dmem_d_valid_vec.asUInt + performanceCounters(dmemTLAdapters.map(_.io.inReq), dmemTLAdapters.map(_.io.inResp), + desc = s"core${outer.tileId}-dmem") + + // now connect TL adapter downstream ports to the tile egress ports (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) => tlOut.a <> tlAdapter.io.outReq tlAdapter.io.outResp <> tlOut.d @@ -678,47 +688,114 @@ class RadianceTileModuleImp(outer: RadianceTile) tlAdapter.io.inResp.ready := core.io.smem_d_ready(i) } + performanceCounters(smemTLAdapters.map(_.io.inReq), smemTLAdapters.map(_.io.inResp), + desc = s"core${outer.tileId}-smem") + + // now connect TL adapter 
downstream ports to the tile egress ports
     (smemTLAdapters zip smemTLBundles) foreach { case (tlAdapter, tlOut) =>
       tlOut.a <> tlAdapter.io.outReq
       tlAdapter.io.outResp <> tlOut.d
     }
   }
+  // Wires the core's global-barrier (gbar_*) ports to the tile's single barrier master edge.
+  def connectBarrier = {
+    require(outer.barrierMasterNode.out.length == 1)
+    // FIXME: bits not flattened
+    outer.barrierMasterNode.out(0)._1.req.valid := core.io.gbar_req_valid
+    outer.barrierMasterNode.out(0)._1.req.bits.barrierId := core.io.gbar_req_id
+    outer.barrierMasterNode.out(0)._1.req.bits.sizeMinusOne := core.io.gbar_req_size_m1 // was left undriven
+    outer.barrierMasterNode.out(0)._1.req.bits.coreId := core.io.gbar_req_core_id
+    core.io.gbar_req_ready := outer.barrierMasterNode.out(0)._1.req.ready
+    core.io.gbar_rsp_valid := outer.barrierMasterNode.out(0)._1.resp.valid
+    core.io.gbar_rsp_id := outer.barrierMasterNode.out(0)._1.resp.bits.barrierId
+    outer.barrierMasterNode.out(0)._1.resp.ready := true.B // core doesn't have a resp.ready port
+  }
+
+  def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]],
+                          respBundles: Seq[DecoupledIO[VortexBundleD]],
+                          desc: String) = {
+    val currentPendingReqs = RegInit(SInt(32.W), 0.S)
+    val pendingReqsCumulative = RegInit(SInt(32.W), 0.S)
+    val totalReqs = RegInit(UInt(32.W), 0.U)
+
+    val reqFireCountPerCycle = Wire(UInt(32.W))
+    val respFireCountPerCycle = Wire(UInt(32.W))
+    val reqReadFires = reqBundles.map { b => b.fire && b.bits.opcode === 4.U /* Get */ }
+    val respReadFires = respBundles.map { b => b.fire && b.bits.opcode === 1.U /* AccessAckData */}
+    reqFireCountPerCycle := PopCount(reqReadFires)
+    respFireCountPerCycle := PopCount(respReadFires)
+    totalReqs := totalReqs + reqFireCountPerCycle
+
+    val diffPendingReqs = reqFireCountPerCycle.asSInt - respFireCountPerCycle.asSInt
+    currentPendingReqs := currentPendingReqs + diffPendingReqs
+    pendingReqsCumulative := pendingReqsCumulative + currentPendingReqs
+
+    val prevFinished = RegNext(core.io.finished)
+    val justFinished = !prevFinished && core.io.finished
+    when (justFinished) {
+      printf(s"PERF: ${desc}: average request latency (cum_pending / total): %d / %d\n",
pendingReqsCumulative, totalReqs) + } + + dontTouch(totalReqs) + dontTouch(diffPendingReqs) + dontTouch(currentPendingReqs) + dontTouch(pendingReqsCumulative) + } + connectImem connectDmem connectSmem + connectBarrier } // TODO: generalize for useVxCache if (!outer.radianceParams.useVxCache) {} - // RoCC - if (outer.roccs.size > 0) { - val (respArb, cmdRouter) = { - val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size)) - val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p)) - outer.roccs.zipWithIndex.foreach { case (rocc, i) => - // ptwPorts ++= rocc.module.io.ptw - rocc.module.io.ptw <> DontCare - rocc.module.io.mem <> DontCare - rocc.module.io.cmd <> cmdRouter.io.out(i) - respArb.io.in(i) <> Queue(rocc.module.io.resp) - } - // Create this FPU just for RoCC - // val nFPUPorts = outer.roccs.filter(_.usesFPU).size - val fp_rocc_ios = outer.roccs.map(_.module.io) - fp_rocc_ios.map { io => - io.fpu_req.ready := false.B - io.fpu_resp.valid := false.B - io.fpu_resp.bits := DontCare - } - (respArb, cmdRouter) - } + // // RoCC + // if (outer.roccs.size > 0) { + // val (respArb, cmdRouter) = { + // val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size)) + // val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p)) + // outer.roccs.zipWithIndex.foreach { case (rocc, i) => + // // ptwPorts ++= rocc.module.io.ptw + // rocc.module.io.ptw <> DontCare + // rocc.module.io.mem <> DontCare + // rocc.module.io.cmd <> cmdRouter.io.out(i) + // respArb.io.in(i) <> Queue(rocc.module.io.resp) + // } + // // Create this FPU just for RoCC + // // val nFPUPorts = outer.roccs.filter(_.usesFPU).size + // val fp_rocc_ios = outer.roccs.map(_.module.io) + // fp_rocc_ios.map { io => + // io.fpu_req.ready := false.B + // io.fpu_resp.valid := false.B + // io.fpu_resp.bits := DontCare + // } + // (respArb, cmdRouter) + // } - cmdRouter.io.in <> DontCare - 
outer.roccs.foreach(_.module.io.exception := DontCare) - respArb.io.out <> DontCare - } + // cmdRouter.io.in <> DontCare + // outer.roccs.foreach(_.module.io.exception := DontCare) + // respArb.io.out <> DontCare + // } +} + +class ClusterSynchronizer( + barrierIdWidth: Int, + numCoreWidth: Int, +) extends Module { + val io = IO(new Bundle { + val req = Flipped(Decoupled(new Bundle { + val barrierId = UInt(barrierIdWidth.W) + val sizeMinusOne = UInt(numCoreWidth.W) + val coreId = UInt(numCoreWidth.W) + })) + val resp = Decoupled(new Bundle { + val barrierId = UInt(barrierIdWidth.W) + }) + }) } // Some @copypaste from CoalescerSourceGen. @@ -768,7 +845,6 @@ class VortexTLAdapter( io.outReq.bits.corrupt := 0.U io.inReq.ready := io.outReq.ready // VortexBundleD <> TLBundleD - // Filtering out write requests is handled inside the wrapper Verilog io.inResp.valid := io.outResp.valid io.inResp.bits.opcode := io.outResp.bits.opcode io.inResp.bits.size := io.outResp.bits.size diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index e468ee3..7a06f7d 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -41,18 +41,11 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val interrupts = Input(new freechips.rocketchip.rocket.CoreInterrupts(false/*hasBeu*/)) // conditionally instantiate ports depending on whether we want to use VX_cache or not + // TODO: flatten this like dmem and smem val imem = if (!tile.radianceParams.useVxCache) Some(Vec(1, new Bundle { val a = Decoupled(new VortexBundleA(tagWidth = tile.imemTagWidth, dataWidth = 32)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.imemTagWidth, dataWidth = 32))) })) else None - val dmem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle { - // val a = Decoupled(new VortexBundleA(tagWidth = tile.dmemTagWidth, dataWidth = 32)) - // val d = 
Flipped(Decoupled(new VortexBundleD(tagWidth = dmemTagWidth, dataWidth = 32))) - })) else None - val smem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle { - // val a = Decoupled(new VortexBundleA(tagWidth = tile.smemTagWidth, dataWidth = 32)) - // val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.smemTagWidth, dataWidth = 32))) - })) else None val mem = if (tile.radianceParams.useVxCache) Some(new Bundle { val a = Decoupled(new VortexBundleA(tagWidth = 15, dataWidth = 128)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = 15, dataWidth = 128))) @@ -96,6 +89,17 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val smem_d_bits_data = Input(UInt((tile.numLsuLanes * 32).W)) val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W)) + // FIXME: hardcoded + val NB_WIDTH = 2 + val NC_WIDTH = 1 + val gbar_req_valid = Output(Bool()) + val gbar_req_id = Output(UInt(NB_WIDTH.W)) + val gbar_req_size_m1 = Output(UInt(NC_WIDTH.W)) + val gbar_req_core_id = Output(UInt(NC_WIDTH.W)) + val gbar_req_ready = Input(Bool()) + val gbar_rsp_valid = Input(Bool()) + val gbar_rsp_id = Input(UInt(NB_WIDTH.W)) + // val fpu = Flipped(new FPUCoreIO()) //val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs)) //val trace = Output(new TraceBundle) @@ -112,6 +116,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // see VX_csr_data that implements the read logic for CSR_MHARTID/GWID. Map( "CORE_ID" -> tile.tileParams.tileId, + "CORES_PER_CLUSTER" -> 2, // FIXME: hardcoded // TODO: can we get this as a parameter? 
"BOOTROM_HANG100" -> 0x10100, "NUM_THREADS" -> tile.numLsuLanes @@ -194,10 +199,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_tags.sv") // addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_wrap.sv") - // gbar is only used in the socket/cluster hierarchy - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv") - // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") // mem_arb is used in VX_socket or VX_cache_cluster // addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_arb.sv") addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_bus_if.sv") @@ -217,6 +218,14 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_define.vh") // addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_wrap.sv") + // used when PERF_ENABLE is defined + addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_perf_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_pipeline_perf_if.sv") + // used when GBAR_ENABLE is defined + addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv") + // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") + // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv") @@ -245,6 +254,9 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_mux.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_pending_size.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_register.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_toggle_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_buffer.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_popcount.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_arbiter.sv") 
addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_encoder.sv")