Merge branch 'main' of https://github.com/ucb-bar/radiance into main

This commit is contained in:
Richard Yan
2024-03-20 16:42:54 -07:00
13 changed files with 489 additions and 101 deletions

View File

@@ -17,10 +17,10 @@ trait CanHaveMemtraceCore { this: BaseSubsystem =>
// Safe to use get as WithMemtraceCore requires WithNLanes to be defined
val simtParam = p(SIMTCoreKey).get
val config = DefaultCoalescerConfig.copy(
numLanes = simtParam.nLanes,
numLanes = simtParam.nMemLanes,
numOldSrcIds = simtParam.nSrcIds
)
val numLanes = simtParam.nLanes
val numLanes = simtParam.nMemLanes
val filename = param.tracefilename
// Need to explicitly generate clock domain; see rocket-chip 8881ccd

View File

@@ -12,8 +12,13 @@ import freechips.rocketchip.tilelink._
// TODO: find better place for these
// Note: numNewSrcId is not a part of CoreParam, because the SIMT core should be agnostic to how in-flight coalesced requests are generated
case class SIMTCoreParams(nLanes: Int = 4, nSrcIds: Int = 8)
/** Parameters describing a SIMT core.
  *
  * Every field has a default, so `SIMTCoreParams()` describes a core with
  * 4 warps, 4 core lanes, 4 memory lanes and 8 source IDs per memory lane.
  */
case class SIMTCoreParams(
nWarps: Int = 4, // # of warps in the core
nCoreLanes: Int = 4, // # of SIMT threads in the core
nMemLanes: Int = 4, // # of memory lanes in the memory interface to the
// cache; relates to the LSU lanes
nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes
)
case class MemtraceCoreParams(
tracefilename: String = "undefined",
traceHasSource: Boolean = false
@@ -2325,7 +2330,7 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig)
// A dummy harness around the coalescer for use in VLSI flow.
// Should not instantiate any memtrace modules.
class DummyCoalescer(implicit p: Parameters) extends LazyModule {
val numLanes = p(SIMTCoreKey).get.nLanes
val numLanes = p(SIMTCoreKey).get.nMemLanes
val config = DefaultCoalescerConfig.copy(numLanes = numLanes)
val driver = LazyModule(new DummyDriver(config))
@@ -2362,7 +2367,7 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
// tracedriver --> coalescer --> tracelogger --> tlram
class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters)
extends LazyModule {
val numLanes = p(SIMTCoreKey).get.nLanes
val numLanes = p(SIMTCoreKey).get.nMemLanes
val config = DefaultCoalescerConfig.copy(numLanes = numLanes)
val driver = LazyModule(new MemTraceDriver(config, filename))
@@ -2454,7 +2459,7 @@ class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit
// tracedriver --> coalescer --> tlram
class TLRAMCoalescer(implicit p: Parameters) extends LazyModule {
val numLanes = p(SIMTCoreKey).get.nLanes
val numLanes = p(SIMTCoreKey).get.nMemLanes
val config = DefaultCoalescerConfig.copy(numLanes = numLanes)
val filename = "vecadd.core1.thread4.trace"

View File

@@ -8,7 +8,7 @@ import freechips.rocketchip.subsystem.{BaseSubsystemConfig}
import freechips.rocketchip.devices.tilelink._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.util._
import radiance.subsystem.WithSimtLanes
import radiance.subsystem.WithSimtConfig
import freechips.rocketchip.unittest._
//import rocket.VortexFatBankTest
@@ -27,7 +27,7 @@ class WithCoalescingUnitTests extends Config((site, _, _) => {
// Module(new TLRAMCoalescerLoggerTest(filename="sfilter.core1.thread4.trace", timeout=timeout)),
// Module(new TLRAMCoalescerLoggerTest(filename="nearn.core1.thread4.trace", timeout=50000000 * site(TestDurationMultiplier))),
// Module(new TLRAMCoalescerLoggerTest(filename="psort.core1.thread4.trace", timeout=timeout)),
// Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.trace", timeout=timeout)(new WithSimtLanes(32))),
// Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.trace", timeout=timeout)(new WithSimtConfig(32))),
// Module(new TLRAMCoalescerLoggerTest(filename="nvbit.vecadd.n100000.filter_sm0.lane4.trace", timeout=timeout)),
) }
})
@@ -48,12 +48,12 @@ class WithCoalescingUnitSynthesisDummy(nLanes: Int) extends Config((site, _, _)
implicit val p = q
val timeout = 50000 * site(TestDurationMultiplier)
Seq(
Module(new DummyCoalescerTest(timeout=timeout)(new WithSimtLanes(nLanes=4))),
Module(new DummyCoalescerTest(timeout=timeout)(new WithSimtConfig(nMemLanes=4))),
) }
})
class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtLanes(nLanes=4) ++ new BaseSubsystemConfig)
//class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtLanes(nLanes=4) ++ new BaseSubsystemConfig)
class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nMemLanes=4) ++ new BaseSubsystemConfig)
//class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nLanes=4) ++ new BaseSubsystemConfig)
// Dummy configs of various sizes for synthesis
class CoalescingSynthesisDummyLane4Config extends Config(new WithCoalescingUnitSynthesisDummy(4) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)

View File

@@ -10,6 +10,7 @@ import org.chipsalliance.cde.config.{Parameters, Field}
case object VortexL1Key extends Field[Option[VortexL1Config]](None /*default*/ )
case class VortexL1Config(
cacheSize: Int, // total cache size in bytes
numBanks: Int,
wordSize: Int, // This is the read/write granularity of the L1 cache
cacheLineSize: Int,
@@ -34,6 +35,7 @@ case class VortexL1Config(
object defaultVortexL1Config
extends VortexL1Config(
cacheSize = 16384,
numBanks = 4,
wordSize = 16,
cacheLineSize = 16,
@@ -98,7 +100,7 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
TLMasterPortParameters.v1(
clients = Seq(
TLMasterParameters.v1(
name = "VortexBank",
name = "VortexBankPassthrough",
sourceId = IdRange(
0,
1 << (log2Ceil(
@@ -173,7 +175,7 @@ class VortexBank(
TLMasterPortParameters.v1(
clients = Seq(
TLMasterParameters.v1(
name = "VortexBank",
name = s"VortexBank${bankId}",
sourceId = IdRange(0, config.memSideSourceIds),
supportsProbe = TransferSizes(1, config.wordSize),
supportsGet = TransferSizes(1, config.wordSize),
@@ -203,6 +205,8 @@ class VortexBankImp(
val vxCache = Module(
new VX_cache_top(
WORD_SIZE = config.wordSize,
// distribute total size across numBanks
CACHE_SIZE = config.cacheSize / config.numBanks,
CACHE_LINE_SIZE = config.cacheLineSize,
CORE_TAG_WIDTH = config.coreTagPlusSizeWidth,
MSHR_SIZE = config.mshrSize
@@ -389,7 +393,7 @@ class VortexBankImp(
class VX_cache_top(
// these values should match the default settings in Verilog
// TODO: INSTANCE_ID
CACHE_SIZE: Int = 16384 / 4, // <FIXME, divided by 4 for faster simulation
CACHE_SIZE: Int = 16384,
CACHE_LINE_SIZE: Int = 16,
NUM_WAYS: Int = 4,
// for single-bank configuration, set NUM_REQS = 1 and instead set
@@ -408,10 +412,10 @@ class VX_cache_top(
) extends BlackBox(
Map(
// NOTE: NUM_REQS is analogous to SIMD width, whereas NUM_BANKS is the
// actual number of banks. VX_cache.sv instantiates VX_stream_xbar
// that arbitrates the higher NUM_REQS into NUM_BANKS. Since we do
// that logic ourselves using TL units, fix those params to 1 for the
// Verilog side.
// actual number of banks. In the original Vortex code, VX_cache has
// VX_stream_xbar that arbitrates the incoming NUM_REQS into outgoing
// NUM_BANKS. Since we do that logic ourselves using TL Xbars, fix
// those params to 1 for Verilog.
"NUM_REQS" -> 1,
"CACHE_SIZE" -> CACHE_SIZE,
"LINE_SIZE" -> CACHE_LINE_SIZE,
@@ -606,7 +610,7 @@ class NewSourceGenerator[T <: Data](
oldestMetadata := occupancyTable(oldestIndex).meta
oldestAge := occupancyTable(oldestIndex).age
assert(
oldestAge <= 2000.U,
oldestAge <= 10000.U,
"One id in the SourceGen is not released for long time, potential bug !"
)

View File

@@ -13,10 +13,12 @@ import radiance.memory._
class WithRadianceCores(
n: Int,
location: HierarchicalLocation,
crossing: RocketCrossingParams,
useVxCache: Boolean
) extends Config((site, _, up) => {
case TilesLocated(InSubsystem) => {
val prev = up(TilesLocated(InSubsystem), site)
case TilesLocated(`location`) => {
val prev = up(TilesLocated(`location`), site)
val idOffset = prev.size
val vortex = RadianceTileParams(
core = VortexCoreParams(fpu = None),
@@ -43,10 +45,19 @@ class WithRadianceCores(
blockBytes = site(CacheBlockBytes))))
List.tabulate(n)(i => RadianceTileAttachParams(
vortex.copy(tileId = i + idOffset),
RocketCrossingParams()
crossing
)) ++ prev
}
})
}) {
def this(n: Int, location: HierarchicalLocation = InSubsystem, useVxCache: Boolean = false) = this(n, location, RocketCrossingParams(
master = HierarchicalElementMasterPortParams.locationDefault(location),
slave = HierarchicalElementSlavePortParams.locationDefault(location),
mmioBaseAddressPrefixWhere = location match {
case InSubsystem => CBUS
case InCluster(clusterId) => CCBUS(clusterId)
}
), useVxCache)
}
class WithFuzzerCores(
n: Int,
@@ -65,11 +76,33 @@ class WithFuzzerCores(
}
})
/** Config fragment that adds one Radiance cluster to the design.
  *
  * It overrides three keys:
  *  - `ClustersLocated(location)`: appends a [[RadianceClusterAttachParams]]
  *    for `clusterId` to whatever earlier fragments already registered.
  *  - `TLNetworkTopologyLocated(InCluster(clusterId))`: defines the bus
  *    topology inside the cluster as a single `ClusterBusTopologyParams`.
  *  - `PossibleTileLocations`: appends `InCluster(clusterId)` so tiles can be
  *    placed inside this cluster.
  *
  * @param clusterId ID of the cluster to create
  * @param location  hierarchy location the cluster is attached to
  * @param crossing  crossing parameters passed on to the attach params
  */
class WithRadianceCluster(
clusterId: Int,
location: HierarchicalLocation = InSubsystem,
crossing: RocketCrossingParams = RocketCrossingParams() // TODO make this not rocket
) extends Config((site, here, up) => {
// Backticks match against the constructor argument's value instead of
// binding a fresh pattern variable.
case ClustersLocated(`location`) => up(ClustersLocated(location)) :+ RadianceClusterAttachParams(
RadianceClusterParams(clusterId = clusterId),
crossing)
case TLNetworkTopologyLocated(InCluster(`clusterId`)) => List(
ClusterBusTopologyParams(
clusterId = clusterId,
// the in-cluster buses reuse the site-wide system/control bus parameters
csbus = site(SystemBusKey),
// same as the site's control bus parameters, but without an error device
ccbus = site(ControlBusKey).copy(errorDevice = None),
coherence = site(ClusterBankedCoherenceKey(clusterId))
)
)
case PossibleTileLocations => up(PossibleTileLocations) :+ InCluster(clusterId)
})
// `nSrcIds`: number of source IDs for dmem requests on each SIMT lane
class WithSimtLanes(nLanes: Int, nSrcIds: Int = 8) extends Config((site, _, up) => {
class WithSimtConfig(nWarps: Int = 4, nCoreLanes: Int = 4, nMemLanes: Int = 4, nSrcIds: Int = 8)
extends Config((site, _, up) => {
case SIMTCoreKey => {
Some(up(SIMTCoreKey, site).getOrElse(SIMTCoreParams()).copy(
nLanes = nLanes,
nWarps = nWarps,
nCoreLanes = nCoreLanes,
nMemLanes = nMemLanes,
nSrcIds = nSrcIds
))
}
@@ -105,7 +138,7 @@ class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config((site, _, up) => {
case CoalescerKey => {
val (nLanes, numOldSrcIds) = up(SIMTCoreKey, site) match {
case Some(param) => (param.nLanes, param.nSrcIds)
case Some(param) => (param.nMemLanes, param.nSrcIds)
case None => (1,1)
}
@@ -182,4 +215,4 @@ class WithExtGPUMem(address: BigInt = BigInt("0x100000000", 16),
})
})
case class GPUMemParams(address: BigInt = BigInt("0x100000000", 16), size: BigInt = 0x80000000)
case class GPUMemory() extends Field[Option[GPUMemParams]](None)
case class GPUMemory() extends Field[Option[GPUMemParams]](None)

View File

@@ -9,3 +9,10 @@ case class RadianceTileAttachParams(
tileParams: RadianceTileParams,
crossingParams: RocketCrossingParams
) extends CanAttachTile { type TileType = RadianceTile }
/** Attachment descriptor for a [[RadianceCluster]]: pairs the cluster's own
  * parameters with the crossing parameters used to attach it.
  */
case class RadianceClusterAttachParams (
clusterParams: RadianceClusterParams,
crossingParams: HierarchicalElementCrossingParamsLike
) extends CanAttachCluster {
// concrete cluster type produced by this attachment
type ClusterType = RadianceCluster
}

View File

@@ -0,0 +1,98 @@
// See LICENSE.SiFive for license details.
// See LICENSE.Berkeley for license details.
package radiance.tile
import chisel3._
import chisel3.experimental.SourceInfo
import chisel3.util._
import org.chipsalliance.cde.config.{Field, Parameters}
import freechips.rocketchip.subsystem._
import freechips.rocketchip.diplomacy._
/** Parameter type that carries no information. Barrier nodes use it as the
  * upward-flowing (sink-side) parameter; see `BarrierNodeImp` and
  * `BarrierSlaveNode`.
  */
case class EmptyParams()
/** Bit widths of the fields exchanged over a barrier edge.
  *
  * @param barrierIdBits width of the `barrierId` field; `BarrierSynchronizer`
  *                      tracks `1 << barrierIdBits` barrier IDs
  * @param numCoreBits   width of the `coreId` and `sizeMinusOne` fields;
  *                      `BarrierSynchronizer` has `1 << numCoreBits` request ports
  */
case class BarrierParams(
barrierIdBits: Int,
numCoreBits: Int
)
/** Payload of a barrier request sent by a core. Field widths come from `param`. */
class BarrierRequestBits(
param: BarrierParams
) extends Bundle {
// which barrier is being requested
val barrierId = UInt(param.barrierIdBits.W)
// NOTE(review): presumably the number of participating cores minus one;
// BarrierSynchronizer in this file does not read this field -- confirm
val sizeMinusOne = UInt(param.numCoreBits.W)
// ID of the requesting core
val coreId = UInt(param.numCoreBits.W)
}
/** Payload of a barrier response: the ID of the barrier being reported. */
class BarrierResponseBits(
param: BarrierParams
) extends Bundle {
val barrierId = UInt(param.barrierIdBits.W)
}
/** Request/response channel pair carried on a barrier edge, declared from the
  * requester's point of view: `req` is driven by the requester, `resp` is
  * flipped and therefore driven by the other side.
  */
class BarrierBundle(param: BarrierParams) extends Bundle {
val req = Decoupled(new BarrierRequestBits(param))
val resp = Flipped(Decoupled(new BarrierResponseBits(param)))
}
// FIXME Separate BarrierEdgeParams from BarrierParams
/** Diplomacy node implementation for barrier edges.
  *
  * Downward parameters are [[BarrierParams]], upward parameters are
  * [[EmptyParams]]. The negotiated edge is the downward parameter unchanged,
  * and the edge bundle is a [[BarrierBundle]] sized by it.
  */
object BarrierNodeImp extends SimpleNodeImp[BarrierParams, EmptyParams, BarrierParams, BarrierBundle] {
def edge(pd: BarrierParams, pu: EmptyParams, p: Parameters, sourceInfo: SourceInfo) = {
// barrier parameters flow strictly downward from the master node
pd
}
def bundle(e: BarrierParams) = new BarrierBundle(e)
// FIXME render
// placeholder rendering: every edge gets the same colour and the label "X"
def render(e: BarrierParams) = RenderedEdge(colour = "ffffff", label = "X")
}
/** Source-side barrier node with a single port described by `srcParams`. */
case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: ValName)
extends SourceNode(BarrierNodeImp)(Seq(srcParams))
/** Sink-side barrier node with `numEdges` ports, each carrying [[EmptyParams]]. */
case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName)
extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams()))
/** Collects barrier arrivals from all cores and announces completed barriers.
  *
  * The module keeps one "arrived" bit per (barrier ID, core). A request on
  * `io.reqs(c)` sets the bit for core `c` in the row selected by the request's
  * `barrierId`. A barrier is done once every bit of its row is set. Done
  * barriers are arbitrated round-robin onto the single `io.resp` port, and the
  * row is cleared in the cycle the response fires.
  *
  * The `sizeMinusOne` request field is not consulted: a barrier only completes
  * when all `1 << param.numCoreBits` cores have arrived.
  *
  * @param param bit widths of the barrier ID and core ID fields; they also
  *              determine the table dimensions and the number of request ports
  */
class BarrierSynchronizer(param: BarrierParams) extends Module {
  val numBarrierIds = 1 << param.barrierIdBits
  val numCores = 1 << param.numCoreBits
  println(s"numBarrierIds: ${numBarrierIds}, numCores: ${numCores}")

  val io = IO(new Bundle {
    val reqs = Vec(numCores, Flipped(Decoupled(new BarrierRequestBits(param))))
    val resp = Decoupled(new BarrierResponseBits(param))
  })

  // 2-dimensional table of per-id, per-core "done" signals
  val table = RegInit(VecInit(Seq.fill(numBarrierIds)(VecInit(Seq.fill(numCores)(false.B)))))
  val done = Wire(Vec(numBarrierIds, Bool()))
  table.zipWithIndex.foreach { case (row, i) =>
    done(i) := row.reduce(_ && _)
  }
  dontTouch(done)

  io.reqs.zipWithIndex.foreach { case (req, coreId) =>
    // always ready; all this module does is latch to boolean regs
    req.ready := true.B
    when(req.fire) {
      // each request port is dedicated to one core
      assert(coreId.U === req.bits.coreId)
      // The core index is known at elaboration time, so select the column
      // statically; only the barrier row needs a hardware index.
      table(req.bits.barrierId)(coreId) := true.B
    }
  }

  val doneArbiter = Module(new RRArbiter(Bool(), numBarrierIds))
  (doneArbiter.io.in zip done).zipWithIndex.foreach { case ((in, d), i) =>
    in.valid := d
    in.bits := d
    // An arbiter input fires exactly when it is the chosen input and io.resp
    // fires, so this statically indexed clear covers the response case. It is
    // stated after the set above and therefore wins in the same cycle.
    when(in.fire) {
      table(i).foreach(_ := false.B)
    }
  }
  io.resp.valid := doneArbiter.io.out.valid
  io.resp.bits.barrierId := doneArbiter.io.chosen
  doneArbiter.io.out.ready := io.resp.ready
}

View File

@@ -60,7 +60,7 @@ class FuzzerTile private (
// val statusNode = BundleBridgeSource(() => new GroundTestStatus)
val (numLanes, numSrcIds) = p(SIMTCoreKey) match {
case Some(param) => (param.nLanes, param.nSrcIds)
case Some(param) => (param.nMemLanes, param.nSrcIds)
case None => {
require(false, "fuzzer requires SIMTCoreKey to be defined")
(0, 0)

View File

@@ -0,0 +1,150 @@
// See LICENSE.SiFive for license details.
// See LICENSE.Berkeley for license details.
package radiance.tile
import chisel3._
import chisel3.experimental.SourceInfo
import chisel3.util._
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.subsystem._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.diplomacy.{LazyModule, AddressSet, SimpleDevice, ClockCrossingType}
import freechips.rocketchip.regmapper.RegField
import freechips.rocketchip.prci.ClockSinkParameters
/** Parameters of a [[RadianceCluster]] and the factory that instantiates it.
  *
  * @param clusterId       ID of the cluster; also used as the suffix of `uniqueName`
  * @param clockSinkParams clock sink parameters of the cluster
  */
case class RadianceClusterParams(
val clusterId: Int,
val clockSinkParams: ClockSinkParameters = ClockSinkParameters()
) extends InstantiableClusterParams[RadianceCluster] {
val baseName = "radiance_cluster"
val uniqueName = s"${baseName}_$clusterId"
// Builds the cluster; only the crossing *type* of `crossing` is forwarded.
def instantiate(crossing: HierarchicalElementCrossingParamsLike, lookup: LookupByClusterIdImpl)(implicit p: Parameters): RadianceCluster = {
new RadianceCluster(this, crossing.crossingType, lookup)
}
}
/** A cluster of Radiance tiles with cluster-local shared memory and barrier
  * support.
  *
  * On top of the base `Cluster` this adds:
  *  - `clbus`, the cluster-local bus looked up at `CLBUS(clusterId)`;
  *  - `smemBanks`, word-interleaved `TLRAM` banks attached to `clbus`;
  *  - `barrierSlaveNode`, a barrier sink with one edge per leaf tile;
  *  - `regNode`, a TileLink register node on `clbus` for barrier-sync registers.
  *
  * Every leaf tile's `smemNodes` are connected into `clbus` and its
  * `barrierMasterNode` into `barrierSlaveNode`.
  */
class RadianceCluster (
thisClusterParams: RadianceClusterParams,
crossing: ClockCrossingType,
lookup: LookupByClusterIdImpl
)(implicit p: Parameters) extends Cluster(thisClusterParams, crossing, lookup) {
// cluster-local bus, used for shared memory traffic that never leaves the
// confines of a cluster
val clbus = tlBusWrapperLocationMap(CLBUS(clusterId))
clbus.clockGroupNode := allClockGroupsNode
// Instantiate cluster-local shared memory scratchpad
//
// Instantiate the same number of banks as there are lanes.
val numLsuLanes = 4 // FIXME: hardcoded
val wordSize = 4
val smemBanks = Seq.tabulate(numLsuLanes) { bankId =>
// Banked-by-word (4 bytes)
// base for bank 1: ff...000000|01|00
// mask for bank 1; 00...111111|00|11
// With 4 lanes and 4-byte words the mask is 0x1fff ^ 0xc = 0x1ff3: address
// bits [3:2] are fixed to bankId, so consecutive words rotate across banks
// within the 8 KiB window starting at 0xff000000.
val base = 0xff000000L | (bankId * wordSize)
val mask = 0x00001fffL ^ ((numLsuLanes - 1) * wordSize)
LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = wordSize))
}
smemBanks.foreach(_.node := clbus.outwardNode)
val numCores = leafTiles.size
// Diplomacy sink nodes for cluster-wide barrier sync signal
val barrierSlaveNode = BarrierSlaveNode(numCores)
// HACK: This is a workaround of the CanAttachTile bus connecting API that
// works by downcasting tile and directly accessing the node inside that is
// not exposed as a master in HierarchicalElementCrossingParamsLike.
// val tile = leafTiles(0).asInstanceOf[RadianceTile]
// val perSmemPortXbars = Seq.fill(tile.smemNodes.size) { LazyModule(new TLXbar) }
// Tie corresponding smem ports from every tile into a single port using
// Xbars so that the number of ports going into the sharedmem do not scale
// with the number of tiles.
// NOTE(review): the typed pattern below is the downcast; a leaf tile that is
// not a RadianceTile makes this partial function throw a MatchError.
leafTiles.foreach { case (id, tile: RadianceTile) =>
// (perSmemPortXbars zip tile.smemNodes).foreach {
// case (xbar, node) => xbar.node := node
// }
tile.smemNodes.foreach(clbus.inwardNode := _)
barrierSlaveNode := tile.barrierMasterNode
}
// perSmemPortXbars.foreach { clbus.inwardNode := _.node }
// Memory-mapped register for barrier sync
val regDevice = new SimpleDevice("radiance-cluster-barrier-reg",
Seq(s"radiance-cluster-barrier-reg${clusterId}"))
// 256-byte register window at 0xff003f00, accessed in 4-byte beats
val regNode = TLRegisterNode(
address = Seq(AddressSet(0xff003f00L, 0xff)),
device = regDevice,
beatBytes = wordSize,
concurrency = 1)
regNode := clbus.outwardNode
// elaboration-time debug output: list every diplomatic node of this cluster
nodes.foreach({ node =>
println(s"======= RadianceCluster node.name: ${node.name}")
})
override lazy val module = new RadianceClusterModuleImp(this)
}
/** Hardware implementation of [[RadianceCluster]].
  *
  * It instantiates a [[BarrierSynchronizer]], feeds it with the barrier
  * requests of all tiles and broadcasts its response back to every tile. It
  * also exposes a block of memory-mapped barrier registers through
  * `outer.regNode`.
  *
  * Register layout, relative to the register node's base address, for each
  * barrier `b` in `0 until numBarriers`:
  *  - `b * 0x10 + 0x0`: read-only, non-zero when every per-core register of
  *    barrier `b` is non-zero;
  *  - `b * 0x10 + 0x4`: read/write per-core register of core 0;
  *  - `b * 0x10 + 0x8`: read/write per-core register of core 1.
  *
  * When every per-core register of a barrier holds the value 2, all of them
  * are reset to 0.
  */
class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(outer) {
  // Elaboration-time debug output, printed once per leaf tile.
  outer.leafTiles.foreach { _ =>
    // println(s"======= RadianceCluster: tile.smemXbar.node.edge = ${tile.smemXbar.node.out.size}")
    println(s"======= RadianceCluster: clbus inward edges = ${outer.clbus.inwardNode.inward.inputs.length}")
    println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}")
  }

  val numBarriers = 4 // FIXME: hardcoded

  // The synchronizer is sized from a single set of barrier parameters, so all
  // tiles must agree on them. Check that instead of silently assuming it.
  private val barrierEdges = outer.barrierSlaveNode.in
  require(barrierEdges.nonEmpty,
    "RadianceCluster needs at least one tile connected to barrierSlaveNode")
  val barrierParam = barrierEdges.head._2
  require(barrierEdges.forall { case (_, edge) => edge == barrierParam },
    "all tiles in a RadianceCluster must use identical BarrierParams")

  val synchronizer = Module(new BarrierSynchronizer(barrierParam))
  (synchronizer.io.reqs zip barrierEdges).foreach { case (req, (b, _)) =>
    req <> b.req
    b.resp <> synchronizer.io.resp // broadcast
  }

  val allSyncedRegs = Seq.fill(numBarriers)(Wire(UInt(32.W)))
  val perCoreSyncedRegs = Seq.fill(numBarriers)(Seq.fill(outer.numCores)(RegInit(0.U(32.W))))
  (allSyncedRegs zip perCoreSyncedRegs).foreach { case (all, per) =>
    // 1 when every core has written a non-zero value, 0 otherwise
    all := per.map(_ =/= 0.U).reduce(_ && _)
    val allPassed = per.map(_ === 2.U).reduce(_ && _)
    // Stated before regmap() below, so a bus write in the same cycle wins
    // over this reset.
    when(allPassed) {
      per.foreach(_ := 0.U)
    }
    dontTouch(all)
  }

  // FIXME: only the registers of the first two cores are memory-mapped. The
  // 0x10 stride per barrier leaves room for three 4-byte per-core slots.
  // NOTE(review): with more than two cores the unmapped registers still take
  // part in `all`/`allPassed` but cannot be written over the bus -- confirm
  // intent.
  private val barrierStride = 0x10
  private val numMappedCores = 2
  private val barrierRegMap = (allSyncedRegs zip perCoreSyncedRegs).zipWithIndex.flatMap {
    case ((all, per), barrierId) =>
      val base = barrierId * barrierStride
      val allField = base -> Seq(RegField.r(32, all))
      // take() avoids an out-of-bounds access when the cluster has fewer
      // cores than mapped slots
      val coreFields = per.take(numMappedCores).zipWithIndex.map { case (reg, coreId) =>
        (base + 4 * (coreId + 1)) -> Seq(RegField(32, reg))
      }
      allField +: coreFields
  }
  outer.regNode.regmap(barrierRegMap: _*)

  println(s"======== barrierSlaveNode: ${barrierParam.barrierIdBits}")
}

View File

@@ -140,10 +140,21 @@ class RadianceTile private (
require(
p(SIMTCoreKey).isDefined,
"SIMTCoreKey not defined; make sure to use WithSimtLanes when using RadianceTile"
"SIMTCoreKey not defined; make sure to use WithSimtConfig when using RadianceTile"
)
val numLanes = p(SIMTCoreKey) match {
case Some(simtParam) => simtParam.nLanes
// NOTE: when changing these, remember to change +define+NUM_CORES/THREADS/WARPS in
// radiance.mk as well!
val numWarps = p(SIMTCoreKey) match {
case Some(simtParam) => simtParam.nWarps
case None => 4
}
val numCoreLanes = p(SIMTCoreKey) match {
case Some(simtParam) => simtParam.nCoreLanes
case None => 4
}
val numLsuLanes = p(SIMTCoreKey) match {
case Some(simtParam) => simtParam.nMemLanes
case None => 4
}
@@ -170,13 +181,14 @@ class RadianceTile private (
val smemSourceWidth = 4 // FIXME: hardcoded
val numWarps = 4 // TODO: parametrize
// Replicates some of the logic of how Vortex determines the tag width of
// memory requests so that Chisel and Verilog are in agreement on bitwidths.
// See VX_gpu_pkg.sv
val NW_WIDTH = (if (numWarps == 1) 1 else log2Ceil(numWarps))
val UUID_WIDTH = 44
val imemTagWidth = UUID_WIDTH + NW_WIDTH
val numLsuLanes = 4
// see VX_gpu_pkg.sv
val LSUQ_SIZE = 8 * (numLanes / numLsuLanes)
val LSUQ_SIZE = 8 * (numCoreLanes / numLsuLanes)
val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
// dmem and smem share the same tag width, DCACHE_NOSM_TAG_WIDTH
@@ -291,15 +303,13 @@ class RadianceTile private (
// Conditionally instantiate L1 cache
val (icacheNode, dcacheNode): (TLNode, TLNode) = p(VortexL1Key) match {
case Some(vortexL1Config) => {
println(
s"============ Using Vortex L1 cache ================="
)
println("VortexL1Cache instantiated")
// require(
// p(CoalescerKey).isDefined,
// "Vortex L1 configuration currently only works when coalescer is also enabled."
// )
val icache = LazyModule(new VortexL1Cache(vortexL1Config))
val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(numBanks = 1)))
val dcache = LazyModule(new VortexL1Cache(vortexL1Config))
// imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ }
assert(imemNodes.length == 1) // FIXME
@@ -316,22 +326,10 @@ class RadianceTile private (
}
}
// Instantiate sharedmem banks
//
// Instantiate the same number of banks as there are lanes.
// TODO: parametrize
// val smemBanks = Seq.tabulate(numLsuLanes) { bankId =>
// // Banked-by-word (4 bytes)
// // base for bank 1: ff...000000|01|00
// // mask for bank 1; 00...111111|00|11
// val base = 0xff000000L | (bankId * 4 /*wordSize*/ )
// val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ )
// LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = 4 /*wordSize*/ ))
// }
// smem lanes-to-banks crossbar
val smemXbar = LazyModule(new TLXbar)
smemNodes.foreach(smemXbar.node := _)
// smemBanks.foreach(_.node := smemXbar.node)
// Barrier synchronization node
// FIXME: hardcoded params
val barrierParams = BarrierParams(barrierIdBits = 2, numCoreBits = 1)
val barrierMasterNode = BarrierMasterNode(barrierParams)
val base = p(GPUMemory()) match {
case Some(GPUMemParams(baseAddr, _)) => baseAddr
@@ -346,7 +344,6 @@ class RadianceTile private (
tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode
}
// ROCC
// TODO: parametrize
val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig.copy(
@@ -371,14 +368,14 @@ class RadianceTile private (
tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode
// MMIO
gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node
// gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node
// sharedmem access
//
// FIXME: gemmini spad has 16B data width; core smem interface has 4B. Need
// to consolidate by either coalescing, or changing gemmini spad to
// strided-by-word
gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node
TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node
// gemmini.unified_mem_node :=* TLWidthWidget(4) :=* smemXbar.node
// TLRAM(AddressSet(x"ff004000", 0xfff)) := TLFragmenter(4, 4) := smemXbar.node
/* below are copied from rocket */
@@ -462,6 +459,10 @@ class RadianceTileModuleImp(outer: RadianceTile)
extends BaseTileModuleImp(outer) {
Annotated.params(this, outer.radianceParams)
auto.elements.foreach({case (name, _) =>
println(s"======= RadianceTile.elements.name: ${name}")
})
val core = Module(new Vortex(outer)(outer.p))
core.io.clock := clock
@@ -532,6 +533,11 @@ class RadianceTileModuleImp(outer: RadianceTile)
// TODO: make imemNodes not a vector
imemTLAdapter.io.inReq <> core.io.imem.get(0).a
core.io.imem.get(0).d <> imemTLAdapter.io.inResp
performanceCounters(Seq(imemTLAdapter.io.inReq), Seq(imemTLAdapter.io.inResp),
desc = s"core${outer.tileId}-imem")
// now connect TL adapter downstream ports to the tile egress ports
outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq
imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d
}
@@ -629,6 +635,10 @@ class RadianceTileModuleImp(outer: RadianceTile)
}
core.io.dmem_d_valid := dmem_d_valid_vec.asUInt
performanceCounters(dmemTLAdapters.map(_.io.inReq), dmemTLAdapters.map(_.io.inResp),
desc = s"core${outer.tileId}-dmem")
// now connect TL adapter downstream ports to the tile egress ports
(dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) =>
tlOut.a <> tlAdapter.io.outReq
tlAdapter.io.outResp <> tlOut.d
@@ -678,47 +688,114 @@ class RadianceTileModuleImp(outer: RadianceTile)
tlAdapter.io.inResp.ready := core.io.smem_d_ready(i)
}
performanceCounters(smemTLAdapters.map(_.io.inReq), smemTLAdapters.map(_.io.inResp),
desc = s"core${outer.tileId}-smem")
// now connect TL adapter downstream ports to the tile egress ports
(smemTLAdapters zip smemTLBundles) foreach { case (tlAdapter, tlOut) =>
tlOut.a <> tlAdapter.io.outReq
tlAdapter.io.outResp <> tlOut.d
}
}
// Connects the core's global-barrier (gbar) ports to the single outgoing edge
// of the tile's barrier master node. Requests flow core -> node, responses
// flow node -> core.
def connectBarrier = {
  require(outer.barrierMasterNode.out.length == 1)
  val barrier = outer.barrierMasterNode.out(0)._1
  // FIXME: bits not flattened
  barrier.req.valid := core.io.gbar_req_valid
  barrier.req.bits.barrierId := core.io.gbar_req_id
  // Forward the size field too; otherwise this request field is left undriven.
  barrier.req.bits.sizeMinusOne := core.io.gbar_req_size_m1
  barrier.req.bits.coreId := core.io.gbar_req_core_id
  core.io.gbar_req_ready := barrier.req.ready
  core.io.gbar_rsp_valid := barrier.resp.valid
  core.io.gbar_rsp_id := barrier.resp.bits.barrierId
  // core doesn't have a resp.ready port
  barrier.resp.ready := true.B
}
// Instantiates counters for read traffic on a group of request/response
// channels and prints them once, in the cycle core.io.finished rises.
//
// reqBundles:  request channels; only fires with opcode 4 (Get) are counted
// respBundles: response channels; only fires with opcode 1 (AccessAckData)
//              are counted
// desc:        label embedded in the printed line
//
// The printed pair is (sum over cycles of outstanding reads, total reads);
// the division is left to the reader of the log.
def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]],
respBundles: Seq[DecoupledIO[VortexBundleD]],
desc: String) = {
// number of reads issued but not yet answered
val currentPendingReqs = RegInit(SInt(32.W), 0.S)
// running sum of currentPendingReqs, accumulated every cycle
val pendingReqsCumulative = RegInit(SInt(32.W), 0.S)
val totalReqs = RegInit(UInt(32.W), 0.U)
val reqFireCountPerCycle = Wire(UInt(32.W))
val respFireCountPerCycle = Wire(UInt(32.W))
val reqReadFires = reqBundles.map { b => b.fire && b.bits.opcode === 4.U /* Get */ }
val respReadFires = respBundles.map { b => b.fire && b.bits.opcode === 1.U /* AccessAckData */}
reqFireCountPerCycle := PopCount(reqReadFires)
respFireCountPerCycle := PopCount(respReadFires)
totalReqs := totalReqs + reqFireCountPerCycle
// net change in outstanding reads this cycle (may be negative)
val diffPendingReqs = reqFireCountPerCycle.asSInt - respFireCountPerCycle.asSInt
currentPendingReqs := currentPendingReqs + diffPendingReqs
pendingReqsCumulative := pendingReqsCumulative + currentPendingReqs
// rising-edge detect on core.io.finished so the line is printed only once
val prevFinished = RegNext(core.io.finished)
val justFinished = !prevFinished && core.io.finished
when (justFinished) {
printf(s"PERF: ${desc}: average request latency (cum_pending / total): %d / %d\n",
pendingReqsCumulative, totalReqs)
}
// keep the counters in the emitted design even though nothing reads them
dontTouch(totalReqs)
dontTouch(diffPendingReqs)
dontTouch(currentPendingReqs)
dontTouch(pendingReqsCumulative)
}
connectImem
connectDmem
connectSmem
connectBarrier
}
// TODO: generalize for useVxCache
if (!outer.radianceParams.useVxCache) {}
// RoCC
if (outer.roccs.size > 0) {
val (respArb, cmdRouter) = {
val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size))
val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p))
outer.roccs.zipWithIndex.foreach { case (rocc, i) =>
// ptwPorts ++= rocc.module.io.ptw
rocc.module.io.ptw <> DontCare
rocc.module.io.mem <> DontCare
rocc.module.io.cmd <> cmdRouter.io.out(i)
respArb.io.in(i) <> Queue(rocc.module.io.resp)
}
// Create this FPU just for RoCC
// val nFPUPorts = outer.roccs.filter(_.usesFPU).size
val fp_rocc_ios = outer.roccs.map(_.module.io)
fp_rocc_ios.map { io =>
io.fpu_req.ready := false.B
io.fpu_resp.valid := false.B
io.fpu_resp.bits := DontCare
}
(respArb, cmdRouter)
}
// // RoCC
// if (outer.roccs.size > 0) {
// val (respArb, cmdRouter) = {
// val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size))
// val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p))
// outer.roccs.zipWithIndex.foreach { case (rocc, i) =>
// // ptwPorts ++= rocc.module.io.ptw
// rocc.module.io.ptw <> DontCare
// rocc.module.io.mem <> DontCare
// rocc.module.io.cmd <> cmdRouter.io.out(i)
// respArb.io.in(i) <> Queue(rocc.module.io.resp)
// }
// // Create this FPU just for RoCC
// // val nFPUPorts = outer.roccs.filter(_.usesFPU).size
// val fp_rocc_ios = outer.roccs.map(_.module.io)
// fp_rocc_ios.map { io =>
// io.fpu_req.ready := false.B
// io.fpu_resp.valid := false.B
// io.fpu_resp.bits := DontCare
// }
// (respArb, cmdRouter)
// }
cmdRouter.io.in <> DontCare
outer.roccs.foreach(_.module.io.exception := DontCare)
respArb.io.out <> DontCare
}
// cmdRouter.io.in <> DontCare
// outer.roccs.foreach(_.module.io.exception := DontCare)
// respArb.io.out <> DontCare
// }
}
/** Interface-only skeleton of a cluster barrier synchronizer.
  *
  * It declares a decoupled request input (barrier ID, size-minus-one, core ID)
  * and a decoupled response output (barrier ID). The body has no logic:
  * `io.req.ready` and `io.resp` are never driven here.
  *
  * NOTE(review): `BarrierSynchronizer` declares the same request/response
  * fields and does implement the logic; confirm whether this class is still
  * needed.
  *
  * @param barrierIdWidth width of the `barrierId` fields
  * @param numCoreWidth   width of the `sizeMinusOne` and `coreId` fields
  */
class ClusterSynchronizer(
barrierIdWidth: Int,
numCoreWidth: Int,
) extends Module {
val io = IO(new Bundle {
val req = Flipped(Decoupled(new Bundle {
val barrierId = UInt(barrierIdWidth.W)
val sizeMinusOne = UInt(numCoreWidth.W)
val coreId = UInt(numCoreWidth.W)
}))
val resp = Decoupled(new Bundle {
val barrierId = UInt(barrierIdWidth.W)
})
})
}
// Some @copypaste from CoalescerSourceGen.
@@ -768,7 +845,6 @@ class VortexTLAdapter(
io.outReq.bits.corrupt := 0.U
io.inReq.ready := io.outReq.ready
// VortexBundleD <> TLBundleD
// Filtering out write requests is handled inside the wrapper Verilog
io.inResp.valid := io.outResp.valid
io.inResp.bits.opcode := io.outResp.bits.opcode
io.inResp.bits.size := io.outResp.bits.size

View File

@@ -41,18 +41,11 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl
val interrupts = Input(new freechips.rocketchip.rocket.CoreInterrupts(false/*hasBeu*/))
// conditionally instantiate ports depending on whether we want to use VX_cache or not
// TODO: flatten this like dmem and smem
val imem = if (!tile.radianceParams.useVxCache) Some(Vec(1, new Bundle {
val a = Decoupled(new VortexBundleA(tagWidth = tile.imemTagWidth, dataWidth = 32))
val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.imemTagWidth, dataWidth = 32)))
})) else None
val dmem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle {
// val a = Decoupled(new VortexBundleA(tagWidth = tile.dmemTagWidth, dataWidth = 32))
// val d = Flipped(Decoupled(new VortexBundleD(tagWidth = dmemTagWidth, dataWidth = 32)))
})) else None
val smem = if (!tile.radianceParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle {
// val a = Decoupled(new VortexBundleA(tagWidth = tile.smemTagWidth, dataWidth = 32))
// val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.smemTagWidth, dataWidth = 32)))
})) else None
val mem = if (tile.radianceParams.useVxCache) Some(new Bundle {
val a = Decoupled(new VortexBundleA(tagWidth = 15, dataWidth = 128))
val d = Flipped(Decoupled(new VortexBundleD(tagWidth = 15, dataWidth = 128)))
@@ -96,6 +89,17 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl
val smem_d_bits_data = Input(UInt((tile.numLsuLanes * 32).W))
val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W))
// FIXME: hardcoded
val NB_WIDTH = 2
val NC_WIDTH = 1
val gbar_req_valid = Output(Bool())
val gbar_req_id = Output(UInt(NB_WIDTH.W))
val gbar_req_size_m1 = Output(UInt(NC_WIDTH.W))
val gbar_req_core_id = Output(UInt(NC_WIDTH.W))
val gbar_req_ready = Input(Bool())
val gbar_rsp_valid = Input(Bool())
val gbar_rsp_id = Input(UInt(NB_WIDTH.W))
// val fpu = Flipped(new FPUCoreIO())
//val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs))
//val trace = Output(new TraceBundle)
@@ -112,6 +116,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
// see VX_csr_data that implements the read logic for CSR_MHARTID/GWID.
Map(
"CORE_ID" -> tile.tileParams.tileId,
"CORES_PER_CLUSTER" -> 2, // FIXME: hardcoded
// TODO: can we get this as a parameter?
"BOOTROM_HANG100" -> 0x10100,
"NUM_THREADS" -> tile.numLsuLanes
@@ -194,10 +199,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
// addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_tags.sv")
// addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_wrap.sv")
// gbar is only used in the socket/cluster hierarchy
// addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv")
// addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv")
// addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv")
// mem_arb is used in VX_socket or VX_cache_cluster
// addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_arb.sv")
addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_bus_if.sv")
@@ -217,6 +218,14 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
// addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_define.vh")
// addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_wrap.sv")
// used when PERF_ENABLE is defined
addResource("/vsrc/vortex/hw/rtl/mem/VX_mem_perf_if.sv")
addResource("/vsrc/vortex/hw/rtl/interfaces/VX_pipeline_perf_if.sv")
// used when GBAR_ENABLE is defined
addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_bus_if.sv")
// addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv")
// addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv")
// addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv")
// addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv")
@@ -245,6 +254,9 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
// unused addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_mux.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_pending_size.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_register.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_buffer.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_toggle_buffer.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_buffer.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_popcount.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_arbiter.sv")
addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_encoder.sv")