diff --git a/radiance.mk b/radiance.mk index 0617373..53b5864 100644 --- a/radiance.mk +++ b/radiance.mk @@ -11,6 +11,9 @@ RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release # EXTRA_SIM_REQS += radpie EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie +ifeq ($(shell echo $(CONFIG) | grep -E "SynConfig$$"),$(CONFIG)) + EXTRA_SIM_PREPROC_DEFINES += +define+SYNTHESIS +define+NDEBUG +define+DPI_DISABLE +endif EXTRA_SIM_PREPROC_DEFINES += \ +define+SIMULATION \ +define+GPR_RESET \ @@ -21,12 +24,11 @@ EXTRA_SIM_PREPROC_DEFINES += \ +define+GBAR_ENABLE \ +define+GBAR_CLUSTER_ENABLE \ +define+NUM_FPU_BLOCKS=2 \ - +define+NUM_BARRIERS=4 \ - +define+NUM_LSU_LANES=4 \ - +define+NUM_CORES=1 +define+NUM_THREADS=32 +define+NUM_WARPS=4 - # +define+EXT_T_DISABLE \ - # +define+FPU_FPNEW \ - # +define+SMEM_LOG_SIZE=15 \ + +define+EXT_T_DISABLE \ + +define+FPU_FPNEW \ + +define+SMEM_LOG_SIZE=17 + +VCS_NONCC_OPTS += +vcs+initreg+random # cargo handles building of Rust files all on its own, so make this a PHONY # target to run cargo unconditionally diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index cfeff3f..f23db9c 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -1146,9 +1146,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) case (enqPort, uncoalResp) => { enqPort <> uncoalResp - when(!enqPort.ready) { - printf(s"respQueue: enq port for uncoalesced response is blocked on lane ${lane}\n") - } + // when(!enqPort.ready) { + // printf(s"respQueue: enq port for uncoalesced response is blocked on lane ${lane}\n") + // } } } } diff --git a/src/main/scala/radiance/memory/FrameBuffer.scala b/src/main/scala/radiance/memory/FrameBuffer.scala new file mode 100644 index 0000000..9ed090f --- /dev/null +++ b/src/main/scala/radiance/memory/FrameBuffer.scala @@ -0,0 +1,78 @@ +package radiance.memory + +import 
chisel3._ +import chisel3.util._ +import freechips.rocketchip.diplomacy.{AddressSet, SimpleDevice, TransferSizes} +import org.chipsalliance.diplomacy.lazymodule._ +import freechips.rocketchip.regmapper.RegField +import freechips.rocketchip.tilelink._ +import midas.targetutils.SynthesizePrintf +import org.chipsalliance.cde.config.Parameters + +class FrameBuffer(baseAddress: BigInt, width: Int, size: Int, validAddress: BigInt, fbName: String = "fb") + (implicit p: Parameters) extends LazyModule { + + val node = TLXbar() + + val bufferNode = TLManagerNode(Seq(TLSlavePortParameters.v1( + Seq(TLSlaveParameters.v2( + address = Seq(AddressSet(baseAddress, (1 << log2Ceil(size)) - 1)), + supports = TLMasterToSlaveTransferSizes( + putFull = TransferSizes(1, width), + putPartial = TransferSizes(1, width) + ), + fifoId = Some(0))), // requests are handled in order + beatBytes = width + ))) + + val regDevice = new SimpleDevice("framebuffer-valid-reg", Seq(s"framebuffer-valid-reg")) + val regNode = TLRegisterNode( + address = Seq(AddressSet(validAddress, 0x3)), device = regDevice, concurrency = 1) + + bufferNode := TLWidthWidget(4) := TLBuffer() := node + regNode := TLFragmenter(4, 4) := TLBuffer() := node + + val depth = size >> log2Ceil(width) + lazy val module = new LazyModuleImp(this) { + val bufT = Vec(width, UInt(8.W)) + val buf = SyncReadMem(depth, bufT) + val state = RegInit(false.B) // 0: accepting writes, 1: printing + + val Seq((bufBundle, bufEdge)) = bufferNode.in + + bufBundle.a.ready := !state && bufBundle.d.ready + bufBundle.d.bits := DontCare + bufBundle.d.valid := !state && bufBundle.a.valid + when (bufBundle.a.fire) { + bufBundle.d.bits := bufEdge.AccessAck(bufBundle.a.bits) + buf.write(((bufBundle.a.bits.address & (size - 1).U) >> log2Ceil(width)).asUInt, + bufBundle.a.bits.data.asTypeOf(bufT), + bufBundle.a.bits.mask.asBools) + } + + val writeValid = RegInit(0.U(32.W)) + val writeTotal = RegInit(0.U(32.W)) + regNode.regmap(0x00 -> Seq(RegField.w(32, 
writeValid))) + + // val (writeCounter, writeComplete) = Counter(state.asBool, size / width) + // when (writeValid(0)) { state := 1.U } + // when (writeComplete) { state := 0.U } + val writeCounter = Counter(depth) + when (writeValid > 0.U) { + writeValid := 0.U + writeTotal := writeValid + state := true.B + writeCounter.reset() + }.elsewhen (writeCounter.value === writeTotal - 1.U) { + state := false.B + } + + when (state) { writeCounter.inc() } + + val readData = buf.read(writeCounter.value, state) + val prevIdx = RegNext(writeCounter.value) + when (RegNext(state)) { + SynthesizePrintf(printf(s"$fbName %x %x\n", prevIdx, readData.asUInt)) + } + } +} diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 3302738..e7bc206 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -24,6 +24,17 @@ case class RadianceSharedMemKey(address: BigInt, serializeUnaligned: Boolean = true) case object RadianceSharedMemKey extends Field[Option[RadianceSharedMemKey]](None) +case class RadianceGemminiKey(tileSize: Int, + slaveAddress: BigInt) +case object RadianceGemminiKey extends Field[Option[RadianceGemminiKey]](None) + +case class RadianceFrameBufferKey(baseAddress: BigInt, + width: Int, + size: Int, + validAddress: BigInt, + fbName: String = "fb") +case object RadianceFrameBufferKey extends Field[Seq[RadianceFrameBufferKey]](Seq()) + class WithRadianceCores( n: Int, location: HierarchicalLocation, @@ -74,7 +85,7 @@ class WithRadianceCores( class WithRadianceGemmini(location: HierarchicalLocation, crossing: RocketCrossingParams, - dim: Int, accSizeInKB: Int) extends Config((site, _, up) => { + dim: Int, accSizeInKB: Int, tileSize: Int) extends Config((site, _, up) => { case TilesLocated(`location`) => { val prev = up(TilesLocated(`location`), site) val idOffset = prev.size @@ -106,8 +117,15 @@ class WithRadianceGemmini(location: HierarchicalLocation, 
crossing )) ++ prev } + case RadianceGemminiKey => { + val smKey = site(RadianceSharedMemKey).get + Some(RadianceGemminiKey( + tileSize = tileSize, + slaveAddress = smKey.address + smKey.size + 0x3000 + )) + } }) { - def this(location: HierarchicalLocation = InSubsystem, dim: Int, accSizeInKB: Int) = + def this(location: HierarchicalLocation = InSubsystem, dim: Int, accSizeInKB: Int, tileSize: Int) = this(location, RocketCrossingParams( master = HierarchicalElementMasterPortParams.locationDefault(location), slave = HierarchicalElementSlavePortParams.locationDefault(location), @@ -115,7 +133,7 @@ class WithRadianceGemmini(location: HierarchicalLocation, case InSubsystem => CBUS case InCluster(clusterId) => CCBUS(clusterId) } - ), dim, accSizeInKB) + ), dim, accSizeInKB, tileSize) } class WithRadianceSharedMem(address: BigInt, @@ -136,6 +154,18 @@ class WithRadianceSharedMem(address: BigInt, } }) +class WithRadianceFrameBuffer(baseAddress: BigInt, + width: Int, + size: Int, + validAddress: BigInt, + fbName: String = "fb") extends Config((_, _, up) => { + case RadianceFrameBufferKey => { + up(RadianceFrameBufferKey) ++ Seq( + RadianceFrameBufferKey(baseAddress, width, size, validAddress, fbName) + ) + } +}) + class WithFuzzerCores( n: Int, useVxCache: Boolean diff --git a/src/main/scala/radiance/tile/GemminiTile.scala b/src/main/scala/radiance/tile/GemminiTile.scala index 7fa6b54..6f63693 100644 --- a/src/main/scala/radiance/tile/GemminiTile.scala +++ b/src/main/scala/radiance/tile/GemminiTile.scala @@ -6,15 +6,18 @@ package radiance.tile import chisel3._ import chisel3.util._ import chisel3.experimental.BundleLiterals._ -import freechips.rocketchip.diplomacy.{BigIntHexContext, ClockCrossingType, DisableMonitors, LazyModule, SimpleDevice} +import org.chipsalliance.diplomacy.DisableMonitors +import org.chipsalliance.diplomacy.lazymodule._ +import freechips.rocketchip.diplomacy.{AddressSet, BigIntHexContext, ClockCrossingType, SimpleDevice} import 
freechips.rocketchip.prci.ClockSinkParameters +import freechips.rocketchip.regmapper.RegField import freechips.rocketchip.rocket._ import freechips.rocketchip.subsystem.{CanAttachTile, HierarchicalElementCrossingParamsLike, RocketCrossingParams} import freechips.rocketchip.tile._ import freechips.rocketchip.tilelink._ import gemmini._ import org.chipsalliance.cde.config.Parameters -import radiance.subsystem.{GPUMemParams, GPUMemory} +import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceGemminiKey} case class GemminiCoreParams( useVM: Boolean = false, @@ -120,10 +123,20 @@ class GemminiTile private ( // tlOtherMastersNode :=* AddressOrNode(base) :=* gemmini.tlNode tlMasterXbar.node :=* gemmini.atlNode tlOtherMastersNode :=* gemmini.tlNode - gemmini.stlNode := tlSlaveXbar.node + // gemmini.stlNode := tlSlaveXbar.node require(!gemmini.config.sp_singleported, "external scratchpad must be dual ported") + val configKey = p(RadianceGemminiKey).get + + val regDevice = new SimpleDevice("gemmini-cmd-reg", Seq(s"gemmini-cmd-reg")) + val regNode = TLRegisterNode( + address = Seq(AddressSet(configKey.slaveAddress, 0xfff)), + device = regDevice, + beatBytes = 8, + concurrency = 1) + regNode := TLFragmenter(8, 64) := tlSlaveXbar.node + override lazy val module = new GemminiTileModuleImp(this) } @@ -173,17 +186,22 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer) } ciscInst := 0.U.asTypeOf(ciscInstT) - // val boundsInst = ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U) - // val spadQuartile = 0x80 - val boundsInst = ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"8_00080008".U) - val spadQuartile = 0x200 + + val tileSize = outer.configKey.tileSize + val (boundsInst, spadQuartile) = if (tileSize == 4) { + (ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U), 0x80) + } else if (tileSize == 8) { + (ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> 
x"8_00080008".U), 0x200) + } else { // BigInt shifts: an Int `<< 32` wraps to `<< 0` on the JVM and would drop the third bound field + (ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> (BigInt(tileSize) | (BigInt(tileSize) << 16) | (BigInt(tileSize) << 32)).U), + tileSize * tileSize * outer.gemminiParams.gemminiConfig.DIM) + } when (ciscValid) { assert(!accSlave.cmd.valid, "cisc state machine already busy") switch (ciscId) { is (0.U) { - ciscInst := microcodeEntry(Seq( - ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"8_00080008".U), // set I, J, K - ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0.U, _.rs2 -> 0x600.U), // set A, B address + ciscInst := microcodeEntry(Seq(boundsInst, + ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0.U, _.rs2 -> (spadQuartile * 3).U), // set A, B address ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0.U, _.rs2 -> x"0_000002b8".U) // set skip, acc )) } @@ -234,11 +252,39 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer) } val gemminiIO = outer.gemmini.module.io.cmd + + val regValid = Wire(Bool()) + val regCommand = Wire(gemminiIO.bits.inst.cloneType) + val gemminiRs1RegLSB = RegInit(0.U(32.W)) + val gemminiRs1RegMSB = RegInit(0.U(32.W)) + val gemminiRs2RegLSB = RegInit(0.U(32.W)) + val gemminiRs2RegMSB = RegInit(0.U(32.W)) + + def gemminiCommandReg(valid: Bool, bits: UInt): Bool = { + regValid := valid + regCommand := bits.asTypeOf(regCommand) + gemminiIO.ready && !ciscValid + } + + outer.regNode.regmap( + 0x00 -> Seq(RegField.w(32, gemminiCommandReg(_, _))), + 0x10 -> Seq( + RegField.w(32, gemminiRs1RegLSB), + RegField.w(32, gemminiRs1RegMSB)), + 0x18 -> Seq( + RegField.w(32, gemminiRs2RegLSB), + RegField.w(32, gemminiRs2RegMSB)), + 0x20 -> Seq(RegField.r(32, outer.gemmini.module.io.busy)) + ) + + assert(!regValid || gemminiIO.ready) + assert(!ciscValid || gemminiIO.ready) + gemminiIO.bits.status := 0.U.asTypeOf(gemminiIO.bits.status) - gemminiIO.bits.inst := ciscInst.inst.asTypeOf(gemminiIO.bits.inst) - gemminiIO.bits.rs1 := ciscInst.rs1 - gemminiIO.bits.rs2 := ciscInst.rs2 - 
gemminiIO.valid := ciscValid + gemminiIO.bits.inst := Mux(ciscValid, ciscInst.inst.asTypeOf(gemminiIO.bits.inst), regCommand) + gemminiIO.bits.rs1 := Mux(ciscValid, ciscInst.rs1, Cat(gemminiRs1RegMSB, gemminiRs1RegLSB)) + gemminiIO.bits.rs2 := Mux(ciscValid, ciscInst.rs2, Cat(gemminiRs2RegMSB, gemminiRs2RegLSB)) + gemminiIO.valid := ciscValid || regValid assert(gemminiIO.ready || !gemminiIO.valid) accSlave.status := RegNext(outer.gemmini.module.io.busy).asUInt diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 1eb67dc..5c6ed03 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -5,17 +5,17 @@ package radiance.tile import chisel3._ import chisel3.util._ -import org.chipsalliance.diplomacy._ -import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.diplomacy.{AddressSet, BufferParams, ClockCrossingType, TransferSizes} +import org.chipsalliance.diplomacy.lazymodule._ import freechips.rocketchip.prci.ClockSinkParameters import freechips.rocketchip.subsystem._ -import freechips.rocketchip.tile.TraceBundle import freechips.rocketchip.tilelink._ import gemmini._ import midas.targetutils.SynthesizePrintf import org.chipsalliance.cde.config.Parameters +import org.chipsalliance.diplomacy.{DisableMonitors, ValName} import radiance.memory._ -import radiance.subsystem.RadianceSharedMemKey +import radiance.subsystem.{RadianceFrameBufferKey, RadianceSharedMemKey} case class RadianceClusterParams( val clusterId: Int, @@ -85,6 +85,7 @@ class RadianceCluster ( assert(gemminiConfig.sp_width / 8 == smem_width) assert(gemminiConfig.sp_bank_entries == smem_depth) + VecInit(Seq(0.U, 1.U)).reduceTree(_ +& _) val stride_by_word = true val filter_aligned = true val disable_monitors = true // otherwise it generate 1k+ different tl monitors @@ -337,9 +338,13 @@ class RadianceCluster ( val traceTLNode = TLAdapterNode(clientFn = c => c, 
managerFn = m => m) // printf and perf counter buffer - TLRAM(AddressSet(x"ff000000" + smem_size, numCores * 0x200 - 1)) := traceTLNode := + TLRAM(AddressSet(smem_key.address + smem_size, numCores * 0x200 - 1)) := traceTLNode := TLBuffer() := TLFragmenter(4, 4) := clbus.outwardNode + p(RadianceFrameBufferKey).foreach { key => + val fb = LazyModule(new FrameBuffer(key.baseAddress, key.width, key.size, key.validAddress, key.fbName)) + fb.node := TLBuffer() := TLFragmenter(4, 4) := clbus.outwardNode + } // Diplomacy sink nodes for cluster-wide barrier sync signal val barrierSlaveNode = BarrierSlaveNode(numCores) @@ -371,7 +376,7 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( // @cleanup: This assumes barrier params on all edges are the same, i.e. all // cores are configured to have the same barrier id range. While true, might // be better to actually assert this - val barrierParam = outer.barrierSlaveNode.in(0)._2 + val barrierParam = outer.barrierSlaveNode.in.head._2 println(s"======= barrierParam: ${barrierParam}") val synchronizer = Module(new BarrierSynchronizer(barrierParam)) (synchronizer.io.reqs zip outer.barrierSlaveNode.in).foreach { case (req, (b, _)) => @@ -401,6 +406,7 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( } } + // TODO: remove Pipeline dependency of gemmini def makeSmemBanks(): Unit = { def make_buffer[T <: Data](mem: TwoPortSyncMem[T], r_node: TLBundle, r_edge: TLEdgeIn, diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index e277d60..6b1e4df 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -16,6 +16,7 @@ import freechips.rocketchip.subsystem.HierarchicalElementCrossingParamsLike import freechips.rocketchip.tile._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.util._ +import midas.targetutils.SynthesizePrintf import 
org.chipsalliance.cde.config._ import radiance.memory._ import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceSimArgs} @@ -477,6 +478,10 @@ class RadianceTileModuleImp(outer: RadianceTile) outer.decodeCoreInterrupts(core.io.interrupts) // Decode the interrupt vector + when (core.io.interrupts.msip && !RegNext(core.io.interrupts.msip)) { + SynthesizePrintf(printf("interrupt\n")) + } + core.io.interrupts.nmi.foreach { nmi => nmi := outer.nmiSinkNode.get.bundle } // Pass through various external constants and reports that were bundle-bridged into the tile diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index 8f72317..83e577a 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -39,7 +39,7 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl // val hartid = Input(UInt(tileIdLen.W)) val reset_vector = Input(UInt(resetVectorLen.W)) val interrupts = Input(new freechips.rocketchip.rocket.CoreInterrupts(false/*hasBeu*/)) - + // conditionally instantiate ports depending on whether we want to use VX_cache or not // TODO: flatten this like dmem and smem val imem = if (!tile.radianceParams.useVxCache) Some(Vec(1, new Bundle {