From 688755ef820c017207825c3ecfb49efa1ab9b5f7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 21 Mar 2024 13:19:53 -0700 Subject: [PATCH 1/8] Add debug delay to BarrierSynchronizer --- src/main/scala/radiance/tile/Barrier.scala | 38 ++++++++++++++-------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/main/scala/radiance/tile/Barrier.scala b/src/main/scala/radiance/tile/Barrier.scala index e3e965b..25adafe 100644 --- a/src/main/scala/radiance/tile/Barrier.scala +++ b/src/main/scala/radiance/tile/Barrier.scala @@ -53,10 +53,17 @@ case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: Val case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName) extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams())) -class BarrierSynchronizer(param: BarrierParams) extends Module { - val numBarrierIds = 1 << param.barrierIdBits +// `delay`: number of cycles used to delay the response after all cores are +// synchronized. This is used for debugging purposes to give some time for the +// cores to "settle" after the barrier synchronization, e.g. resolve +// outstanding smem requests. +class BarrierSynchronizer( + param: BarrierParams, + delay: Option[Int] = None +) extends Module { + val numBarriers = 1 << param.barrierIdBits val numCores = 1 << param.numCoreBits - println(s"numBarrierIds: ${numBarrierIds}, numCores: ${numCores}") + println(s"numBarriers: ${numBarriers}, numCores: ${numCores}") val io = IO(new Bundle { val reqs = Vec(numCores, Flipped(Decoupled(new BarrierRequestBits(param)))) @@ -64,12 +71,15 @@ class BarrierSynchronizer(param: BarrierParams) extends Module { }) // 2-dimensional table of per-id, per-core "done" signals - val table = RegInit(VecInit(Seq.fill(numBarrierIds)(VecInit(Seq.fill(numCores)(false.B))))) - val done = Wire(Vec(numBarrierIds, Bool())) - table.zipWithIndex.foreach { case (row, i) => - done(i) := row.reduce(_ && _) + val table = RegInit(VecInit(Seq.fill(numBarriers)(VecInit(Seq.fill(numCores)(false.B))))) + val done = Seq.fill(numBarriers)(Wire(Bool())) + val delayer = delay.map(n => Seq.fill(numBarriers)(Counter(n))) + + (table zip done).zipWithIndex.foreach { case ((row, d), i) => + d := row.reduce(_ && _) + delayer.foreach{ dl => when (d) { dl(i).inc() }} + dontTouch(d) } - dontTouch(done) io.reqs.zipWithIndex.foreach { case (req, coreId) => // always ready; all this module does is latch to boolean regs @@ -81,18 +91,20 @@ class BarrierSynchronizer(param: BarrierParams) extends Module { } } - val doneArbiter = Module(new RRArbiter(Bool(), numBarrierIds)) + val doneArbiter = Module(new RRArbiter(Bool(), numBarriers)) (doneArbiter.io.in zip done).zipWithIndex.foreach { case ((in, d), i) => - in.valid := d + val alarm = delayer match { + case Some(dl) => dl(i).value === (dl(i).n - 1).U + case None => true.B + } + in.valid := (d && alarm) in.bits := d when(in.fire) { table(i).foreach(_ := false.B) + delayer.foreach(_(i).reset()) } } io.resp.valid := doneArbiter.io.out.valid io.resp.bits.barrierId := doneArbiter.io.chosen - when(io.resp.fire) { - table(io.resp.bits.barrierId).foreach(_ := false.B) - } doneArbiter.io.out.ready := io.resp.ready } From 0e9cb884a81e4775efb5a23e0472444fdb7586b6 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 21 Mar 2024 13:20:45 -0700 Subject: [PATCH 2/8] Remove software-based barrier MMIO --- .../scala/radiance/tile/RadianceCluster.scala | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 7d6752c..208039e 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -110,41 +110,5 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( b.resp <> synchronizer.io.resp // broadcast } - // outer.barrierSlaveNode.in.foreach { case (b, e) => - // val fakeBarrierRespId = RegNext(b.req.bits.barrierId) - // val fakeBarrierRespValid = RegNext(b.req.fire) - // b.req.ready := true.B // barrier module is always ready - // b.resp.valid := fakeBarrierRespValid - // b.resp.bits.barrierId := fakeBarrierRespId - // } - - val allSyncedRegs = Seq.fill(numBarriers)(Wire(UInt(32.W))) - val perCoreSyncedRegs = Seq.fill(numBarriers)(Seq.fill(outer.numCores)(RegInit(0.U(32.W)))) - (allSyncedRegs zip perCoreSyncedRegs).foreach{ case (all, per) => - all := per.reduce((x0, x1) => (x0 =/= 0.U) && (x1 =/= 0.U)) - - val allPassed = per.map(_ === 2.U).reduce(_ && _) - when(allPassed) { - per.foreach(_ := 0.U) - } - - dontTouch(all) - } - // FIXME: 4 cores per cluster hardcoded - outer.regNode.regmap( - 0x00 -> Seq(RegField.r(32, allSyncedRegs(0))), - 0x04 -> Seq(RegField(32, perCoreSyncedRegs(0)(0))), - 0x08 -> Seq(RegField(32, perCoreSyncedRegs(0)(1))), - 0x10 -> Seq(RegField.r(32, allSyncedRegs(1))), - 0x14 -> Seq(RegField(32, perCoreSyncedRegs(1)(0))), - 0x18 -> Seq(RegField(32, perCoreSyncedRegs(1)(1))), - 0x20 -> Seq(RegField.r(32, allSyncedRegs(2))), - 0x24 -> Seq(RegField(32, perCoreSyncedRegs(2)(0))), - 0x28 -> Seq(RegField(32, perCoreSyncedRegs(2)(1))), - 0x30 -> Seq(RegField.r(32, allSyncedRegs(3))), - 0x34 -> Seq(RegField(32, perCoreSyncedRegs(3)(0))), - 0x38 -> Seq(RegField(32, perCoreSyncedRegs(3)(1))), - ) - println(s"======== barrierSlaveNode: ${outer.barrierSlaveNode.in(0)._2.barrierIdBits}") } From 7258d69ce8fbe4e27df573fbe19f1461217248fb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 21 Mar 2024 13:21:13 -0700 Subject: [PATCH 3/8] Parameterize barrier params better Some numbers still hardcoded at client side, need to do proper diplomacy negotiation --- src/main/scala/radiance/tile/RadianceCluster.scala | 2 -- src/main/scala/radiance/tile/RadianceTile.scala | 6 +++++- src/main/scala/radiance/tile/VortexCore.scala | 11 ++++------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 208039e..f45607f 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -98,8 +98,6 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( println(s"======= RadianceCluster: clbus name = ${outer.clbus.busName}") } - val numBarriers = 4 // FIXME: hardcoded - // @cleanup: This assumes barrier params on all edges are the same, i.e. all // cores are configured to have the same barrier id range. While true, might // be better to actually assert this diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 59c6fed..7d323ee 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -328,7 +328,11 @@ class RadianceTile private ( // Barrier synchronization node // FIXME: hardcoded params - val barrierParams = BarrierParams(barrierIdBits = 2, numCoreBits = 1) + val numBarriers = 8 + val numCores = 2 + def barrierIdBits = log2Ceil(numBarriers) + def coreIdBits = log2Ceil(numCores) + val barrierParams = BarrierParams(barrierIdBits = barrierIdBits, numCoreBits = coreIdBits) val barrierMasterNode = BarrierMasterNode(barrierParams) val base = p(GPUMemory()) match { diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index 7a06f7d..b9634aa 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -90,15 +90,13 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W)) // FIXME: hardcoded - val NB_WIDTH = 2 - val NC_WIDTH = 1 val gbar_req_valid = Output(Bool()) - val gbar_req_id = Output(UInt(NB_WIDTH.W)) - val gbar_req_size_m1 = Output(UInt(NC_WIDTH.W)) - val gbar_req_core_id = Output(UInt(NC_WIDTH.W)) + val gbar_req_id = Output(UInt(tile.barrierIdBits.W)) + val gbar_req_size_m1 = Output(UInt(tile.coreIdBits.W)) + val gbar_req_core_id = Output(UInt(tile.coreIdBits.W)) val gbar_req_ready = Input(Bool()) val gbar_rsp_valid = Input(Bool()) - val gbar_rsp_id = Input(UInt(NB_WIDTH.W)) + val gbar_rsp_id = Input(UInt(tile.barrierIdBits.W)) // val fpu = Flipped(new FPUCoreIO()) //val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs)) @@ -116,7 +114,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // see VX_csr_data that implements the read logic for CSR_MHARTID/GWID. Map( "CORE_ID" -> tile.tileParams.tileId, - "CORES_PER_CLUSTER" -> 2, // FIXME: hardcoded // TODO: can we get this as a parameter? "BOOTROM_HANG100" -> 0x10100, "NUM_THREADS" -> tile.numLsuLanes From 16fdc65e5307addbdc0979ee9b725a2c6de008d2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 21 Mar 2024 13:23:10 -0700 Subject: [PATCH 4/8] Add cluster-specific Verilog preproc flags to radiance.mk --- radiance.mk | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/radiance.mk b/radiance.mk index 0f0e29a..48dd596 100644 --- a/radiance.mk +++ b/radiance.mk @@ -19,7 +19,13 @@ EXTRA_SIM_PREPROC_DEFINES += \ +define+DBG_TRACE_CORE_PIPELINE_VCS \ +define+PERF_ENABLE \ +define+ICACHE_DISABLE +define+DCACHE_DISABLE \ - +define+NUM_THREADS=8 +define+NUM_WARPS=8 + +define+GBAR_ENABLE \ + +define+GBAR_CLUSTER_ENABLE \ + +define+NUM_BARRIERS=8 \ + +define+NUM_CORES=2 +define+NUM_THREADS=8 +define+NUM_WARPS=8 + # Can't increase this to above 14, since the binary accesses 0xff0040.. + # which is unmapped to any memory + # +define+SMEM_LOG_SIZE=14 \ # cargo handles building of Rust files all on its own, so make this a PHONY # target to run cargo unconditionally From 20a33e5a4040a87f02e76ecd2e33881b9ad4fd77 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 21 Mar 2024 15:45:56 -0700 Subject: [PATCH 5/8] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index df4b215..3718a57 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit df4b21507eae6fe6ee2003e7a6b79e0a7826eac4 +Subproject commit 3718a579370807dfb57980ec2c45491d0138133d From c28f0510d8a7beeed323cd60d449eb6b1e23c0a2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 22 Mar 2024 09:53:18 -0700 Subject: [PATCH 6/8] Change vortex submodule remote --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 9b50b6b..08817f3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "src/main/resources/vsrc/vortex"] path = src/main/resources/vsrc/vortex - url = https://github.com/hansungk/vortex + url = https://github.com/hansungk/vortex-private [submodule "radpie"] path = radpie url = git@github.com:hansungk/radpie.git From 54b64aba07f106816241c8409183d2f7f9a1bcb7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 22 Mar 2024 19:47:26 -0700 Subject: [PATCH 7/8] Use ssh for vortex remote path --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 08817f3..d49652c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "src/main/resources/vsrc/vortex"] path = src/main/resources/vsrc/vortex - url = https://github.com/hansungk/vortex-private + url = git@github.com:hansungk/vortex-private.git [submodule "radpie"] path = radpie url = git@github.com:hansungk/radpie.git From 215ac369cbe39c0f65369eacda1517af589630fb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 23 Mar 2024 13:48:44 -0700 Subject: [PATCH 8/8] Do proper barrier param negotiation for numCores --- src/main/scala/radiance/tile/Barrier.scala | 48 ++++++++++++------- .../scala/radiance/tile/RadianceCluster.scala | 1 + .../scala/radiance/tile/RadianceTile.scala | 21 +------- src/main/scala/radiance/tile/VortexCore.scala | 10 ++-- 4 files changed, 38 insertions(+), 42 deletions(-) diff --git a/src/main/scala/radiance/tile/Barrier.scala b/src/main/scala/radiance/tile/Barrier.scala index 25adafe..269a62c 100644 --- a/src/main/scala/radiance/tile/Barrier.scala +++ b/src/main/scala/radiance/tile/Barrier.scala @@ -14,12 +14,12 @@ import freechips.rocketchip.diplomacy._ case class EmptyParams() case class BarrierParams( - barrierIdBits: Int, - numCoreBits: Int + barrierIdBits: Int, + numCoreBits: Int ) class BarrierRequestBits( - param: BarrierParams + param: BarrierParams ) extends Bundle { val barrierId = UInt(param.barrierIdBits.W) val sizeMinusOne = UInt(param.numCoreBits.W) @@ -27,7 +27,7 @@ class BarrierRequestBits( } class BarrierResponseBits( - param: BarrierParams + param: BarrierParams ) extends Bundle { val barrierId = UInt(param.barrierIdBits.W) } @@ -38,28 +38,38 @@ class BarrierBundle(param: BarrierParams) extends Bundle { } // FIXME Separate BarrierEdgeParams from BarrierParams -object BarrierNodeImp extends SimpleNodeImp[BarrierParams, EmptyParams, BarrierParams, BarrierBundle] { - def edge(pd: BarrierParams, pu: EmptyParams, p: Parameters, sourceInfo: SourceInfo) = { - // barrier parameters flow strictly downward from the master node - pd +object BarrierNodeImp extends SimpleNodeImp[BarrierParams, BarrierParams, BarrierParams, BarrierBundle] { + def edge(pd: BarrierParams, pu: BarrierParams, p: Parameters, sourceInfo: SourceInfo) = { + println(s"==== BarrierNodeImp: barrierIdBits=${pd.barrierIdBits}, numCoreBits=${pu.numCoreBits}") + require(pd.barrierIdBits >= 0 && pu.numCoreBits >= 0) + BarrierParams(barrierIdBits = pd.barrierIdBits, numCoreBits = pu.numCoreBits) } def bundle(e: BarrierParams) = new BarrierBundle(e) // FIXME render def render(e: BarrierParams) = RenderedEdge(colour = "ffffff", label = "X") } -case class BarrierMasterNode(val srcParams: BarrierParams)(implicit valName: ValName) - extends SourceNode(BarrierNodeImp)(Seq(srcParams)) -case class BarrierSlaveNode(val numEdges: Int)(implicit valName: ValName) - extends SinkNode(BarrierNodeImp)(Seq.fill(numEdges)(EmptyParams())) +case class BarrierMasterNode(val barrierIdBits: Int)(implicit valName: ValName) + extends SourceNode(BarrierNodeImp)({ + require(barrierIdBits >= 0) + Seq(BarrierParams(barrierIdBits = barrierIdBits, numCoreBits = -1 /* unset */)) + }) +case class BarrierSlaveNode(val numCores: Int)(implicit valName: ValName) + extends SinkNode(BarrierNodeImp)({ + require(numCores > 0) + val numCoreBits = log2Ceil(numCores) + Seq.fill(numCores)( + BarrierParams(barrierIdBits = -1 /* unset */, numCoreBits = numCoreBits) + ) + }) // `delay`: number of cycles used to delay the response after all cores are // synchronized. This is used for debugging purposes to give some time for the // cores to "settle" after the barrier synchronization, e.g. resolve // outstanding smem requests. class BarrierSynchronizer( - param: BarrierParams, - delay: Option[Int] = None + param: BarrierParams, + delay: Option[Int] = None ) extends Module { val numBarriers = 1 << param.barrierIdBits val numCores = 1 << param.numCoreBits @@ -71,13 +81,15 @@ class BarrierSynchronizer( }) // 2-dimensional table of per-id, per-core "done" signals - val table = RegInit(VecInit(Seq.fill(numBarriers)(VecInit(Seq.fill(numCores)(false.B))))) + val table = RegInit( + VecInit(Seq.fill(numBarriers)(VecInit(Seq.fill(numCores)(false.B)))) + ) val done = Seq.fill(numBarriers)(Wire(Bool())) val delayer = delay.map(n => Seq.fill(numBarriers)(Counter(n))) (table zip done).zipWithIndex.foreach { case ((row, d), i) => d := row.reduce(_ && _) - delayer.foreach{ dl => when (d) { dl(i).inc() }} + delayer.foreach { dl => when(d) { dl(i).inc() } } dontTouch(d) } @@ -86,7 +98,7 @@ class BarrierSynchronizer( req.ready := true.B when(req.fire) { assert(coreId.U === req.bits.coreId) - // FIXME: don't need coreId to be hardware here + // @cleanup: coreId don't need to be hardware table(req.bits.barrierId)(coreId.U) := true.B } } @@ -95,7 +107,7 @@ class BarrierSynchronizer( (doneArbiter.io.in zip done).zipWithIndex.foreach { case ((in, d), i) => val alarm = delayer match { case Some(dl) => dl(i).value === (dl(i).n - 1).U - case None => true.B + case None => true.B } in.valid := (d && alarm) in.bits := d diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index f45607f..01584d3 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -102,6 +102,7 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( // cores are configured to have the same barrier id range. While true, might // be better to actually assert this val barrierParam = outer.barrierSlaveNode.in(0)._2 + println(s"======= barrierParam: ${barrierParam}") val synchronizer = Module(new BarrierSynchronizer(barrierParam)) (synchronizer.io.reqs zip outer.barrierSlaveNode.in).foreach { case (req, (b, _)) => req <> b.req diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 7d323ee..defb3f0 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -329,11 +329,8 @@ class RadianceTile private ( // Barrier synchronization node // FIXME: hardcoded params val numBarriers = 8 - val numCores = 2 def barrierIdBits = log2Ceil(numBarriers) - def coreIdBits = log2Ceil(numCores) - val barrierParams = BarrierParams(barrierIdBits = barrierIdBits, numCoreBits = coreIdBits) - val barrierMasterNode = BarrierMasterNode(barrierParams) + val barrierMasterNode = BarrierMasterNode(barrierIdBits) val base = p(GPUMemory()) match { case Some(GPUMemParams(baseAddr, _)) => baseAddr @@ -786,22 +783,6 @@ class RadianceTileModuleImp(outer: RadianceTile) // } } -class ClusterSynchronizer( - barrierIdWidth: Int, - numCoreWidth: Int, -) extends Module { - val io = IO(new Bundle { - val req = Flipped(Decoupled(new Bundle { - val barrierId = UInt(barrierIdWidth.W) - val sizeMinusOne = UInt(numCoreWidth.W) - val coreId = UInt(numCoreWidth.W) - })) - val resp = Decoupled(new Bundle { - val barrierId = UInt(barrierIdWidth.W) - }) - }) -} - // Some @copypaste from CoalescerSourceGen. class VortexTLAdapter( newSourceWidth: Int, diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index b9634aa..ab2e1ef 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -90,13 +90,15 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W)) // FIXME: hardcoded + val barrierIdBits = tile.barrierMasterNode.out(0)._2.barrierIdBits + val coreIdBits = tile.barrierMasterNode.out(0)._2.numCoreBits val gbar_req_valid = Output(Bool()) - val gbar_req_id = Output(UInt(tile.barrierIdBits.W)) - val gbar_req_size_m1 = Output(UInt(tile.coreIdBits.W)) - val gbar_req_core_id = Output(UInt(tile.coreIdBits.W)) + val gbar_req_id = Output(UInt(barrierIdBits.W)) + val gbar_req_size_m1 = Output(UInt(coreIdBits.W)) + val gbar_req_core_id = Output(UInt(coreIdBits.W)) val gbar_req_ready = Input(Bool()) val gbar_rsp_valid = Input(Bool()) - val gbar_rsp_id = Input(UInt(tile.barrierIdBits.W)) + val gbar_rsp_id = Input(UInt(barrierIdBits.W)) // val fpu = Flipped(new FPUCoreIO()) //val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs))