From 447977bd89c04fd894135076741aefc868c73f39 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 15:02:08 -0700 Subject: [PATCH 01/47] addResource hopper tensor core --- src/main/scala/radiance/tile/VortexCore.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index d202aaa..d6561e3 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -396,6 +396,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // tensor core addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv") + addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv") // addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh") addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv") From 6a3aa549d34ffbafd4154081fb063be847452114 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 15:02:25 -0700 Subject: [PATCH 02/47] Add skeleton for Hopper Tensor Core --- .../radiance/core/TensorCoreDecoupled.scala | 70 +++++++++++++++++++ .../radiance/TensorCoreDecoupledTest.scala | 23 ++++++ 2 files changed, 93 insertions(+) create mode 100644 src/main/scala/radiance/core/TensorCoreDecoupled.scala create mode 100644 src/test/scala/radiance/TensorCoreDecoupledTest.scala diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala new file mode 100644 index 0000000..10bfedb --- /dev/null +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -0,0 +1,70 @@ +// See LICENSE.SiFive for license details. +// See LICENSE.Berkeley for license details. 
+ +package radiance.core + +import chisel3._ +import chisel3.util._ + +class TensorCoreDecoupled(val numWarps: Int, val numLanes: Int) extends Module { + val numWarpBits = log2Ceil(numWarps) + val wordSize = 4 // TODO FP16 + val dataWidth = numLanes * wordSize // TODO FP16 + + val io = IO(new Bundle{ + val initiate = Flipped(Decoupled(new Bundle{ + val wid = UInt(numWarpBits.W) + })) + val dataA = Flipped(Decoupled(new TensorMemResp(dataWidth))) + val dataB = Flipped(Decoupled(new TensorMemResp(dataWidth))) + val addressA = Decoupled(new TensorMemReq) + val addressB = Decoupled(new TensorMemReq) + val writeback = Decoupled(new Bundle{ + val wid = UInt(numWarpBits.W) + val last = Bool() + }) + }) + + // FSM + // + val state = RegInit(TensorState.idle) + // TODO: just transition every cycle for now + state := (state match { + case TensorState.idle => Mux(io.initiate.fire, TensorState.smemRead, state) + case TensorState.smemRead => TensorState.compute + case TensorState.compute => TensorState.writeback + case TensorState.writeback => { + // hold until writeback is cleared + Mux(io.writeback.ready, TensorState.idle, state) + } + case _ => TensorState.idle + }) + + // TODO + io.dataA.ready := true.B + io.dataB.ready := true.B + io.addressA.valid := false.B + io.addressB.valid := false.B + io.addressA.bits := DontCare + io.addressB.bits := DontCare + io.initiate.ready := true.B + io.writeback.valid := true.B + io.writeback.bits := DontCare +} + +class TensorMemReq extends Bundle { + // TODO: tag + val address = UInt(32.W) +} +class TensorMemResp(val dataWidth: Int) extends Bundle { + // TODO: tag + val data = UInt(32.W) +} + + +object TensorState extends ChiselEnum { + val idle = Value(0.U) + val smemRead = Value(1.U) + val compute = Value(2.U) + val writeback = Value(3.U) +} diff --git a/src/test/scala/radiance/TensorCoreDecoupledTest.scala b/src/test/scala/radiance/TensorCoreDecoupledTest.scala new file mode 100644 index 0000000..5dd734a --- /dev/null +++ 
b/src/test/scala/radiance/TensorCoreDecoupledTest.scala @@ -0,0 +1,23 @@ +package radiance.core + +import chisel3._ +import chisel3.util._ +import chiseltest._ +import org.scalatest.flatspec.AnyFlatSpec + +class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester { + behavior of "TensorCoreDecoupled" + + it should "do the right thing" in { + test(new TensorCoreDecoupled(8, 8)) + { c => + c.io.initiate.valid.poke(true.B) + c.io.dataA.valid.poke(false.B) + c.io.dataA.bits.data.poke(0.U) + c.io.dataB.valid.poke(false.B) + c.io.dataB.bits.data.poke(0.U) + c.clock.step() + c.io.writeback.valid.expect(true.B) + } + } +} From 327615e330a83558af4445b79510d656e24add0a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 17:28:51 -0700 Subject: [PATCH 03/47] Add state regs and init/writeback transition --- .../radiance/core/TensorCoreDecoupled.scala | 134 +++++++++++++----- .../radiance/TensorCoreDecoupledTest.scala | 13 +- 2 files changed, 109 insertions(+), 38 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 10bfedb..87657d5 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -6,50 +6,126 @@ package radiance.core import chisel3._ import chisel3.util._ -class TensorCoreDecoupled(val numWarps: Int, val numLanes: Int) extends Module { +case class TensorTilingParams( + // Dimension of the SMEM tile + m: Int = 16, + n: Int = 16, + k: Int = 16, + // Dimension of the compute tile. 
This is determined by the number of MAC + // units + mc: Int = 4, + nc: Int = 4, + kc: Int = 4 +) + +class TensorCoreDecoupled( + val numWarps: Int, + val numLanes: Int, + val tilingParams: TensorTilingParams +) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 val dataWidth = numLanes * wordSize // TODO FP16 - val io = IO(new Bundle{ - val initiate = Flipped(Decoupled(new Bundle{ + val io = IO(new Bundle { + val initiate = Flipped(Decoupled(new Bundle { val wid = UInt(numWarpBits.W) })) - val dataA = Flipped(Decoupled(new TensorMemResp(dataWidth))) - val dataB = Flipped(Decoupled(new TensorMemResp(dataWidth))) - val addressA = Decoupled(new TensorMemReq) - val addressB = Decoupled(new TensorMemReq) - val writeback = Decoupled(new Bundle{ + val writeback = Decoupled(new Bundle { val wid = UInt(numWarpBits.W) val last = Bool() }) + val respA = Flipped(Decoupled(new TensorMemResp(dataWidth))) + val respB = Flipped(Decoupled(new TensorMemResp(dataWidth))) + val reqA = Decoupled(new TensorMemReq) + val reqB = Decoupled(new TensorMemReq) }) // FSM - // + // --- + // This drives the overall pipeline of memory requests, dot-product unit + // operations and regfile writeback. + + object TensorState extends ChiselEnum { + val idle = Value(0.U) + val run = Value(1.U) + // All set/step sequencing is complete and the tensor core is holding the + // result data until downstream writeback is ready. + // FIXME: is this necessary if writeback is decoupled with queues? + val finish = Value(2.U) + } val state = RegInit(TensorState.idle) + val busy = RegInit(false.B) + // Holds the warp id the core is currently working on. 
Note that we only + // support one outstanding warp request + val warpReg = RegInit(0.U(numWarpBits.W)) + // TODO: just transition every cycle for now - state := (state match { - case TensorState.idle => Mux(io.initiate.fire, TensorState.smemRead, state) - case TensorState.smemRead => TensorState.compute - case TensorState.compute => TensorState.writeback - case TensorState.writeback => { + def nextState(state: TensorState.Type) = state match { + case TensorState.idle => Mux(io.initiate.fire, TensorState.run, state) + case TensorState.run => TensorState.finish + case TensorState.finish => { // hold until writeback is cleared Mux(io.writeback.ready, TensorState.idle, state) } case _ => TensorState.idle - }) + } + state := nextState(state) - // TODO - io.dataA.ready := true.B - io.dataB.ready := true.B - io.addressA.valid := false.B - io.addressB.valid := false.B - io.addressA.bits := DontCare - io.addressB.bits := DontCare - io.initiate.ready := true.B - io.writeback.valid := true.B - io.writeback.bits := DontCare + // state table for every warp id + // sets: k iteration + val numSets = (tilingParams.k / tilingParams.kc) + val setBits = log2Ceil(numSets) + // steps: i-j iteration + val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc) + val stepBits = log2Ceil(numSteps) + val setReg = RegInit(0.U(setBits.W)) + val stepReg = RegInit(0.U(setBits.W)) + // val tableRow = Valid(new Bundle { + // val set = UInt(setBits.W) + // val step = UInt(stepBits.W) + // }) + + when(io.initiate.fire) { + val wid = io.initiate.bits.wid + busy := true.B + warpReg := wid + setReg := 0.U + stepReg := 0.U + when(io.writeback.fire) { + assert(io.writeback.bits.wid =/= wid, + "unsupported concurrent initiate and writeback to the same warp") + } + } + when (io.writeback.fire) { + busy := false.B + } + + io.initiate.ready := !busy + + // Writeback queues + // ---------------- + // These queues hold the metadata necessary for register + // writeback. 
+ + // val queueDepth = 2 + // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) + // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) + + // Output logic + // ------------ + + io.writeback.valid := (state === TensorState.finish) + io.writeback.bits.wid := warpReg + io.writeback.bits.last := false.B // TODO + + // FIXME + io.respA.ready := true.B + io.respB.ready := true.B + io.reqA.valid := false.B + io.reqB.valid := false.B + io.reqA.bits := DontCare + io.reqB.bits := DontCare } class TensorMemReq extends Bundle { @@ -60,11 +136,3 @@ class TensorMemResp(val dataWidth: Int) extends Bundle { // TODO: tag val data = UInt(32.W) } - - -object TensorState extends ChiselEnum { - val idle = Value(0.U) - val smemRead = Value(1.U) - val compute = Value(2.U) - val writeback = Value(3.U) -} diff --git a/src/test/scala/radiance/TensorCoreDecoupledTest.scala b/src/test/scala/radiance/TensorCoreDecoupledTest.scala index 5dd734a..b1e0e9a 100644 --- a/src/test/scala/radiance/TensorCoreDecoupledTest.scala +++ b/src/test/scala/radiance/TensorCoreDecoupledTest.scala @@ -9,13 +9,16 @@ class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "TensorCoreDecoupled" it should "do the right thing" in { - test(new TensorCoreDecoupled(8, 8)) + test(new TensorCoreDecoupled(8, 8, tilingParams = TensorTilingParams())) { c => c.io.initiate.valid.poke(true.B) - c.io.dataA.valid.poke(false.B) - c.io.dataA.bits.data.poke(0.U) - c.io.dataB.valid.poke(false.B) - c.io.dataB.bits.data.poke(0.U) + c.io.initiate.bits.wid.poke(0.U) + + c.io.respA.valid.poke(false.B) + c.io.respA.bits.data.poke(0.U) + c.io.respB.valid.poke(false.B) + c.io.respB.bits.data.poke(0.U) + c.clock.step() c.io.writeback.valid.expect(true.B) } From 3165108c8bad0f968278c82ede73f850ae4deaa0 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 19:47:00 -0700 Subject: [PATCH 04/47] Add synthesizable unit test for tensor --- 
.../radiance/core/TensorCoreDecoupled.scala | 23 +++++++++- .../UnitTest.scala => unittest/Configs.scala} | 46 ++++++++++++++++--- 2 files changed, 62 insertions(+), 7 deletions(-) rename src/main/scala/radiance/{memory/UnitTest.scala => unittest/Configs.scala} (64%) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 87657d5..4744266 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -5,6 +5,8 @@ package radiance.core import chisel3._ import chisel3.util._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.unittest.UnitTest case class TensorTilingParams( // Dimension of the SMEM tile @@ -62,7 +64,7 @@ class TensorCoreDecoupled( // TODO: just transition every cycle for now def nextState(state: TensorState.Type) = state match { - case TensorState.idle => Mux(io.initiate.fire, TensorState.run, state) + case TensorState.idle => Mux(io.initiate.fire, TensorState.run, state) case TensorState.run => TensorState.finish case TensorState.finish => { // hold until writeback is cleared @@ -136,3 +138,22 @@ class TensorMemResp(val dataWidth: Int) extends Bundle { // TODO: tag val data = UInt(32.W) } + +// synthesizable unit tests + +class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters) + extends UnitTest(timeout) { + val dut = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams())) + dut.io.initiate.valid := io.start + dut.io.initiate.bits.wid := 0.U + // TODO + dut.io.respA.valid := false.B + dut.io.respA.bits := DontCare + dut.io.respB.valid := false.B + dut.io.respB.bits := DontCare + dut.io.reqA.ready := true.B + dut.io.reqB.ready := true.B + dut.io.writeback.ready := true.B + + io.finished := dut.io.writeback.valid +} diff --git a/src/main/scala/radiance/memory/UnitTest.scala b/src/main/scala/radiance/unittest/Configs.scala similarity index 64% rename from 
src/main/scala/radiance/memory/UnitTest.scala rename to src/main/scala/radiance/unittest/Configs.scala index c070ef4..065045c 100644 --- a/src/main/scala/radiance/memory/UnitTest.scala +++ b/src/main/scala/radiance/unittest/Configs.scala @@ -1,6 +1,6 @@ // See LICENSE.SiFive for license details. -package radiance.memory +package radiance.unittest import chisel3._ import org.chipsalliance.cde.config._ @@ -8,6 +8,8 @@ import freechips.rocketchip.subsystem.{BaseSubsystemConfig} import freechips.rocketchip.devices.tilelink._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.util._ +import radiance.core.TensorCoreDecoupledTest +import radiance.memory._ import radiance.subsystem.WithSimtConfig import freechips.rocketchip.unittest._ //import rocket.VortexFatBankTest @@ -17,6 +19,16 @@ case object TestDurationMultiplier extends Field[Int] class WithTestDuration(x: Int) extends Config((site, here, up) => { case TestDurationMultiplier => x }) + +class WithTensorUnitTests extends Config((site, _, _) => { + case UnitTests => (q: Parameters) => { + implicit val p = q + val timeout = 50000 * site(TestDurationMultiplier) + Seq( + Module(new TensorCoreDecoupledTest(timeout=timeout)), + ) } +}) + class WithCoalescingUnitTests extends Config((site, _, _) => { case UnitTests => (q: Parameters) => { implicit val p = q @@ -52,12 +64,34 @@ class WithCoalescingUnitSynthesisDummy(nLanes: Int) extends Config((site, _, _) ) } }) -class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nMemLanes=4) ++ new BaseSubsystemConfig) +class TensorUnitTestConfig extends Config( + new WithTensorUnitTests ++ + new WithTestDuration(10) ++ + new BaseSubsystemConfig) + +class CoalescingUnitTestConfig extends Config( + new WithCoalescingUnitTests ++ + new WithTestDuration(10) ++ + new WithSimtConfig(nMemLanes=4) ++ + new BaseSubsystemConfig) + //class VortexFatBankUnitTestConfig extends Config(new 
WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nLanes=4) ++ new BaseSubsystemConfig) // Dummy configs of various sizes for synthesis -class CoalescingSynthesisDummyLane4Config extends Config(new WithCoalescingUnitSynthesisDummy(4) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig) -class CoalescingSynthesisDummyLane8Config extends Config(new WithCoalescingUnitSynthesisDummy(8) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig) -class CoalescingSynthesisDummyLane16Config extends Config(new WithCoalescingUnitSynthesisDummy(16) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig) -class CoalescingSynthesisDummyLane32Config extends Config(new WithCoalescingUnitSynthesisDummy(32) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig) +class CoalescingSynthesisDummyLane4Config extends Config( + new WithCoalescingUnitSynthesisDummy(4) ++ + new WithTestDuration(10) ++ + new BaseSubsystemConfig) +class CoalescingSynthesisDummyLane8Config extends Config( + new WithCoalescingUnitSynthesisDummy(8) ++ + new WithTestDuration(10) ++ + new BaseSubsystemConfig) +class CoalescingSynthesisDummyLane16Config extends Config( + new WithCoalescingUnitSynthesisDummy(16) ++ + new WithTestDuration(10) ++ + new BaseSubsystemConfig) +class CoalescingSynthesisDummyLane32Config extends Config( + new WithCoalescingUnitSynthesisDummy(32) ++ + new WithTestDuration(10) ++ + new BaseSubsystemConfig) From 01f53a8be1b2ce9a46a5e7941b21feca2e20df8f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 20:20:30 -0700 Subject: [PATCH 05/47] tensor: Sequence through set/steps --- .../radiance/core/TensorCoreDecoupled.scala | 78 +++++++++++-------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 4744266..935ed40 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ 
b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -42,6 +42,7 @@ class TensorCoreDecoupled( val reqA = Decoupled(new TensorMemReq) val reqB = Decoupled(new TensorMemReq) }) + dontTouch(io) // FSM // --- @@ -62,48 +63,70 @@ class TensorCoreDecoupled( // support one outstanding warp request val warpReg = RegInit(0.U(numWarpBits.W)) - // TODO: just transition every cycle for now - def nextState(state: TensorState.Type) = state match { - case TensorState.idle => Mux(io.initiate.fire, TensorState.run, state) - case TensorState.run => TensorState.finish - case TensorState.finish => { - // hold until writeback is cleared - Mux(io.writeback.ready, TensorState.idle, state) - } - case _ => TensorState.idle - } - state := nextState(state) - - // state table for every warp id // sets: k iteration val numSets = (tilingParams.k / tilingParams.kc) val setBits = log2Ceil(numSets) // steps: i-j iteration val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc) val stepBits = log2Ceil(numSteps) - val setReg = RegInit(0.U(setBits.W)) - val stepReg = RegInit(0.U(setBits.W)) - // val tableRow = Valid(new Bundle { - // val set = UInt(setBits.W) - // val step = UInt(stepBits.W) - // }) + val set = RegInit(0.U(setBits.W)) + val step = RegInit(0.U(stepBits.W)) when(io.initiate.fire) { val wid = io.initiate.bits.wid busy := true.B warpReg := wid - setReg := 0.U - stepReg := 0.U + set := 0.U + step := 0.U when(io.writeback.fire) { - assert(io.writeback.bits.wid =/= wid, - "unsupported concurrent initiate and writeback to the same warp") + assert( + io.writeback.bits.wid =/= wid, + "unsupported concurrent initiate and writeback to the same warp" + ) } } - when (io.writeback.fire) { + when(io.writeback.fire) { busy := false.B } + // set/step sequencing logic + val nextStep = true.B // TODO + val lastSet = ((1 << setBits) - 1) + val lastStep = ((1 << stepBits) - 1) + val setDone = (set === lastSet.U) + val stepDone = (step === lastStep.U) + when 
(nextStep) { + step := (step + 1.U) & lastStep.U + when (stepDone) { + set := (set + 1.U) & lastSet.U + } + } + + // state transition logic + switch(state) { + is(TensorState.idle) { + when(io.initiate.fire) { + state := TensorState.run + } + } + is(TensorState.run) { + when (setDone && stepDone && nextStep) { + when (state === TensorState.run) { + state := TensorState.finish + } + } + } + is(TensorState.finish) { + when(io.writeback.fire) { + state := TensorState.idle + } + } + } + io.initiate.ready := !busy + io.writeback.valid := (state === TensorState.finish) + io.writeback.bits.wid := warpReg + io.writeback.bits.last := false.B // TODO // Writeback queues // ---------------- @@ -114,13 +137,6 @@ class TensorCoreDecoupled( // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) - // Output logic - // ------------ - - io.writeback.valid := (state === TensorState.finish) - io.writeback.bits.wid := warpReg - io.writeback.bits.last := false.B // TODO - // FIXME io.respA.ready := true.B io.respB.ready := true.B From 9ac8f2492cf89d73c3ac8f8612bb4d03d9871e35 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 20:54:24 -0700 Subject: [PATCH 06/47] tensor: Minimal diplomacy config for unittest --- .../radiance/core/TensorCoreDecoupled.scala | 80 +++++++++++++++---- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 935ed40..67d1c31 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -6,7 +6,10 @@ package radiance.core import chisel3._ import chisel3.util._ import org.chipsalliance.cde.config.Parameters -import freechips.rocketchip.unittest.UnitTest +import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} +import 
freechips.rocketchip.tilelink._ +import freechips.rocketchip.diplomacy.AddressSet +import freechips.rocketchip.unittest.{UnitTest, UnitTestModule} case class TensorTilingParams( // Dimension of the SMEM tile @@ -157,19 +160,68 @@ class TensorMemResp(val dataWidth: Int) extends Bundle { // synthesizable unit tests +// wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy +// network. +class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule { + val node = TLClientNode(Seq(TLMasterPortParameters.v2( + Seq(TLMasterParameters.v2( + name = "TensorCoreDecoupledClientNode", + // sourceId : TODO + )) + ))) + + lazy val module = new TensorCoreDecoupledTLImp(this) +} + +class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) + extends LazyModuleImp(outer) with UnitTestModule { + val tensor = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams())) + + require(outer.node.out.length == 1) + + val (tlOut, edge) = outer.node.out(0) + tlOut.a.valid := tensor.io.reqA.valid + tlOut.a.bits.address := tensor.io.reqA.bits.address + tlOut.a.bits.source := 0.U // TODO: tensor.io.reqA.bits.source + tensor.io.respA.valid := tlOut.d.valid + tensor.io.respA.bits.data := tlOut.d.bits.data + // TODO: tensor.io.respA.bits.source := tlOut.d.bits.source + + tensor.io.initiate.valid := io.start + tensor.io.initiate.bits.wid := 0.U + // TODO + tensor.io.respA.valid := false.B + tensor.io.respA.bits := DontCare + tensor.io.respB.valid := false.B + tensor.io.respB.bits := DontCare + tensor.io.reqA.ready := true.B + tensor.io.reqB.ready := true.B + tensor.io.writeback.ready := true.B + + io.finished := tensor.io.writeback.valid +} + +// a minimal Diplomacy graph with a tensor core and a TLRAM +class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule { + val tensor = LazyModule(new TensorCoreDecoupledTL) + val ram = LazyModule(new TLRAM( + address = AddressSet(0x0000, 0xffffff), + beatBytes = 32 // FIXME: hardcoded + )) + + ram.node :=* 
tensor.node + + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with UnitTestModule { + tensor.module.io.start := io.start + io.finished := tensor.module.io.finished + } +} + +// unit test harness class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams())) - dut.io.initiate.valid := io.start - dut.io.initiate.bits.wid := 0.U - // TODO - dut.io.respA.valid := false.B - dut.io.respA.bits := DontCare - dut.io.respB.valid := false.B - dut.io.respB.bits := DontCare - dut.io.reqA.ready := true.B - dut.io.reqB.ready := true.B - dut.io.writeback.ready := true.B - - io.finished := dut.io.writeback.valid + val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module) + dut.io.start := io.start + io.finished := dut.io.finished } From bf6f7210b7cf591c118c779727fce73142c21be6 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 21:14:51 -0700 Subject: [PATCH 07/47] tensor: Generate TL traffic, separate edges for A and B --- .../radiance/core/TensorCoreDecoupled.scala | 66 +++++++++++++------ 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 67d1c31..e61b542 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -101,10 +101,19 @@ class TensorCoreDecoupled( when (nextStep) { step := (step + 1.U) & lastStep.U when (stepDone) { - set := (set + 1.U) & lastSet.U + set := (set + 1.U) & lastSet.U } } + // memory traffic generation + io.reqA.valid := (state === TensorState.run) // FIXME + io.reqA.bits.address := 0.U // FIXME + io.respA.ready := true.B + io.respB.ready := true.B + // FIXME + io.reqB.valid := false.B + io.reqB.bits := DontCare + // state transition logic switch(state) { is(TensorState.idle) { @@ -139,14 +148,6 @@ 
class TensorCoreDecoupled( // val queueDepth = 2 // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) - - // FIXME - io.respA.ready := true.B - io.respB.ready := true.B - io.reqA.valid := false.B - io.reqB.valid := false.B - io.reqA.bits := DontCare - io.reqB.bits := DontCare } class TensorMemReq extends Bundle { @@ -163,12 +164,21 @@ class TensorMemResp(val dataWidth: Int) extends Bundle { // wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy // network. class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule { - val node = TLClientNode(Seq(TLMasterPortParameters.v2( - Seq(TLMasterParameters.v2( - name = "TensorCoreDecoupledClientNode", - // sourceId : TODO - )) - ))) + // node with two edges; one for A and one for B matrix + val node = TLClientNode(Seq( + TLMasterPortParameters.v2( + Seq(TLMasterParameters.v2( + name = "TensorCoreDecoupledMatrixANode", + // sourceId : TODO + )) + ), + TLMasterPortParameters.v2( + Seq(TLMasterParameters.v2( + name = "TensorCoreDecoupledMatrixBNode", + // sourceId : TODO + )) + ) + )) lazy val module = new TensorCoreDecoupledTLImp(this) } @@ -176,13 +186,28 @@ class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule { class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) extends LazyModuleImp(outer) with UnitTestModule { val tensor = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams())) + val wordSize = 4 // FIXME: hardcoded - require(outer.node.out.length == 1) + require(outer.node.out.length == 2/*A and B*/) val (tlOut, edge) = outer.node.out(0) - tlOut.a.valid := tensor.io.reqA.valid - tlOut.a.bits.address := tensor.io.reqA.bits.address - tlOut.a.bits.source := 0.U // TODO: tensor.io.reqA.bits.source + val (tlOutB, edgeB) = outer.node.out(1) + + val zip = List((outer.node.out(0), tensor.io.reqA), + (outer.node.out(1), tensor.io.reqB)) + zip.foreach { case ((tl, edge), 
req) => + tl.a.valid := req.valid + val (legal, bits) = edge.Get( + fromSource = 0.U, // TODO: sourceGen.io.id.bits, + toAddress = req.bits.address, + lgSize = log2Ceil(wordSize).U + ) + tl.a.bits := bits + when(tl.a.fire) { + assert(legal, "illegal TL req gen") + } + } + tensor.io.respA.valid := tlOut.d.valid tensor.io.respA.bits.data := tlOut.d.bits.data // TODO: tensor.io.respA.bits.source := tlOut.d.bits.source @@ -204,12 +229,13 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) // a minimal Diplomacy graph with a tensor core and a TLRAM class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule { val tensor = LazyModule(new TensorCoreDecoupledTL) + val xbar = LazyModule(new TLXbar) val ram = LazyModule(new TLRAM( address = AddressSet(0x0000, 0xffffff), beatBytes = 32 // FIXME: hardcoded )) - ram.node :=* tensor.node + ram.node :=* xbar.node :=* tensor.node lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { From 14a640bf2d9a29be63666b9a56c0342ee1e717b7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 21:38:54 -0700 Subject: [PATCH 08/47] tensor: Do proper source generation SourceGenerator keeps on givin' --- .../radiance/core/TensorCoreDecoupled.scala | 88 ++++++++++++------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index e61b542..7fa05ee 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -8,8 +8,9 @@ import chisel3.util._ import org.chipsalliance.cde.config.Parameters import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink._ -import freechips.rocketchip.diplomacy.AddressSet +import freechips.rocketchip.diplomacy.{IdRange, AddressSet} import freechips.rocketchip.unittest.{UnitTest, UnitTestModule} +import 
radiance.memory.SourceGenerator case class TensorTilingParams( // Dimension of the SMEM tile @@ -26,11 +27,13 @@ case class TensorTilingParams( class TensorCoreDecoupled( val numWarps: Int, val numLanes: Int, + val numSourceIds: Int, val tilingParams: TensorTilingParams ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 val dataWidth = numLanes * wordSize // TODO FP16 + val sourceWidth = log2Ceil(numSourceIds) val io = IO(new Bundle { val initiate = Flipped(Decoupled(new Bundle { @@ -40,10 +43,10 @@ class TensorCoreDecoupled( val wid = UInt(numWarpBits.W) val last = Bool() }) - val respA = Flipped(Decoupled(new TensorMemResp(dataWidth))) - val respB = Flipped(Decoupled(new TensorMemResp(dataWidth))) - val reqA = Decoupled(new TensorMemReq) - val reqB = Decoupled(new TensorMemReq) + val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) + val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) + val reqA = Decoupled(new TensorMemReq(sourceWidth)) + val reqB = Decoupled(new TensorMemReq(sourceWidth)) }) dontTouch(io) @@ -106,13 +109,25 @@ class TensorCoreDecoupled( } // memory traffic generation - io.reqA.valid := (state === TensorState.run) // FIXME - io.reqA.bits.address := 0.U // FIXME + val genReq = (state === TensorState.run) + + List((io.reqA, io.respA), (io.reqB, io.respB)).foreach { + case (req, resp) => { + val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds))) + + sourceGen.io.gen := req.fire + sourceGen.io.meta := DontCare + req.valid := genReq + req.bits.address := 0.U // FIXME + req.bits.source := sourceGen.io.id.bits + + sourceGen.io.reclaim.valid := resp.fire + sourceGen.io.reclaim.bits := resp.bits.source + } + } + io.respA.ready := true.B io.respB.ready := true.B - // FIXME - io.reqB.valid := false.B - io.reqB.bits := DontCare // state transition logic switch(state) { @@ -150,12 +165,17 @@ class TensorCoreDecoupled( // val rdQueue = Queue(io.initiate, queueDepth, 
pipe = (queueDepth == 1)) } -class TensorMemReq extends Bundle { - // TODO: tag +class TensorMemReq( + sourceWidth: Int +) extends Bundle { + val source = UInt(sourceWidth.W) val address = UInt(32.W) } -class TensorMemResp(val dataWidth: Int) extends Bundle { - // TODO: tag +class TensorMemResp( + sourceWidth: Int, + dataWidth: Int +) extends Bundle { + val source = UInt(sourceWidth.W) val data = UInt(32.W) } @@ -164,18 +184,20 @@ class TensorMemResp(val dataWidth: Int) extends Bundle { // wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy // network. class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule { + val numSrcIds = 4 + // node with two edges; one for A and one for B matrix val node = TLClientNode(Seq( TLMasterPortParameters.v2( Seq(TLMasterParameters.v2( name = "TensorCoreDecoupledMatrixANode", - // sourceId : TODO + sourceId = IdRange(0, numSrcIds) )) ), TLMasterPortParameters.v2( Seq(TLMasterParameters.v2( name = "TensorCoreDecoupledMatrixBNode", - // sourceId : TODO + sourceId = IdRange(0, numSrcIds) )) ) )) @@ -185,42 +207,42 @@ class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule { class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) extends LazyModuleImp(outer) with UnitTestModule { - val tensor = Module(new TensorCoreDecoupled(8, 8, TensorTilingParams())) - val wordSize = 4 // FIXME: hardcoded - require(outer.node.out.length == 2/*A and B*/) - val (tlOut, edge) = outer.node.out(0) - val (tlOutB, edgeB) = outer.node.out(1) + val tensor = Module(new TensorCoreDecoupled( + 8, 8, outer.numSrcIds , TensorTilingParams())) + val wordSize = 4 // FIXME: hardcoded val zip = List((outer.node.out(0), tensor.io.reqA), (outer.node.out(1), tensor.io.reqB)) zip.foreach { case ((tl, edge), req) => tl.a.valid := req.valid val (legal, bits) = edge.Get( - fromSource = 0.U, // TODO: sourceGen.io.id.bits, + fromSource = req.bits.source, toAddress = req.bits.address, lgSize = log2Ceil(wordSize).U ) 
tl.a.bits := bits + req.ready := tl.a.ready when(tl.a.fire) { assert(legal, "illegal TL req gen") } } - tensor.io.respA.valid := tlOut.d.valid - tensor.io.respA.bits.data := tlOut.d.bits.data - // TODO: tensor.io.respA.bits.source := tlOut.d.bits.source + // TODO: dedup A and B + val (tlOutA, _) = outer.node.out(0) + val (tlOutB, _) = outer.node.out(1) + tensor.io.respA.valid := tlOutA.d.valid + tensor.io.respA.bits.data := tlOutA.d.bits.data + tensor.io.respA.bits.source := tlOutA.d.bits.source + tlOutA.d.ready := tensor.io.respA.ready + tensor.io.respB.valid := tlOutB.d.valid + tensor.io.respB.bits.data := tlOutB.d.bits.data + tensor.io.respB.bits.source := tlOutB.d.bits.source + tlOutB.d.ready := tensor.io.respB.ready tensor.io.initiate.valid := io.start - tensor.io.initiate.bits.wid := 0.U - // TODO - tensor.io.respA.valid := false.B - tensor.io.respA.bits := DontCare - tensor.io.respB.valid := false.B - tensor.io.respB.bits := DontCare - tensor.io.reqA.ready := true.B - tensor.io.reqB.ready := true.B + tensor.io.initiate.bits.wid := 0.U // FIXME tensor.io.writeback.ready := true.B io.finished := tensor.io.writeback.valid From 8d2e13b4ee660e1df22cb7838d1ad36d2f16966d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 22:06:58 -0700 Subject: [PATCH 09/47] tensor: Hold step until req fired for both A and B --- .../radiance/core/TensorCoreDecoupled.scala | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 7fa05ee..05fe576 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -95,23 +95,10 @@ class TensorCoreDecoupled( busy := false.B } - // set/step sequencing logic - val nextStep = true.B // TODO - val lastSet = ((1 << setBits) - 1) - val lastStep = ((1 << stepBits) - 1) - val setDone = (set === lastSet.U) - val stepDone 
= (step === lastStep.U) - when (nextStep) { - step := (step + 1.U) & lastStep.U - when (stepDone) { - set := (set + 1.U) & lastSet.U - } - } - // memory traffic generation val genReq = (state === TensorState.run) - List((io.reqA, io.respA), (io.reqB, io.respB)).foreach { + Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach { case (req, resp) => { val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds))) @@ -126,9 +113,35 @@ class TensorCoreDecoupled( } } + // only advance to the next step if we fired mem requests for both A and B + val firedABReg = RegInit(VecInit(false.B, false.B)) + val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map { + case (req, fired) => { when (req.fire) { fired := true.B } } + req.fire + }) + val firedAB = (firedABNow.asUInt | firedABReg.asUInt) + val nextStep = firedAB.andR + // clear out firedABReg every step. this will overwrite the previous fired + // write upon the last fire out of A and B + when (nextStep) { + firedABReg := Seq(false.B, false.B) + } + io.respA.ready := true.B io.respB.ready := true.B + // set/step sequencing logic + val lastSet = ((1 << setBits) - 1) + val lastStep = ((1 << stepBits) - 1) + val setDone = (set === lastSet.U) + val stepDone = (step === lastStep.U) + when (nextStep) { + step := (step + 1.U) & lastStep.U + when (stepDone) { + set := (set + 1.U) & lastSet.U + } + } + // state transition logic switch(state) { is(TensorState.idle) { @@ -213,8 +226,8 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) 8, 8, outer.numSrcIds , TensorTilingParams())) val wordSize = 4 // FIXME: hardcoded - val zip = List((outer.node.out(0), tensor.io.reqA), - (outer.node.out(1), tensor.io.reqB)) + val zip = Seq((outer.node.out(0), tensor.io.reqA), + (outer.node.out(1), tensor.io.reqB)) zip.foreach { case ((tl, edge), req) => tl.a.valid := req.valid val (legal, bits) = edge.Get( From 90949f488bda6c65f4e23cea496983ba5ec53923 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 14 Oct 2024 
22:34:11 -0700 Subject: [PATCH 10/47] tensor: Add memory response queue --- .../radiance/core/TensorCoreDecoupled.scala | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 05fe576..617659d 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -32,7 +32,7 @@ class TensorCoreDecoupled( ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 - val dataWidth = numLanes * wordSize // TODO FP16 + val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 val sourceWidth = log2Ceil(numSourceIds) val io = IO(new Bundle { @@ -40,8 +40,9 @@ class TensorCoreDecoupled( val wid = UInt(numWarpBits.W) })) val writeback = Decoupled(new Bundle { - val wid = UInt(numWarpBits.W) val last = Bool() + val wid = UInt(numWarpBits.W) + val data = Vec(numLanes, UInt(wordSize.W)) }) val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) @@ -95,7 +96,9 @@ class TensorCoreDecoupled( busy := false.B } - // memory traffic generation + // Memory traffic generation + // ------------------------- + // val genReq = (state === TensorState.run) Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach { @@ -127,9 +130,33 @@ class TensorCoreDecoupled( firedABReg := Seq(false.B, false.B) } - io.respA.ready := true.B - io.respB.ready := true.B + io.respA.ready := true.B // FIXME + io.respB.ready := true.B // FIXME + // Execute stage + // ------------- + // Execute backend of the decoupled access/execute pipeline. 
+ // + val respQueueDepth = 4 // FIXME: parameterize + val respQueueA = Queue(io.respA, respQueueDepth) + val respQueueB = Queue(io.respB, respQueueDepth) + respQueueA.ready := io.writeback.ready // FIXME + respQueueB.ready := io.writeback.ready // FIXME + + require(respQueueA.bits.data.widthOption.get == + io.writeback.bits.data.widthOption.get * numLanes, + "response data width does not match the writeback data width") + + // FIXME: debug dummy: pipe A directly to writeback + io.writeback.valid := respQueueA.valid + val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/) + (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) => + wb := VecInit(data).asUInt + } + + // State transition + // ---------------- + // // set/step sequencing logic val lastSet = ((1 << setBits) - 1) val lastStep = ((1 << stepBits) - 1) @@ -142,7 +169,6 @@ class TensorCoreDecoupled( } } - // state transition logic switch(state) { is(TensorState.idle) { when(io.initiate.fire) { @@ -189,13 +215,13 @@ class TensorMemResp( dataWidth: Int ) extends Bundle { val source = UInt(sourceWidth.W) - val data = UInt(32.W) + val data = UInt(dataWidth.W) } // synthesizable unit tests -// wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy -// network. +// wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy +// graph. 
class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule { val numSrcIds = 4 From ab8d3554bb134fd96e622cd3c0a13406adbdff34 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 15:45:52 -0700 Subject: [PATCH 11/47] Bump vortex to tensor-decoupled --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index da54162..4dcbc31 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit da54162241da020807274bd4087844d379d8170e +Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a From 2ca2ee37b0fffeb7225940b03c206f3237f10b85 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 15:45:59 -0700 Subject: [PATCH 12/47] tensor: Fix writeback datawidth --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 617659d..65246f6 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -42,7 +42,7 @@ class TensorCoreDecoupled( val writeback = Decoupled(new Bundle { val last = Bool() val wid = UInt(numWarpBits.W) - val data = Vec(numLanes, UInt(wordSize.W)) + val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W)) }) val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) @@ -135,7 +135,7 @@ class TensorCoreDecoupled( // Execute stage // ------------- - // Execute backend of the decoupled access/execute pipeline. + // Backend of the decoupled access/execute pipeline. 
// val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(io.respA, respQueueDepth) @@ -144,7 +144,7 @@ class TensorCoreDecoupled( respQueueB.ready := io.writeback.ready // FIXME require(respQueueA.bits.data.widthOption.get == - io.writeback.bits.data.widthOption.get * numLanes, + io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") // FIXME: debug dummy: pipe A directly to writeback From de393115cd97f812ceb91ef94598ca8d46570202 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 16:48:39 -0700 Subject: [PATCH 13/47] tensor: Translate TL response source to set/step tag --- .../radiance/core/TensorCoreDecoupled.scala | 79 +++++++++++++------ 1 file changed, 53 insertions(+), 26 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 65246f6..43dc1ca 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -32,8 +32,8 @@ class TensorCoreDecoupled( ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 - val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 val sourceWidth = log2Ceil(numSourceIds) + val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 val io = IO(new Bundle { val initiate = Flipped(Decoupled(new Bundle { @@ -51,6 +51,27 @@ class TensorCoreDecoupled( }) dontTouch(io) + class TensorMemReq( + sourceWidth: Int + ) extends Bundle { + val source = UInt(sourceWidth.W) + val address = UInt(32.W) + } + class TensorMemResp( + sourceWidth: Int, + dataWidth: Int + ) extends Bundle { + val source = UInt(sourceWidth.W) + val data = UInt(dataWidth.W) + } + // mem response after translation from TL source to set/step tag + class TensorMemRespWithTag( + dataWidth: Int + ) extends Bundle { + val tag = new TensorMemTag + val data = UInt(dataWidth.W) + } + // FSM // --- // This drives the 
overall pipeline of memory requests, dot-product unit @@ -101,18 +122,39 @@ class TensorCoreDecoupled( // val genReq = (state === TensorState.run) - Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach { - case (req, resp) => { - val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds))) + class TensorMemTag extends Bundle { + val set = UInt(setBits.W) + val step = UInt(stepBits.W) + } + // use concatenation of set/step as the memory request source. This will get + // translated to the actual TL sourcewidth in sourceGen. + val tag = Wire(new TensorMemTag) + tag.set := set + tag.step := step + + val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) + val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) + Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach { + case (req, (resp, respTagged)) => { + val sourceGen = Module(new SourceGenerator( + log2Ceil(numSourceIds), + metadata = Some(tag) + )) sourceGen.io.gen := req.fire - sourceGen.io.meta := DontCare + sourceGen.io.meta := tag req.valid := genReq req.bits.address := 0.U // FIXME req.bits.source := sourceGen.io.id.bits sourceGen.io.reclaim.valid := resp.fire sourceGen.io.reclaim.bits := resp.bits.source + + // translate source + respTagged.valid := resp.valid + respTagged.bits.tag := sourceGen.io.peek + respTagged.bits.data := resp.bits.data + resp.ready := respTagged.ready } } @@ -130,16 +172,13 @@ class TensorCoreDecoupled( firedABReg := Seq(false.B, false.B) } - io.respA.ready := true.B // FIXME - io.respB.ready := true.B // FIXME - // Execute stage // ------------- // Backend of the decoupled access/execute pipeline. 
// val respQueueDepth = 4 // FIXME: parameterize - val respQueueA = Queue(io.respA, respQueueDepth) - val respQueueB = Queue(io.respB, respQueueDepth) + val respQueueA = Queue(respATagged, respQueueDepth) + val respQueueB = Queue(respBTagged, respQueueDepth) respQueueA.ready := io.writeback.ready // FIXME respQueueB.ready := io.writeback.ready // FIXME @@ -149,9 +188,11 @@ class TensorCoreDecoupled( // FIXME: debug dummy: pipe A directly to writeback io.writeback.valid := respQueueA.valid - val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/) + val groupedRespA = respQueueA.bits.data + .asBools.grouped(wordSize * 8/*bits*/) + .map(VecInit(_).asUInt) (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) => - wb := VecInit(data).asUInt + wb := data } // State transition @@ -204,20 +245,6 @@ class TensorCoreDecoupled( // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) } -class TensorMemReq( - sourceWidth: Int -) extends Bundle { - val source = UInt(sourceWidth.W) - val address = UInt(32.W) -} -class TensorMemResp( - sourceWidth: Int, - dataWidth: Int -) extends Bundle { - val source = UInt(sourceWidth.W) - val data = UInt(dataWidth.W) -} - // synthesizable unit tests // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy From efaf599fbe679f0e5e7ef671522408f34984057e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 17:08:14 -0700 Subject: [PATCH 14/47] tensor: Assert alignment of A and B response queues --- .../radiance/core/TensorCoreDecoupled.scala | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 43dc1ca..4f5ecb3 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -97,15 +97,16 @@ class TensorCoreDecoupled( // steps: i-j iteration 
val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc) val stepBits = log2Ceil(numSteps) - val set = RegInit(0.U(setBits.W)) - val step = RegInit(0.U(stepBits.W)) + // set and step being currently accessed in the acc/ex frontend + val setAccess = RegInit(0.U(setBits.W)) + val stepAccess = RegInit(0.U(stepBits.W)) when(io.initiate.fire) { val wid = io.initiate.bits.wid busy := true.B warpReg := wid - set := 0.U - step := 0.U + setAccess := 0.U + stepAccess := 0.U when(io.writeback.fire) { assert( io.writeback.bits.wid =/= wid, @@ -129,8 +130,8 @@ class TensorCoreDecoupled( // use concatenation of set/step as the memory request source. This will get // translated to the actual TL sourcewidth in sourceGen. val tag = Wire(new TensorMemTag) - tag.set := set - tag.step := step + tag.set := setAccess + tag.step := stepAccess val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) @@ -176,16 +177,32 @@ class TensorCoreDecoupled( // ------------- // Backend of the decoupled access/execute pipeline. 
// + // set and step being currently executed in the acc/ex backend + val setExecute = RegInit(0.U(setBits.W)) + val stepExecute = RegInit(0.U(stepBits.W)) + val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) - respQueueA.ready := io.writeback.ready // FIXME - respQueueB.ready := io.writeback.ready // FIXME require(respQueueA.bits.data.widthOption.get == io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") + val bothQueueValid = (respQueueA.valid && respQueueB.valid) + // assume in-order response and that A/B responses are always aligned; this + // might be too strong an assumption depending on the backing memory + when (bothQueueValid) { + assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && + (respQueueA.bits.tag.step === respQueueB.bits.tag.step), + "A and B response queue pointing to different set/steps. " ++ + "This might indicate memory response coming back out-of-order.") + } + // synchronized dequeue + val deqResp = bothQueueValid && io.writeback.ready + respQueueA.ready := deqResp + respQueueB.ready := deqResp + // FIXME: debug dummy: pipe A directly to writeback io.writeback.valid := respQueueA.valid val groupedRespA = respQueueA.bits.data @@ -201,12 +218,12 @@ class TensorCoreDecoupled( // set/step sequencing logic val lastSet = ((1 << setBits) - 1) val lastStep = ((1 << stepBits) - 1) - val setDone = (set === lastSet.U) - val stepDone = (step === lastStep.U) + val setDone = (setAccess === lastSet.U) + val stepDone = (stepAccess === lastStep.U) when (nextStep) { - step := (step + 1.U) & lastStep.U + stepAccess := (stepAccess + 1.U) & lastStep.U when (stepDone) { - set := (set + 1.U) & lastSet.U + setAccess := (setAccess + 1.U) & lastSet.U } } From e2abe1cffdc3a658b0acc5b2cb36a82d5a3450ec Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 19:12:15 -0700 Subject: [PATCH 15/47] tensor: 
Sequence set/steps in the execute-side --- .../radiance/core/TensorCoreDecoupled.scala | 52 ++++++++++++------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 4f5ecb3..7c07564 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -97,9 +97,16 @@ class TensorCoreDecoupled( // steps: i-j iteration val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc) val stepBits = log2Ceil(numSteps) + val lastSet = ((1 << setBits) - 1) + val lastStep = ((1 << stepBits) - 1) + def setDone(set: UInt) = (set === lastSet.U) + def stepDone(step: UInt) = (step === lastStep.U) + // set and step being currently accessed in the acc/ex frontend val setAccess = RegInit(0.U(setBits.W)) val stepAccess = RegInit(0.U(stepBits.W)) + dontTouch(setAccess) + dontTouch(stepAccess) when(io.initiate.fire) { val wid = io.initiate.bits.wid @@ -118,6 +125,9 @@ class TensorCoreDecoupled( busy := false.B } + // serialize every HGMMA request + io.initiate.ready := !busy + // Memory traffic generation // ------------------------- // @@ -166,10 +176,10 @@ class TensorCoreDecoupled( req.fire }) val firedAB = (firedABNow.asUInt | firedABReg.asUInt) - val nextStep = firedAB.andR + val nextStepAccess = firedAB.andR // clear out firedABReg every step. 
this will overwrite the previous fired // write upon the last fire out of A and B - when (nextStep) { + when (nextStepAccess) { firedABReg := Seq(false.B, false.B) } @@ -180,6 +190,8 @@ class TensorCoreDecoupled( // set and step being currently executed in the acc/ex backend val setExecute = RegInit(0.U(setBits.W)) val stepExecute = RegInit(0.U(stepBits.W)) + dontTouch(setExecute) + dontTouch(stepExecute) val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) @@ -198,13 +210,19 @@ class TensorCoreDecoupled( "A and B response queue pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } - // synchronized dequeue + // dequeue is synchronized between A and B + // FIXME: this need to change to dpu_ready val deqResp = bothQueueValid && io.writeback.ready respQueueA.ready := deqResp respQueueB.ready := deqResp + // FIXME: this need to change to dpu_fire + val nextStepExecute = io.writeback.fire + + io.writeback.valid := bothQueueValid + io.writeback.bits.wid := warpReg + io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) // FIXME: debug dummy: pipe A directly to writeback - io.writeback.valid := respQueueA.valid val groupedRespA = respQueueA.bits.data .asBools.grouped(wordSize * 8/*bits*/) .map(VecInit(_).asUInt) @@ -216,16 +234,17 @@ class TensorCoreDecoupled( // ---------------- // // set/step sequencing logic - val lastSet = ((1 << setBits) - 1) - val lastStep = ((1 << stepBits) - 1) - val setDone = (setAccess === lastSet.U) - val stepDone = (stepAccess === lastStep.U) - when (nextStep) { - stepAccess := (stepAccess + 1.U) & lastStep.U - when (stepDone) { - setAccess := (setAccess + 1.U) & lastSet.U + + def sequenceSetStep(set: UInt, step: UInt, nextStep: Bool) = { + when (nextStep) { + step := (step + 1.U) & lastStep.U + when (stepDone(step)) { + set := (set + 1.U) & lastSet.U + } } } + sequenceSetStep(setAccess, stepAccess, nextStepAccess) + 
sequenceSetStep(setExecute, stepExecute, nextStepExecute) switch(state) { is(TensorState.idle) { @@ -234,7 +253,7 @@ class TensorCoreDecoupled( } } is(TensorState.run) { - when (setDone && stepDone && nextStep) { + when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) { when (state === TensorState.run) { state := TensorState.finish } @@ -247,11 +266,6 @@ class TensorCoreDecoupled( } } - io.initiate.ready := !busy - io.writeback.valid := (state === TensorState.finish) - io.writeback.bits.wid := warpReg - io.writeback.bits.last := false.B // TODO - // Writeback queues // ---------------- // These queues hold the metadata necessary for register @@ -328,7 +342,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) tensor.io.initiate.bits.wid := 0.U // FIXME tensor.io.writeback.ready := true.B - io.finished := tensor.io.writeback.valid + io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last } // a minimal Diplomacy graph with a tensor core and a TLRAM From 444dd5d7e1c54ab78111fdcfac9ebf3145809f02 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 14:25:38 -0700 Subject: [PATCH 16/47] tensor: Add destination reg to IO --- .../scala/radiance/core/TensorCoreDecoupled.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 7c07564..92f98b7 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -28,12 +28,14 @@ class TensorCoreDecoupled( val numWarps: Int, val numLanes: Int, val numSourceIds: Int, - val tilingParams: TensorTilingParams + val tilingParams: TensorTilingParams, + val numFPRegs: Int = 32 ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 val sourceWidth = log2Ceil(numSourceIds) val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 + val 
numFPRegBits = log2Ceil(numFPRegs) val io = IO(new Bundle { val initiate = Flipped(Decoupled(new Bundle { @@ -42,6 +44,7 @@ class TensorCoreDecoupled( val writeback = Decoupled(new Bundle { val last = Bool() val wid = UInt(numWarpBits.W) + val rd = UInt(numFPRegBits.W) val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W)) }) val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) @@ -218,8 +221,17 @@ class TensorCoreDecoupled( // FIXME: this need to change to dpu_fire val nextStepExecute = io.writeback.fire + def rdGen(set: UInt, step: UInt): UInt = { + // each step produces 4x4 output tile, written by 8 threads with 2 regs per + // thread + require(numLanes == 8, "currently assumes 8-wide warps") + (Cat(set, step) >> 1/*2 regs/thread*/) + // FIXME: add substep here + } + io.writeback.valid := bothQueueValid io.writeback.bits.wid := warpReg + io.writeback.bits.rd := rdGen(setExecute, stepExecute) io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) // FIXME: debug dummy: pipe A directly to writeback From 77dae3e1f9941d15c213b19a43cd82bd0e00c81c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 21:21:48 -0700 Subject: [PATCH 17/47] tensor: Write staging pipeline for A tile --- .../radiance/core/TensorCoreDecoupled.scala | 103 ++++++++++++++---- src/main/scala/radiance/core/TensorDPU.scala | 1 + 2 files changed, 83 insertions(+), 21 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 92f98b7..69b84f9 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -108,8 +108,12 @@ class TensorCoreDecoupled( // set and step being currently accessed in the acc/ex frontend val setAccess = RegInit(0.U(setBits.W)) val stepAccess = RegInit(0.U(stepBits.W)) + // we need full 4x4 A tile to fire DPU, but since the memory width is 8 + // words, we need 2 cycles to 
read A. `substep` tells which cycle we're at. + val substepAccess = RegInit(0.U(1.W)) dontTouch(setAccess) dontTouch(stepAccess) + dontTouch(substepAccess) when(io.initiate.fire) { val wid = io.initiate.bits.wid @@ -139,16 +143,19 @@ class TensorCoreDecoupled( class TensorMemTag extends Bundle { val set = UInt(setBits.W) val step = UInt(stepBits.W) + val substep = UInt(1.W) } // use concatenation of set/step as the memory request source. This will get // translated to the actual TL sourcewidth in sourceGen. val tag = Wire(new TensorMemTag) tag.set := setAccess tag.step := stepAccess + tag.substep := substepAccess val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) - Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach { + Seq((io.reqA, (io.respA, respATagged)), + (io.reqB, (io.respB, respBTagged))).foreach { case (req, (resp, respTagged)) => { val sourceGen = Module(new SourceGenerator( log2Ceil(numSourceIds), @@ -173,18 +180,22 @@ class TensorCoreDecoupled( } // only advance to the next step if we fired mem requests for both A and B + // TODO: @perf: too strict? should be able to have A and B progress + // separately val firedABReg = RegInit(VecInit(false.B, false.B)) val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map { case (req, fired) => { when (req.fire) { fired := true.B } } req.fire }) val firedAB = (firedABNow.asUInt | firedABReg.asUInt) - val nextStepAccess = firedAB.andR - // clear out firedABReg every step. 
this will overwrite the previous fired - // write upon the last fire out of A and B - when (nextStepAccess) { + val nextSubstepAccess = firedAB.andR + val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) + // clear out firedABReg every substep + when (nextSubstepAccess) { firedABReg := Seq(false.B, false.B) + substepAccess := substepAccess + 1.U } + require(substepAccess.widthOption.get == 1, "there should be only two substeps") // Execute stage // ------------- @@ -204,22 +215,72 @@ class TensorCoreDecoupled( io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") - val bothQueueValid = (respQueueA.valid && respQueueB.valid) - // assume in-order response and that A/B responses are always aligned; this - // might be too strong an assumption depending on the backing memory - when (bothQueueValid) { - assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && - (respQueueA.bits.tag.step === respQueueB.bits.tag.step), - "A and B response queue pointing to different set/steps. 
" ++ - "This might indicate memory response coming back out-of-order.") - } - // dequeue is synchronized between A and B // FIXME: this need to change to dpu_ready - val deqResp = bothQueueValid && io.writeback.ready - respQueueA.ready := deqResp - respQueueB.ready := deqResp - // FIXME: this need to change to dpu_fire - val nextStepExecute = io.writeback.fire + val dpuReady = io.writeback.ready // FIXME: this need be actual dpu + + val substepExecute = RegInit(0.U(1.W)) + when (respQueueA.fire) { + substepExecute := substepExecute + 1.U + } + dontTouch(substepExecute) + + // note combinationally coupled ready with `pipe` + val halfAQueue = Module(new Queue( + chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true + )) + halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U) + halfAQueue.io.enq.bits := respQueueA.bits.data + + // we need the full data for A because we divide the D tile by half along N; + // for B, the DPU can immediately start computing with a 4x2 tile. + // + // substep == 0 data goes to the LSB + val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits) + val fullAQueue = Module(new Queue( + chiselTypeOf(fullAEnqData), entries = 1, pipe = true + )) + // hold first half A data for the first substep + halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) && + fullAQueue.io.enq.ready + + require(fullAEnqData.widthOption.get == dataWidth * 2, + "assumes 2-cycle read for a full compute tile of A") + fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) && + halfAQueue.io.deq.valid + fullAQueue.io.enq.bits := fullAEnqData + + val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME? + val dpuFire = operandsValid && dpuReady + fullAQueue.io.deq.ready := dpuFire + val nextStepExecute = dpuFire + + // FIXME: need to hold A for two cycles!! 
+ + // make sure to dequeue from response queues only when both A and B valid + respQueueA.ready := MuxCase(false.B, + Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, + (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) + respQueueB.ready := dpuFire + dontTouch(respQueueA) + dontTouch(respQueueB) + + // assert that the A and B response queue heads always point to the same + // set/step/substep + // + // this assumes that memory responses come back in-order. this might be too + // strong an assumption depending on the backing memory + def assertAligned = { + val bothQueueValid = (respQueueA.valid && respQueueB.valid) + when (bothQueueValid && (substepExecute === 0.U)) { + assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && + (respQueueA.bits.tag.step === respQueueB.bits.tag.step), + "A and B response queue pointing to different set/steps. " ++ + "This might indicate memory response coming back out-of-order.") + } + dontTouch(respQueueA.bits.tag) + dontTouch(respQueueB.bits.tag) + } + assertAligned def rdGen(set: UInt, step: UInt): UInt = { // each step produces 4x4 output tile, written by 8 threads with 2 regs per @@ -229,7 +290,7 @@ class TensorCoreDecoupled( // FIXME: add substep here } - io.writeback.valid := bothQueueValid + io.writeback.valid := operandsValid // FIXME: bypass logic io.writeback.bits.wid := warpReg io.writeback.bits.rd := rdGen(setExecute, stepExecute) io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index 4e6cee7..a82bed7 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -27,6 +27,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar val b = Vec(dotProductDim, Bits((inFLen).W)) val c = Bits((outFLen).W) // note C has the out length for accumulation })) + // 'stall' is effectively out.ready, 
combinationally coupled to in.ready val stall = Input(Bool()) val out = Valid(new Bundle { val data = Bits((outFLen).W) From 6cad8edd1838642cbbb61ef6998c8318d96864e1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 22:01:02 -0700 Subject: [PATCH 18/47] tensor: Fix operand alignment in pipelining --- .../radiance/core/TensorCoreDecoupled.scala | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 69b84f9..0654df3 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -224,37 +224,51 @@ class TensorCoreDecoupled( } dontTouch(substepExecute) + // Do pipelining for the A operand so that we obtain the full 4x4 A tile + // ready for compute. The pipeline is two-stage: + // - stage one (halfAQueue) for assembling the full A tile from half-tiles + // coming from the resp queue, and + // - stage two (fullAQueue) for holding the full A tile until it gets + // matched with two 4x2 B tiles, and compute is complete. + // + // Note that the half-tile assembly is unnecessary for B since the B tile is + // only 4x2. + // Also send the set/step tag along the pipe for alignment check. + // note combinationally coupled ready with `pipe` val halfAQueue = Module(new Queue( - chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true + chiselTypeOf(respQueueA.bits), entries = 1, pipe = true )) halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U) - halfAQueue.io.enq.bits := respQueueA.bits.data + halfAQueue.io.enq.bits := respQueueA.bits - // we need the full data for A because we divide the D tile by half along N; - // for B, the DPU can immediately start computing with a 4x2 tile. 
- // // substep == 0 data goes to the LSB - val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits) + val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data) + require(fullAEnqData.widthOption.get == dataWidth * 2, + "assumes 2-cycle read for a full compute tile of A") + // only use the lower halfA's tag. substep will be incorrect. + val fullAEnqTag = halfAQueue.io.deq.bits.tag val fullAQueue = Module(new Queue( - chiselTypeOf(fullAEnqData), entries = 1, pipe = true + new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true )) // hold first half A data for the first substep halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) && fullAQueue.io.enq.ready - - require(fullAEnqData.widthOption.get == dataWidth * 2, - "assumes 2-cycle read for a full compute tile of A") fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) && halfAQueue.io.deq.valid - fullAQueue.io.enq.bits := fullAEnqData + fullAQueue.io.enq.bits.data := fullAEnqData + fullAQueue.io.enq.bits.tag := fullAEnqTag val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME? val dpuFire = operandsValid && dpuReady - fullAQueue.io.deq.ready := dpuFire - val nextStepExecute = dpuFire + val substepCompute = RegInit(0.U(1.W)) + when (dpuFire) { + substepCompute := substepCompute + 1.U + } - // FIXME: need to hold A for two cycles!! 
+ // hold full A until two-cycle compute is done + fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) + val nextStepExecute = dpuFire && (substepCompute === 1.U) // make sure to dequeue from response queues only when both A and B valid respQueueA.ready := MuxCase(false.B, @@ -264,21 +278,17 @@ class TensorCoreDecoupled( dontTouch(respQueueA) dontTouch(respQueueB) - // assert that the A and B response queue heads always point to the same - // set/step/substep + // assert that the DPU is computing with operands of the same set/step // // this assumes that memory responses come back in-order. this might be too // strong an assumption depending on the backing memory def assertAligned = { - val bothQueueValid = (respQueueA.valid && respQueueB.valid) - when (bothQueueValid && (substepExecute === 0.U)) { - assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && - (respQueueA.bits.tag.step === respQueueB.bits.tag.step), - "A and B response queue pointing to different set/steps. " ++ + when (dpuFire) { + assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) && + (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step), + "A and B operands are pointing to different set/steps. 
" ++ "This might indicate memory response coming back out-of-order.") } - dontTouch(respQueueA.bits.tag) - dontTouch(respQueueB.bits.tag) } assertAligned From 23edc34c7ebc28623a5961abc654d7f4049c4864 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 22:15:35 -0700 Subject: [PATCH 19/47] tensor: Add two TLRAM config for full throughput test --- .../radiance/core/TensorCoreDecoupled.scala | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 0654df3..154a3cf 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -155,8 +155,8 @@ class TensorCoreDecoupled( val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), - (io.reqB, (io.respB, respBTagged))).foreach { - case (req, (resp, respTagged)) => { + (io.reqB, (io.respB, respBTagged))).zipWithIndex.foreach { + case ((req, (resp, respTagged)), i) => { val sourceGen = Module(new SourceGenerator( log2Ceil(numSourceIds), metadata = Some(tag) @@ -165,7 +165,9 @@ class TensorCoreDecoupled( sourceGen.io.gen := req.fire sourceGen.io.meta := tag req.valid := genReq - req.bits.address := 0.U // FIXME + // FIXME: bogus address + // req.bits.address := (if (i == 0) 0.U else 0x100.U) // avoids bank conflict for A and B + req.bits.address := 0.U req.bits.source := sourceGen.io.id.bits sourceGen.io.reclaim.valid := resp.fire @@ -270,7 +272,8 @@ class TensorCoreDecoupled( fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) val nextStepExecute = dpuFire && (substepCompute === 1.U) - // make sure to dequeue from response queues only when both A and B valid + // respQueueA output arbitrates to either halfAQueue or fullAQueue depending + // on the substep 
respQueueA.ready := MuxCase(false.B, Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) @@ -446,10 +449,35 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule { } } +// two separate TLRAMs for A and B for full throughput +class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { + val tensor = LazyModule(new TensorCoreDecoupledTL) + val xbar = LazyModule(new TLXbar) + val ramA = LazyModule(new TLRAM( + address = AddressSet(0x000, 0xfffeff), + beatBytes = 32 // FIXME: hardcoded + )) + val ramB = LazyModule(new TLRAM( + address = AddressSet(0x100, 0xfffeff), + beatBytes = 32 // FIXME: hardcoded + )) + + xbar.node :=* tensor.node + ramA.node := xbar.node + ramB.node := xbar.node + + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with UnitTestModule { + tensor.module.io.start := io.start + io.finished := tensor.module.io.finished + } +} + // unit test harness class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module) + // val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module) + val dut = Module(LazyModule(new TensorCoreDecoupledTwoTLRAM).module) dut.io.start := io.start io.finished := dut.io.finished } From e1e3ac8274bd02954ff4d64ad9462ef4a8bb2f1b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 22:22:27 -0700 Subject: [PATCH 20/47] tensor: Fix busy state --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 154a3cf..652608b 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -128,7 +128,13 @@ class TensorCoreDecoupled( ) } } - 
when(io.writeback.fire) { + + // TODO: @perf: Instead of waiting until the last writeback, release busy as + // soon as the access frontend is complete so that there's a better chance to + // saturate the backend with back-to-back HGMMAs. This would require sending + // the 'wid' register to backend instead of having it shared with the + // frontend. + when(io.writeback.fire && io.writeback.bits.last) { busy := false.B } From 8847278ad1d54fe3167f01e0b9f70fcd3dd01096 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 14:37:33 -0700 Subject: [PATCH 21/47] tensor: Instantiate actual DPU --- .../radiance/core/TensorCoreDecoupled.scala | 93 +++++++++++++++---- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 652608b..b9695ad 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -33,8 +33,9 @@ class TensorCoreDecoupled( ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 + val wordSizeInBits = wordSize * 8 // TODO FP16 val sourceWidth = log2Ceil(numSourceIds) - val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 + val dataWidth = numLanes * wordSizeInBits // TODO FP16 val numFPRegBits = log2Ceil(numFPRegs) val io = IO(new Bundle { @@ -45,7 +46,7 @@ class TensorCoreDecoupled( val last = Bool() val wid = UInt(numWarpBits.W) val rd = UInt(numFPRegBits.W) - val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W)) + val data = Vec(numLanes, UInt((wordSizeInBits).W)) }) val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) @@ -223,9 +224,6 @@ class TensorCoreDecoupled( io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") - // FIXME: this need to change to dpu_ready - val 
dpuReady = io.writeback.ready // FIXME: this need be actual dpu - val substepExecute = RegInit(0.U(1.W)) when (respQueueA.fire) { substepExecute := substepExecute + 1.U @@ -267,7 +265,10 @@ class TensorCoreDecoupled( fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag - val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME? + val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid + val operandA = fullAQueue.io.deq.bits.data + val operandB = respQueueB.bits.data + val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val substepCompute = RegInit(0.U(1.W)) when (dpuFire) { @@ -301,6 +302,66 @@ class TensorCoreDecoupled( } assertAligned + // Dot-product unit + // + // 4x2 four-element DPUs summing up to 32 MACs in total + val dpus = Seq.fill(4)(Seq.fill(2)( + Module(new TensorDotProductUnit(half = false)) + )) + // operandA is 4x4 in K-major + val operandADimensional = + operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + .grouped(4).toSeq + println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits") + println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}") + assert(operandADimensional.length == tilingParams.mc && + operandADimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") + // operandB is 2x4, i.e. 
4x2 in N-major + val operandBDimensional = + operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + .grouped(4).toSeq + println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}") + val ncSubstep = tilingParams.nc / 2 + assert(tilingParams.mc * ncSubstep == numLanes, + "substep tile size doesn't match writeback throughput") + assert(operandBDimensional.length == ncSubstep && + operandBDimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") + + for (m <- 0 until tilingParams.mc) { + for (n <- 0 until ncSubstep) { + dpus(m)(n).io.in.valid := dpuFire + dpus(m)(n).io.in.bits.a := operandADimensional(m) + dpus(m)(n).io.in.bits.b := operandBDimensional(n) + dpus(m)(n).io.in.bits.c := 0.U // FIXME: bogus accum data + // dpu ready couples with writeback backpressure + dpus(m)(n).io.stall := !io.writeback.ready + } + } + dpuReady := !dpus(0)(0).io.stall + dontTouch(dpuFire) + dontTouch(dpuReady) + + val dpuValids = dpus.flatMap(_.map(_.io.out.valid)) + val dpuValid = dpuValids.reduce(_ && _) + def assertDPU = { + val dpuStalls = dpus.flatMap(_.map(_.io.stall)) + assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _), + "stall signals of DPUs went unaligned") + assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _), + "valid signals of DPUs went unaligned") + } + assertDPU + + // flatten DPU output into 1D array in M-major order + val flattenedDPUOut = (0 until ncSubstep).flatMap { n => + (0 until tilingParams.mc).map { m => + dpus(m)(n).io.out.bits.data + } + } + io.writeback.bits.data := flattenedDPUOut + def rdGen(set: UInt, step: UInt): UInt = { // each step produces 4x4 output tile, written by 8 threads with 2 regs per // thread @@ -309,19 +370,11 @@ class TensorCoreDecoupled( // FIXME: add substep here } - io.writeback.valid := operandsValid // FIXME: bypass logic + io.writeback.valid := dpuValid io.writeback.bits.wid := warpReg io.writeback.bits.rd := rdGen(setExecute, 
stepExecute) io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) - // FIXME: debug dummy: pipe A directly to writeback - val groupedRespA = respQueueA.bits.data - .asBools.grouped(wordSize * 8/*bits*/) - .map(VecInit(_).asUInt) - (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) => - wb := data - } - // State transition // ---------------- // @@ -400,7 +453,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) val tensor = Module(new TensorCoreDecoupled( 8, 8, outer.numSrcIds , TensorTilingParams())) - val wordSize = 4 // FIXME: hardcoded + val wordSize = 4 // @cleanup: hardcoded val zip = Seq((outer.node.out(0), tensor.io.reqA), (outer.node.out(1), tensor.io.reqB)) @@ -431,7 +484,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) tlOutB.d.ready := tensor.io.respB.ready tensor.io.initiate.valid := io.start - tensor.io.initiate.bits.wid := 0.U // FIXME + tensor.io.initiate.bits.wid := 0.U // TODO tensor.io.writeback.ready := true.B io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last @@ -443,7 +496,7 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule { val xbar = LazyModule(new TLXbar) val ram = LazyModule(new TLRAM( address = AddressSet(0x0000, 0xffffff), - beatBytes = 32 // FIXME: hardcoded + beatBytes = 32 // @cleanup: hardcoded )) ram.node :=* xbar.node :=* tensor.node @@ -461,11 +514,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { val xbar = LazyModule(new TLXbar) val ramA = LazyModule(new TLRAM( address = AddressSet(0x000, 0xfffeff), - beatBytes = 32 // FIXME: hardcoded + beatBytes = 32 // @cleanup: hardcoded )) val ramB = LazyModule(new TLRAM( address = AddressSet(0x100, 0xfffeff), - beatBytes = 32 // FIXME: hardcoded + beatBytes = 32 // @cleanup: hardcoded )) xbar.node :=* tensor.node From 7de8e86d4f04712f90c4457940c02a341b721f76 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 15:18:47 -0700 
Subject: [PATCH 22/47] tensor: Sync rd with DPU using a queue --- .../radiance/core/TensorCoreDecoupled.scala | 44 ++++++++++++------- src/main/scala/radiance/core/TensorDPU.scala | 2 +- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index b9695ad..92a6596 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -270,6 +270,8 @@ class TensorCoreDecoupled( val operandB = respQueueB.bits.data val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady + val setCompute = fullAQueue.io.deq.bits.tag.set + val stepCompute = fullAQueue.io.deq.bits.tag.step val substepCompute = RegInit(0.U(1.W)) when (dpuFire) { substepCompute := substepCompute + 1.U @@ -348,9 +350,9 @@ class TensorCoreDecoupled( def assertDPU = { val dpuStalls = dpus.flatMap(_.map(_.io.stall)) assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _), - "stall signals of DPUs went unaligned") + "stall signals of DPUs went out of sync") assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _), - "valid signals of DPUs went unaligned") + "valid signals of DPUs went out of sync") } assertDPU @@ -362,17 +364,36 @@ class TensorCoreDecoupled( } io.writeback.bits.data := flattenedDPUOut - def rdGen(set: UInt, step: UInt): UInt = { + // Writeback queues + // ---------------- + // These queues hold metadata needed for writeback in sync with the DPU. 
+ + val queueDepth = 4 // needs to be at least the DPU latency + val rdQueue = Module(new Queue( + chiselTypeOf(io.writeback.bits.rd), queueDepth + )) + rdQueue.io.enq.valid := dpuFire + rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute) + rdQueue.io.deq.ready := io.writeback.fire + assert(rdQueue.io.enq.ready === true.B, + "rd queue full, throttling DPU operation") + assert(!dpuValid || rdQueue.io.deq.valid, + "rd queue and DPU went out of sync") + + // TODO: decouple wid from frontend + // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) + + // note rd is independent to sets + def rdGen(step: UInt, substep: UInt): UInt = { // each step produces 4x4 output tile, written by 8 threads with 2 regs per // thread - require(numLanes == 8, "currently assumes 8-wide warps") - (Cat(set, step) >> 1/*2 regs/thread*/) - // FIXME: add substep here + (step << 1/*2 substeps*/) + substep } io.writeback.valid := dpuValid io.writeback.bits.wid := warpReg - io.writeback.bits.rd := rdGen(setExecute, stepExecute) + io.writeback.bits.rd := rdQueue.io.deq.bits + // FIXME: look at set/step of dpu output not setExecute io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) // State transition @@ -410,15 +431,6 @@ class TensorCoreDecoupled( } } } - - // Writeback queues - // ---------------- - // These queues hold the metadata necessary for register - // writeback. 
- - // val queueDepth = 2 - // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) - // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) } // synthesizable unit tests diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index a82bed7..515b1bf 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -53,7 +53,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar io.out.bits.data := ieee(box(dpu.io.out.bits.data, S)) } -// Copied from chisel3.util.Pipe. +// An implementation of chisel3.util.Pipe that supports stalls. class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module { /** A non-ambiguous name of this `StallingPipe` for use in generated Verilog * names. Includes the latency cycle count in the name as well as the From 2741af0b2b36026cfe57ca227eb469d6643d4c12 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 15:43:44 -0700 Subject: [PATCH 23/47] tensor: Keep set/step in the tag writeback queue --- .../radiance/core/TensorCoreDecoupled.scala | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 92a6596..3d00c35 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -267,6 +267,7 @@ class TensorCoreDecoupled( val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid val operandA = fullAQueue.io.deq.bits.data + val operandATag = fullAQueue.io.deq.bits.tag val operandB = respQueueB.bits.data val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady @@ -314,8 +315,6 @@ class TensorCoreDecoupled( val operandADimensional = operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4).toSeq - 
println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits") - println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}") assert(operandADimensional.length == tilingParams.mc && operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") @@ -323,7 +322,6 @@ class TensorCoreDecoupled( val operandBDimensional = operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4).toSeq - println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}") val ncSubstep = tilingParams.nc / 2 assert(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") @@ -369,18 +367,20 @@ class TensorCoreDecoupled( // These queues hold metadata needed for writeback in sync with the DPU. val queueDepth = 4 // needs to be at least the DPU latency - val rdQueue = Module(new Queue( - chiselTypeOf(io.writeback.bits.rd), queueDepth + val tagQueue = Module(new Queue( + chiselTypeOf(operandATag), queueDepth )) - rdQueue.io.enq.valid := dpuFire - rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute) - rdQueue.io.deq.ready := io.writeback.fire - assert(rdQueue.io.enq.ready === true.B, - "rd queue full, throttling DPU operation") - assert(!dpuValid || rdQueue.io.deq.valid, - "rd queue and DPU went out of sync") + tagQueue.io.enq.valid := dpuFire + // A and B should have the same tags + tagQueue.io.enq.bits := operandATag + // @cleanup: awkward + tagQueue.io.enq.bits.substep := substepCompute + tagQueue.io.deq.ready := io.writeback.fire + assert(tagQueue.io.enq.ready === true.B, + "tag queue full, DPU operation might be throttled") + assert(!dpuValid || tagQueue.io.deq.valid, + "tag queue and DPU went out of sync") - // TODO: decouple wid from frontend // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) // note rd is independent to sets @@ -390,11 +390,14 @@ class TensorCoreDecoupled( (step << 1/*2 substeps*/) + substep } + val 
setWriteback = tagQueue.io.deq.bits.set + val stepWriteback = tagQueue.io.deq.bits.step + val substepWriteback = tagQueue.io.deq.bits.substep io.writeback.valid := dpuValid + // TODO: decouple wid from frontend io.writeback.bits.wid := warpReg - io.writeback.bits.rd := rdQueue.io.deq.bits - // FIXME: look at set/step of dpu output not setExecute - io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) + io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback) + io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) // State transition // ---------------- @@ -500,6 +503,10 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) tensor.io.writeback.ready := true.B io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last + when (io.finished) { + // might be too strong + assert(tensor.io.writeback.bits.rd === 31.U) + } } // a minimal Diplomacy graph with a tensor core and a TLRAM From a2519da58fe1397a7570656a4726f42693e8d845 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 16:36:18 -0700 Subject: [PATCH 24/47] tensor: SMEM address generation --- .../radiance/core/TensorCoreDecoupled.scala | 51 +++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 3d00c35..f7c8547 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -159,6 +159,48 @@ class TensorCoreDecoupled( tag.step := stepAccess tag.substep := substepAccess + // @cleanup: generalize in terms of M/N/K-majorness? 
+ def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt) + : (UInt/*A*/, UInt/*B*/) = { + // note that step iterates along N first, then M + val numComputeTilesM = tilingParams.m / tilingParams.mc + val numComputeTilesN = tilingParams.n / tilingParams.nc + val tileM = step % numComputeTilesM.U + val tileN = step / numComputeTilesM.U + val mcSubstep = tilingParams.mc / 2 + val ncSubstep = tilingParams.nc / 2 + + // note that both A and B are K-major to facilitate bank conflict-free SMEM + // accesses + // + // (row,col) coordinate of the compute tile + val tileRowA = tileM // M + val tileColA = set // K + val tileRowB = tileN // N + val tileColB = set // K + // (row,col) coordinate of the starting element of the compute tile + val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) + + (substep << log2Ceil(mcSubstep)) + val elemColA = tileColA << log2Ceil(tilingParams.kc) + val elemRowB = tileRowB << log2Ceil(tilingParams.nc) + (substep << log2Ceil(ncSubstep)) + val elemColB = tileColB << log2Ceil(tilingParams.kc) + val rowStrideA = wordSize * tilingParams.k + val rowStrideABits = log2Ceil(rowStrideA) + val rowStrideB = wordSize * tilingParams.k + val rowStrideBBits = log2Ceil(rowStrideB) + val wordStrideBits = log2Ceil(wordSize) + + val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits) + val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits) + + (baseA + tileOffsetA, baseB + tileOffsetB) + } + + // FIXME: bogus base address + val (addressA, addressB) = + addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), @@ -172,9 +214,7 @@ class TensorCoreDecoupled( sourceGen.io.gen := req.fire sourceGen.io.meta := tag req.valid := genReq - // FIXME: bogus address - // req.bits.address := (if (i == 0) 0.U else 0x100.U) // 
avoids bank conflict for A and B - req.bits.address := 0.U + req.bits.address := (if (i == 0) addressA else addressB) req.bits.source := sourceGen.io.id.bits sourceGen.io.reclaim.valid := resp.fire @@ -366,7 +406,7 @@ class TensorCoreDecoupled( // ---------------- // These queues hold metadata needed for writeback in sync with the DPU. - val queueDepth = 4 // needs to be at least the DPU latency + val queueDepth = 6 // needs to be at least the DPU latency val tagQueue = Module(new Queue( chiselTypeOf(operandATag), queueDepth )) @@ -397,7 +437,8 @@ class TensorCoreDecoupled( // TODO: decouple wid from frontend io.writeback.bits.wid := warpReg io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback) - io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) + io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) && + (substepWriteback === 1.U) // State transition // ---------------- From 64ea48ace3681e0a74a732fb4da006717e62b873 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 13:46:04 -0700 Subject: [PATCH 25/47] tensor: Consider data reuse for B memory request B is reused every 4 steps because of the k->i->j iteration order. 
--- .../radiance/core/TensorCoreDecoupled.scala | 111 ++++++++++-------- 1 file changed, 62 insertions(+), 49 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index f7c8547..897edb2 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -145,8 +145,6 @@ class TensorCoreDecoupled( // Memory traffic generation // ------------------------- // - val genReq = (state === TensorState.run) - class TensorMemTag extends Bundle { val set = UInt(setBits.W) val step = UInt(stepBits.W) @@ -159,16 +157,14 @@ class TensorCoreDecoupled( tag.step := stepAccess tag.substep := substepAccess + val numTilesM = tilingParams.m / tilingParams.mc + val numTilesN = tilingParams.n / tilingParams.nc // @cleanup: generalize in terms of M/N/K-majorness? def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt) : (UInt/*A*/, UInt/*B*/) = { // note that step iterates along N first, then M - val numComputeTilesM = tilingParams.m / tilingParams.mc - val numComputeTilesN = tilingParams.n / tilingParams.nc - val tileM = step % numComputeTilesM.U - val tileN = step / numComputeTilesM.U - val mcSubstep = tilingParams.mc / 2 - val ncSubstep = tilingParams.nc / 2 + val tileM = step % numTilesM.U + val tileN = step / numTilesM.U // note that both A and B are K-major to facilitate bank conflict-free SMEM // accesses @@ -180,11 +176,11 @@ class TensorCoreDecoupled( val tileColB = set // K // (row,col) coordinate of the starting element of the compute tile val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) + - (substep << log2Ceil(mcSubstep)) - val elemColA = tileColA << log2Ceil(tilingParams.kc) - val elemRowB = tileRowB << log2Ceil(tilingParams.nc) - (substep << log2Ceil(ncSubstep)) - val elemColB = tileColB << log2Ceil(tilingParams.kc) + (substep << log2Ceil(tilingParams.mc / 2)) + val elemColA = tileColA << 
log2Ceil(tilingParams.kc) + val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) + + (substep << log2Ceil(tilingParams.nc / 2)) + val elemColB = tileColB << log2Ceil(tilingParams.kc) val rowStrideA = wordSize * tilingParams.k val rowStrideABits = log2Ceil(rowStrideA) val rowStrideB = wordSize * tilingParams.k @@ -201,6 +197,13 @@ class TensorCoreDecoupled( val (addressA, addressB) = addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + val genReqA = (state === TensorState.run) + val numTilesMBits = log2Ceil(numTilesM) + // generate B request at every 4 steps. B achieves reuse through outer + // product so it doesn't require access at every step + val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U + val genReqB = (state === TensorState.run) && shouldFireB + val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), @@ -213,7 +216,7 @@ class TensorCoreDecoupled( sourceGen.io.gen := req.fire sourceGen.io.meta := tag - req.valid := genReq + req.valid := (if (i == 0) genReqA else genReqB) req.bits.address := (if (i == 0) addressA else addressB) req.bits.source := sourceGen.io.id.bits @@ -228,23 +231,27 @@ class TensorCoreDecoupled( } } - // only advance to the next step if we fired mem requests for both A and B - // TODO: @perf: too strict? should be able to have A and B progress - // separately - val firedABReg = RegInit(VecInit(false.B, false.B)) - val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map { - case (req, fired) => { when (req.fire) { fired := true.B } } - req.fire - }) - val firedAB = (firedABNow.asUInt | firedABReg.asUInt) - val nextSubstepAccess = firedAB.andR + // only advance to the next step if we fired mem requests for both A and B. + // also consider that B doesn't have to be fired every time due to reuse. + // @perf: too strict? 
should be able to have A and B progress separately + val firedAReg = RegInit(false.B) + val firedBReg = RegInit(false.B) + when (io.reqA.fire) { firedAReg := true.B } + when (io.reqB.fire) { firedBReg := true.B } + val firedANow = io.reqA.fire + val firedBNow = io.reqB.fire + val firedA = firedAReg || firedANow + val firedB = firedBReg || firedBNow + val nextSubstepAccess = firedA && (!shouldFireB || firedB) val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) // clear out firedABReg every substep when (nextSubstepAccess) { - firedABReg := Seq(false.B, false.B) + firedAReg := false.B + firedBReg := false.B substepAccess := substepAccess + 1.U } require(substepAccess.widthOption.get == 1, "there should be only two substeps") + dontTouch(shouldFireB) // Execute stage // ------------- @@ -327,18 +334,26 @@ class TensorCoreDecoupled( respQueueA.ready := MuxCase(false.B, Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) - respQueueB.ready := dpuFire + // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when + // we fully iterated a column (M-dimension). + val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U + val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask + respQueueB.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) + dontTouch(shouldDequeueB) - // assert that the DPU is computing with operands of the same set/step + // Assert that the DPU is computing with operands of the same set/step. Note + // that the B resp will only have step values multiple of 4 due to reuse. // - // this assumes that memory responses come back in-order. this might be too - // strong an assumption depending on the backing memory + // This check assumes that memory responses come back in-order. Might be too + // strong of an assumption depending on the backing memory. 
def assertAligned = { + val stepMask = (1 << numTilesMBits).U when (dpuFire) { assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) && - (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step), + ((fullAQueue.io.deq.bits.tag.step & stepMask) === + (respQueueB.bits.tag.step & stepMask)), "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } @@ -348,26 +363,26 @@ class TensorCoreDecoupled( // Dot-product unit // // 4x2 four-element DPUs summing up to 32 MACs in total - val dpus = Seq.fill(4)(Seq.fill(2)( + val ncSubstep = tilingParams.nc / 2 + val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)( Module(new TensorDotProductUnit(half = false)) )) // operandA is 4x4 in K-major val operandADimensional = operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4).toSeq - assert(operandADimensional.length == tilingParams.mc && - operandADimensional(0).length == tilingParams.kc, - "operand width doesn't agree with tiling parameter") - // operandB is 2x4, i.e. 
4x2 in N-major + .grouped(4/*k-dim*/).toSeq + require(operandADimensional.length == tilingParams.mc && + operandADimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") + // operandB is 2x4 in K-major val operandBDimensional = operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4).toSeq - val ncSubstep = tilingParams.nc / 2 - assert(tilingParams.mc * ncSubstep == numLanes, - "substep tile size doesn't match writeback throughput") - assert(operandBDimensional.length == ncSubstep && - operandBDimensional(0).length == tilingParams.kc, - "operand width doesn't agree with tiling parameter") + .grouped(4/*k-dim*/).toSeq + require(tilingParams.mc * ncSubstep == numLanes, + "substep tile size doesn't match writeback throughput") + require(operandBDimensional.length == ncSubstep && + operandBDimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") for (m <- 0 until tilingParams.mc) { for (n <- 0 until ncSubstep) { @@ -406,10 +421,8 @@ class TensorCoreDecoupled( // ---------------- // These queues hold metadata needed for writeback in sync with the DPU. 
- val queueDepth = 6 // needs to be at least the DPU latency - val tagQueue = Module(new Queue( - chiselTypeOf(operandATag), queueDepth - )) + val queueDepth = 5 // needs to be at least the DPU latency + val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth)) tagQueue.io.enq.valid := dpuFire // A and B should have the same tags tagQueue.io.enq.bits := operandATag @@ -573,11 +586,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { val tensor = LazyModule(new TensorCoreDecoupledTL) val xbar = LazyModule(new TLXbar) val ramA = LazyModule(new TLRAM( - address = AddressSet(0x000, 0xfffeff), + address = AddressSet(0x000, 0xfffbff), beatBytes = 32 // @cleanup: hardcoded )) val ramB = LazyModule(new TLRAM( - address = AddressSet(0x100, 0xfffeff), + address = AddressSet(0x400, 0xfffbff), beatBytes = 32 // @cleanup: hardcoded )) From c2f39f74749df7fac8ba63d8900d6651eea72f71 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 16:21:43 -0700 Subject: [PATCH 26/47] tensor: Rename substepExecute --- .../radiance/core/TensorCoreDecoupled.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 897edb2..f7c6c63 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -269,13 +269,13 @@ class TensorCoreDecoupled( require(respQueueA.bits.data.widthOption.get == io.writeback.bits.data.widthOption.get, - "response data width does not match the writeback data width") + "response data width does not match the writeback data width") - val substepExecute = RegInit(0.U(1.W)) + val substepDeqA = RegInit(0.U(1.W)) when (respQueueA.fire) { - substepExecute := substepExecute + 1.U + substepDeqA := substepDeqA + 1.U } - dontTouch(substepExecute) + dontTouch(substepDeqA) // Do pipelining for the A operand so 
that we obtain the full 4x4 A tile // ready for compute. The pipeline is two-stage: @@ -292,7 +292,7 @@ class TensorCoreDecoupled( val halfAQueue = Module(new Queue( chiselTypeOf(respQueueA.bits), entries = 1, pipe = true )) - halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U) + halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U) halfAQueue.io.enq.bits := respQueueA.bits // substep == 0 data goes to the LSB @@ -305,9 +305,9 @@ class TensorCoreDecoupled( new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true )) // hold first half A data for the first substep - halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) && + halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) && fullAQueue.io.enq.ready - fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) && + fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) && halfAQueue.io.deq.valid fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag @@ -332,8 +332,8 @@ class TensorCoreDecoupled( // respQueueA output arbitrates to either halfAQueue or fullAQueue depending // on the substep respQueueA.ready := MuxCase(false.B, - Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, - (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) + Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready, + (substepDeqA === 1.U) -> fullAQueue.io.enq.ready)) // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). 
val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U From 91d9897c277a1f0ab4e678bdd91f21eef7ac380d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 17:17:41 -0700 Subject: [PATCH 27/47] tensor: Write FillBuffer for tile buffering --- .../radiance/core/TensorCoreDecoupled.scala | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index f7c6c63..e70e59f 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -5,6 +5,7 @@ package radiance.core import chisel3._ import chisel3.util._ +import chisel3.experimental.requireIsChiselType import org.chipsalliance.cde.config.Parameters import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink._ @@ -312,10 +313,17 @@ class TensorCoreDecoupled( fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag - val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid + val fillBufB = Module(new FillBuffer( + chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ + )) + fillBufB.io.enq.valid := respQueueB.valid + fillBufB.io.enq.bits := respQueueB.bits.data + respQueueB.ready := fillBufB.io.enq.ready + + val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid val operandA = fullAQueue.io.deq.bits.data val operandATag = fullAQueue.io.deq.bits.tag - val operandB = respQueueB.bits.data + val operandB = fillBufB.io.deq.bits val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val setCompute = fullAQueue.io.deq.bits.tag.set @@ -338,7 +346,7 @@ class TensorCoreDecoupled( // we fully iterated a column (M-dimension). 
val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask - respQueueB.ready := dpuFire && shouldDequeueB + fillBufB.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) dontTouch(shouldDequeueB) @@ -375,8 +383,11 @@ class TensorCoreDecoupled( operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") // operandB is 2x4 in K-major + // val operandBDimensional = + // operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + // .grouped(4/*k-dim*/).toSeq val operandBDimensional = - operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq require(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") @@ -490,6 +501,37 @@ class TensorCoreDecoupled( } } +// A buffer that collects multiple entries of input data and exposes the +// coalesced data as output. Effectively acts as a width-widening +// chisel.util.Pipe. 
+class FillBuffer[T <: Data]( + gen: T, + entries: Int +) extends Module { + require(entries > 0, "FillBuffer must have a positive number of entries") + requireIsChiselType(gen) + + val io = IO(new Bundle { + val enq = Flipped(Decoupled(gen)) + val deq = Decoupled(Vec(entries, gen)) + }) + + val data = Reg(Vec(entries, gen)) + val ptr = Counter(entries + 1) + val full = (ptr.value === entries.U) + io.enq.ready := !full + when (io.enq.fire) { + data(ptr.value) := io.enq.bits + ptr.inc() + } + io.deq.valid := full + (io.deq.bits zip data).foreach { case (io, d) => io := d } + when (io.deq.fire) { + assert(ptr.value === entries.U, "FillBuffer fired before buffer was full") + ptr.reset() + } +} + // synthesizable unit tests // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy From 7fab6f89ad3e99de20e4aa5972be745c720b1e70 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 17:33:55 -0700 Subject: [PATCH 28/47] tensor: Properly route FillBuffer to DPU --- .../radiance/core/TensorCoreDecoupled.scala | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index e70e59f..206250e 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -313,17 +313,24 @@ class TensorCoreDecoupled( fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag - val fillBufB = Module(new FillBuffer( + // serialize every two B responses into one full 4x4 B tile + // FIXME: do the same for A + val fullB = Module(new FillBuffer( chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ )) - fillBufB.io.enq.valid := respQueueB.valid - fillBufB.io.enq.bits := respQueueB.bits.data - respQueueB.ready := fillBufB.io.enq.ready + fullB.io.enq.valid := respQueueB.valid + fullB.io.enq.bits := respQueueB.bits.data + respQueueB.ready := 
fullB.io.enq.ready + val fullBTag = Module(new Queue( + new TensorMemTag, entries = 1, pipe = true + )) + fullBTag.io.enq.valid := respQueueB.valid + fullBTag.io.enq.bits := respQueueB.bits.tag - val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid + val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid val operandA = fullAQueue.io.deq.bits.data val operandATag = fullAQueue.io.deq.bits.tag - val operandB = fillBufB.io.deq.bits + val operandB = fullB.io.deq.bits val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val setCompute = fullAQueue.io.deq.bits.tag.set @@ -333,10 +340,6 @@ class TensorCoreDecoupled( substepCompute := substepCompute + 1.U } - // hold full A until two-cycle compute is done - fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) - val nextStepExecute = dpuFire && (substepCompute === 1.U) - // respQueueA output arbitrates to either halfAQueue or fullAQueue depending // on the substep respQueueA.ready := MuxCase(false.B, @@ -345,12 +348,19 @@ class TensorCoreDecoupled( // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U - val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask - fillBufB.io.deq.ready := dpuFire && shouldDequeueB + val shouldDequeueB = + ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) && + (substepCompute === 1.U) + fullB.io.deq.ready := dpuFire && shouldDequeueB + fullBTag.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) dontTouch(shouldDequeueB) + // hold full A until two-cycle compute is done + fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) + val nextStepExecute = dpuFire && (substepCompute === 1.U) + // Assert that the DPU is computing with operands of the same set/step. Note // that the B resp will only have step values multiple of 4 due to reuse. 
// @@ -359,9 +369,9 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) && + assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && ((fullAQueue.io.deq.bits.tag.step & stepMask) === - (respQueueB.bits.tag.step & stepMask)), + (fullBTag.io.deq.bits.step & stepMask)), "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } @@ -387,7 +397,7 @@ class TensorCoreDecoupled( // operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq // .grouped(4/*k-dim*/).toSeq val operandBDimensional = - operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq require(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") From c4b5a11fdefbbfbe73b765bb1feece25d2a1d3f1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 19:54:20 -0700 Subject: [PATCH 29/47] tensor: Replace staging logic for A with FillBuffer --- .../radiance/core/TensorCoreDecoupled.scala | 77 ++++++++----------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 206250e..deb4dc1 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -272,46 +272,41 @@ class TensorCoreDecoupled( io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") + // FIXME: unnecessary val substepDeqA = RegInit(0.U(1.W)) when (respQueueA.fire) { substepDeqA := substepDeqA + 1.U } dontTouch(substepDeqA) - // Do pipelining for the A operand so that we obtain the full 4x4 A tile - // ready 
for compute. The pipeline is two-stage: - // - stage one (halfAQueue) for assembling the full A tile from half-tiles - // coming from the resp queue, and - // - stage two (fullAQueue) for holding the full A tile until it gets - // matched with two 4x2 B tiles, and compute is complete. - // - // Note that the half-tile assembly is unnecessary for B since the B tile is - // only 4x2. - // Also send the set/step tag along the pipe for alignment check. + // Stage the operands in a pipeline so that we obtain the full 4x4 tiles + // ready for compute. Also send the set/step tag along the pipe for + // alignment check. - // note combinationally coupled ready with `pipe` - val halfAQueue = Module(new Queue( - chiselTypeOf(respQueueA.bits), entries = 1, pipe = true + val fullA = Module(new FillBuffer( + chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ )) - halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U) - halfAQueue.io.enq.bits := respQueueA.bits + fullA.io.enq.valid := respQueueA.valid + fullA.io.enq.bits := respQueueA.bits.data + respQueueA.ready := fullA.io.enq.ready + // `pipe` combinationally couples enq-deq ready + val fullATag = Module(new Queue( + new TensorMemTag, entries = 1, pipe = true + )) + fullATag.io.enq.valid := respQueueA.valid + fullATag.io.enq.bits := respQueueA.bits.tag - // substep == 0 data goes to the LSB - val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data) - require(fullAEnqData.widthOption.get == dataWidth * 2, - "assumes 2-cycle read for a full compute tile of A") - // only use the lower halfA's tag. substep will be incorrect. - val fullAEnqTag = halfAQueue.io.deq.bits.tag - val fullAQueue = Module(new Queue( + // stage the full A tile once more so that FillBuffer can be filled up in the + // background while the tile is being used for compute. This does come with + // capacity overhead. 
+ val fullABuf = Module(new Queue( new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true )) - // hold first half A data for the first substep - halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) && - fullAQueue.io.enq.ready - fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) && - halfAQueue.io.deq.valid - fullAQueue.io.enq.bits.data := fullAEnqData - fullAQueue.io.enq.bits.tag := fullAEnqTag + fullABuf.io.enq.valid := fullA.io.deq.valid + fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt + fullABuf.io.enq.bits.tag := fullATag.io.deq.bits + fullA.io.deq.ready := fullABuf.io.enq.ready + fullATag.io.deq.ready := fullABuf.io.enq.ready // serialize every two B responses into one full 4x4 B tile // FIXME: do the same for A @@ -327,29 +322,24 @@ class TensorCoreDecoupled( fullBTag.io.enq.valid := respQueueB.valid fullBTag.io.enq.bits := respQueueB.bits.tag - val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid - val operandA = fullAQueue.io.deq.bits.data - val operandATag = fullAQueue.io.deq.bits.tag + val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid + val operandA = fullABuf.io.deq.bits.data + val operandATag = fullABuf.io.deq.bits.tag val operandB = fullB.io.deq.bits val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady - val setCompute = fullAQueue.io.deq.bits.tag.set - val stepCompute = fullAQueue.io.deq.bits.tag.step + val setCompute = fullABuf.io.deq.bits.tag.set + val stepCompute = fullABuf.io.deq.bits.tag.step val substepCompute = RegInit(0.U(1.W)) when (dpuFire) { substepCompute := substepCompute + 1.U } - // respQueueA output arbitrates to either halfAQueue or fullAQueue depending - // on the substep - respQueueA.ready := MuxCase(false.B, - Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready, - (substepDeqA === 1.U) -> fullAQueue.io.enq.ready)) // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). 
val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = - ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) && + ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) && (substepCompute === 1.U) fullB.io.deq.ready := dpuFire && shouldDequeueB fullBTag.io.deq.ready := dpuFire && shouldDequeueB @@ -358,7 +348,8 @@ class TensorCoreDecoupled( dontTouch(shouldDequeueB) // hold full A until two-cycle compute is done - fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) + fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) + // FIXME: this should be nextStepCompute val nextStepExecute = dpuFire && (substepCompute === 1.U) // Assert that the DPU is computing with operands of the same set/step. Note @@ -369,8 +360,8 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && - ((fullAQueue.io.deq.bits.tag.step & stepMask) === + assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && + ((fullABuf.io.deq.bits.tag.step & stepMask) === (fullBTag.io.deq.bits.step & stepMask)), "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") From 93c9bcc32f5b516f3bd51990ff60e22e0348f409 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 20:12:15 -0700 Subject: [PATCH 30/47] tensor: Stage B as well for full throughput --- .../radiance/core/TensorCoreDecoupled.scala | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index deb4dc1..90cb785 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -300,10 +300,13 @@ class TensorCoreDecoupled( // background while the tile is being used for compute. 
This does come with // capacity overhead. val fullABuf = Module(new Queue( - new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true + new Bundle { + val data = chiselTypeOf(fullA.io.deq.bits) + val tag = new TensorMemTag + }, entries = 1, pipe = true )) fullABuf.io.enq.valid := fullA.io.deq.valid - fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt + fullABuf.io.enq.bits.data := fullA.io.deq.bits fullABuf.io.enq.bits.tag := fullATag.io.deq.bits fullA.io.deq.ready := fullABuf.io.enq.ready fullATag.io.deq.ready := fullABuf.io.enq.ready @@ -322,10 +325,22 @@ class TensorCoreDecoupled( fullBTag.io.enq.valid := respQueueB.valid fullBTag.io.enq.bits := respQueueB.bits.tag - val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid + val fullBBuf = Module(new Queue( + new Bundle { + val data = chiselTypeOf(fullB.io.deq.bits) + val tag = new TensorMemTag + }, entries = 1, pipe = true + )) + fullBBuf.io.enq.valid := fullB.io.deq.valid + fullBBuf.io.enq.bits.data := fullB.io.deq.bits + fullBBuf.io.enq.bits.tag := fullBTag.io.deq.bits + fullB.io.deq.ready := fullBBuf.io.enq.ready + fullBTag.io.deq.ready := fullBBuf.io.enq.ready + + val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid val operandA = fullABuf.io.deq.bits.data val operandATag = fullABuf.io.deq.bits.tag - val operandB = fullB.io.deq.bits + val operandB = fullBBuf.io.deq.bits.data val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val setCompute = fullABuf.io.deq.bits.tag.set @@ -335,20 +350,19 @@ class TensorCoreDecoupled( substepCompute := substepCompute + 1.U } + // hold full A until two-cycle compute is done + fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). 
val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) && (substepCompute === 1.U) - fullB.io.deq.ready := dpuFire && shouldDequeueB - fullBTag.io.deq.ready := dpuFire && shouldDequeueB + fullBBuf.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) dontTouch(shouldDequeueB) - // hold full A until two-cycle compute is done - fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) // FIXME: this should be nextStepCompute val nextStepExecute = dpuFire && (substepCompute === 1.U) @@ -360,9 +374,9 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && + assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) && ((fullABuf.io.deq.bits.tag.step & stepMask) === - (fullBTag.io.deq.bits.step & stepMask)), + (fullBBuf.io.deq.bits.tag.step & stepMask)), "A and B operands are pointing to different set/steps. 
" ++ "This might indicate memory response coming back out-of-order.") } @@ -378,15 +392,12 @@ class TensorCoreDecoupled( )) // operandA is 4x4 in K-major val operandADimensional = - operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq require(operandADimensional.length == tilingParams.mc && operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") - // operandB is 2x4 in K-major - // val operandBDimensional = - // operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - // .grouped(4/*k-dim*/).toSeq + // select 2x4 subtile out of operandB that is 4x4 in K-major val operandBDimensional = operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq From c0292dd0aa97a8cec3d034b20e2a167f78e54af8 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 21:51:34 -0700 Subject: [PATCH 31/47] tensor: Enlarge operand buffer for A for better SMEM reuse --- .../radiance/core/TensorCoreDecoupled.scala | 159 +++++++++++------- 1 file changed, 100 insertions(+), 59 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 90cb785..fa3f6e9 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -146,18 +146,6 @@ class TensorCoreDecoupled( // Memory traffic generation // ------------------------- // - class TensorMemTag extends Bundle { - val set = UInt(setBits.W) - val step = UInt(stepBits.W) - val substep = UInt(1.W) - } - // use concatenation of set/step as the memory request source. This will get - // translated to the actual TL sourcewidth in sourceGen. 
- val tag = Wire(new TensorMemTag) - tag.set := setAccess - tag.step := stepAccess - tag.substep := substepAccess - val numTilesM = tilingParams.m / tilingParams.mc val numTilesN = tilingParams.n / tilingParams.nc // @cleanup: generalize in terms of M/N/K-majorness? @@ -198,12 +186,41 @@ class TensorCoreDecoupled( val (addressA, addressB) = addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + // 'index' is the index of a memory request among the sequence of requests + // needed to read a full M-column of A or N-row of B. Its range is [0,m/2) + // or [0,n/2), where 2 is the stride can be read in a single request size. + require(tilingParams.m == tilingParams.n, + "currently only supports square SMEM tile") + val numIndices = tilingParams.m / 2 + val indexBits = log2Ceil(numIndices) + val lastIndex = (1 << indexBits) - 1 + + class TensorMemTag extends Bundle { + val set = UInt(setBits.W) + val index = UInt(indexBits.W) + } + + val tagInit = Wire(new TensorMemTag) + tagInit.set := 0.U + tagInit.index := 0.U + val tagA = RegInit(tagInit) + val tagB = RegInit(tagInit) + + when (io.reqA.fire) { + when (tagA.index === lastIndex.U) { + tagA.set := tagA.set + 1.U + } + tagA.index := tagA.index + 1.U + } + when (io.reqB.fire) { + when (tagB.index === lastIndex.U) { + tagB.set := tagB.set + 1.U + } + tagB.index := tagB.index + 1.U + } + val genReqA = (state === TensorState.run) - val numTilesMBits = log2Ceil(numTilesM) - // generate B request at every 4 steps. 
B achieves reuse through outer - // product so it doesn't require access at every step - val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U - val genReqB = (state === TensorState.run) && shouldFireB + val genReqB = (state === TensorState.run) val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) @@ -212,11 +229,11 @@ class TensorCoreDecoupled( case ((req, (resp, respTagged)), i) => { val sourceGen = Module(new SourceGenerator( log2Ceil(numSourceIds), - metadata = Some(tag) + metadata = Some(new TensorMemTag) )) sourceGen.io.gen := req.fire - sourceGen.io.meta := tag + sourceGen.io.meta := (if (i == 0) tagA else tagB) req.valid := (if (i == 0) genReqA else genReqB) req.bits.address := (if (i == 0) addressA else addressB) req.bits.source := sourceGen.io.id.bits @@ -243,7 +260,7 @@ class TensorCoreDecoupled( val firedBNow = io.reqB.fire val firedA = firedAReg || firedANow val firedB = firedBReg || firedBNow - val nextSubstepAccess = firedA && (!shouldFireB || firedB) + val nextSubstepAccess = firedA && firedB val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) // clear out firedABReg every substep when (nextSubstepAccess) { @@ -252,17 +269,12 @@ class TensorCoreDecoupled( substepAccess := substepAccess + 1.U } require(substepAccess.widthOption.get == 1, "there should be only two substeps") - dontTouch(shouldFireB) // Execute stage // ------------- // Backend of the decoupled access/execute pipeline. // // set and step being currently executed in the acc/ex backend - val setExecute = RegInit(0.U(setBits.W)) - val stepExecute = RegInit(0.U(stepBits.W)) - dontTouch(setExecute) - dontTouch(stepExecute) val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) @@ -283,8 +295,10 @@ class TensorCoreDecoupled( // ready for compute. Also send the set/step tag along the pipe for // alignment check. 
+ // @cleanup: dedup A and B below + val fullA = Module(new FillBuffer( - chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ + chiselTypeOf(respQueueB.bits.data), numIndices )) fullA.io.enq.valid := respQueueA.valid fullA.io.enq.bits := respQueueA.bits.data @@ -337,23 +351,48 @@ class TensorCoreDecoupled( fullB.io.deq.ready := fullBBuf.io.enq.ready fullBTag.io.deq.ready := fullBBuf.io.enq.ready - val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid - val operandA = fullABuf.io.deq.bits.data - val operandATag = fullABuf.io.deq.bits.tag - val operandB = fullBBuf.io.deq.bits.data val dpuReady = Wire(Bool()) + val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid val dpuFire = operandsValid && dpuReady - val setCompute = fullABuf.io.deq.bits.tag.set - val stepCompute = fullABuf.io.deq.bits.tag.step + + val setCompute = RegInit(0.U(setBits.W)) + val stepCompute = RegInit(0.U(stepBits.W)) val substepCompute = RegInit(0.U(1.W)) + val nextStepCompute = dpuFire && (substepCompute === 1.U) + dontTouch(setCompute) + dontTouch(stepCompute) + dontTouch(substepCompute) when (dpuFire) { substepCompute := substepCompute + 1.U } - // hold full A until two-cycle compute is done - fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) - // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when - // we fully iterated a column (M-dimension). 
+ // Operand selection + // + // select the correct 4x4 tile from A operand buffer + val numTilesMBits = log2Ceil(numTilesM) + def selectOperandA(buf: Vec[UInt]): UInt = { + require(buf.length == numIndices) + val stepM = stepCompute & ((1 << numTilesMBits) - 1).U + Cat(buf((stepM << 1) + 1.U), buf(stepM << 1)) + } + val operandA = selectOperandA(fullABuf.io.deq.bits.data) + val operandATag = fullABuf.io.deq.bits.tag + // select the correct 2x4 tile from B operand buffer + val operandB = fullBBuf.io.deq.bits.data(substepCompute) + val operandBTag = fullBBuf.io.deq.bits.tag + dontTouch(operandATag) + dontTouch(operandBTag) + + // Operand buffer dequeue logic + // + // hold A data until the entire set is done + val shouldDequeueAMask = ((1 << stepBits) - 1).U + val shouldDequeueA = + ((stepCompute & shouldDequeueAMask) === shouldDequeueAMask) && + (substepCompute === 1.U) + fullABuf.io.deq.ready := dpuFire && shouldDequeueA + // hold B tile at respQueueB for multiple steps for reuse, only dequeue when + // we fully iterated a column (M-dimension) val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) && @@ -361,11 +400,9 @@ class TensorCoreDecoupled( fullBBuf.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) + dontTouch(shouldDequeueA) dontTouch(shouldDequeueB) - // FIXME: this should be nextStepCompute - val nextStepExecute = dpuFire && (substepCompute === 1.U) - // Assert that the DPU is computing with operands of the same set/step. Note // that the B resp will only have step values multiple of 4 due to reuse. 
// @@ -374,11 +411,9 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) && - ((fullABuf.io.deq.bits.tag.step & stepMask) === - (fullBBuf.io.deq.bits.tag.step & stepMask)), - "A and B operands are pointing to different set/steps. " ++ - "This might indicate memory response coming back out-of-order.") + assert(fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set, + "A and B operands are pointing to different sets. " ++ + "This might indicate memory response coming back out-of-order.") } } assertAligned @@ -386,23 +421,24 @@ class TensorCoreDecoupled( // Dot-product unit // // 4x2 four-element DPUs summing up to 32 MACs in total + // val ncSubstep = tilingParams.nc / 2 + require(tilingParams.mc * ncSubstep == numLanes, + "substep tile size doesn't match writeback throughput") val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)( Module(new TensorDotProductUnit(half = false)) )) - // operandA is 4x4 in K-major - val operandADimensional = - operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4/*k-dim*/).toSeq + + // reshape operands for easier routing to DPU + def reshapeByFourWords(x: UInt): Seq[Seq[UInt]] = { + x.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + .grouped(4/*k-dim*/).toSeq + } + val operandADimensional = reshapeByFourWords(operandA) require(operandADimensional.length == tilingParams.mc && operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") - // select 2x4 subtile out of operandB that is 4x4 in K-major - val operandBDimensional = - operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4/*k-dim*/).toSeq - require(tilingParams.mc * ncSubstep == numLanes, - "substep tile size doesn't match writeback throughput") + val operandBDimensional = reshapeByFourWords(operandB) 
require(operandBDimensional.length == ncSubstep && operandBDimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") @@ -444,12 +480,17 @@ class TensorCoreDecoupled( // ---------------- // These queues hold metadata needed for writeback in sync with the DPU. + class TensorComputeTag extends Bundle { + val set = UInt(setBits.W) + val step = UInt(stepBits.W) + val substep = UInt(1.W) + } + val queueDepth = 5 // needs to be at least the DPU latency - val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth)) + val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth)) tagQueue.io.enq.valid := dpuFire - // A and B should have the same tags - tagQueue.io.enq.bits := operandATag - // @cleanup: awkward + tagQueue.io.enq.bits.set := setCompute + tagQueue.io.enq.bits.step := stepCompute tagQueue.io.enq.bits.substep := substepCompute tagQueue.io.deq.ready := io.writeback.fire assert(tagQueue.io.enq.ready === true.B, @@ -490,7 +531,7 @@ class TensorCoreDecoupled( } } sequenceSetStep(setAccess, stepAccess, nextStepAccess) - sequenceSetStep(setExecute, stepExecute, nextStepExecute) + sequenceSetStep(setCompute, stepCompute, nextStepCompute) switch(state) { is(TensorState.idle) { From 0aadc6074ad32fccb13118f8e0915d8b76f2a267 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 22:42:41 -0700 Subject: [PATCH 32/47] tensor: Decouple A and B access states Get rid of set/stepAccess states and let A and B access progress independently. 
--- .../radiance/core/TensorCoreDecoupled.scala | 200 ++++++++---------- 1 file changed, 88 insertions(+), 112 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index fa3f6e9..ed241b5 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -82,15 +82,6 @@ class TensorCoreDecoupled( // This drives the overall pipeline of memory requests, dot-product unit // operations and regfile writeback. - object TensorState extends ChiselEnum { - val idle = Value(0.U) - val run = Value(1.U) - // All set/step sequencing is complete and the tensor core is holding the - // result data until downstream writeback is ready. - // FIXME: is this necessary if writeback is decoupled with queues? - val finish = Value(2.U) - } - val state = RegInit(TensorState.idle) val busy = RegInit(false.B) // Holds the warp id the core is currently working on. Note that we only // support one outstanding warp request @@ -107,22 +98,10 @@ class TensorCoreDecoupled( def setDone(set: UInt) = (set === lastSet.U) def stepDone(step: UInt) = (step === lastStep.U) - // set and step being currently accessed in the acc/ex frontend - val setAccess = RegInit(0.U(setBits.W)) - val stepAccess = RegInit(0.U(stepBits.W)) - // we need full 4x4 A tile to fire DPU, but since the memory width is 8 - // words, we need 2 cycles to read A. `substep` tells which cycle we're at. 
- val substepAccess = RegInit(0.U(1.W)) - dontTouch(setAccess) - dontTouch(stepAccess) - dontTouch(substepAccess) - - when(io.initiate.fire) { + when (io.initiate.fire) { val wid = io.initiate.bits.wid busy := true.B warpReg := wid - setAccess := 0.U - stepAccess := 0.U when(io.writeback.fire) { assert( io.writeback.bits.wid =/= wid, @@ -143,55 +122,51 @@ class TensorCoreDecoupled( // serialize every HGMMA request io.initiate.ready := !busy - // Memory traffic generation - // ------------------------- + // =========================================================================== + // Access stage + // =========================================================================== // - val numTilesM = tilingParams.m / tilingParams.mc - val numTilesN = tilingParams.n / tilingParams.nc - // @cleanup: generalize in terms of M/N/K-majorness? - def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt) - : (UInt/*A*/, UInt/*B*/) = { - // note that step iterates along N first, then M - val tileM = step % numTilesM.U - val tileN = step / numTilesM.U + // Frontend of the decoupled access/execute pipeline. 
- // note that both A and B are K-major to facilitate bank conflict-free SMEM - // accesses - // - // (row,col) coordinate of the compute tile - val tileRowA = tileM // M - val tileColA = set // K - val tileRowB = tileN // N - val tileColB = set // K - // (row,col) coordinate of the starting element of the compute tile - val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) + - (substep << log2Ceil(tilingParams.mc / 2)) - val elemColA = tileColA << log2Ceil(tilingParams.kc) - val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) + - (substep << log2Ceil(tilingParams.nc / 2)) - val elemColB = tileColB << log2Ceil(tilingParams.kc) - val rowStrideA = wordSize * tilingParams.k - val rowStrideABits = log2Ceil(rowStrideA) - val rowStrideB = wordSize * tilingParams.k - val rowStrideBBits = log2Ceil(rowStrideB) - val wordStrideBits = log2Ceil(wordSize) - - val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits) - val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits) - - (baseA + tileOffsetA, baseB + tileOffsetB) + // States + // + object AccessorState extends ChiselEnum { + val idle = Value(0.U) + val access = Value(1.U) + // All set/step sequencing is complete and the tensor core is holding the + // result data until downstream writeback is ready. + // FIXME: is this necessary if writeback is decoupled with queues? 
+ val finish = Value(2.U) } + val state = RegInit(AccessorState.idle) + val allReqsDone = WireInit(false.B) + dontTouch(allReqsDone) - // FIXME: bogus base address - val (addressA, addressB) = - addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + switch(state) { + is(AccessorState.idle) { + when(io.initiate.fire) { + state := AccessorState.access + } + } + is(AccessorState.access) { + when (allReqsDone) { + state := AccessorState.finish + } + } + is(AccessorState.finish) { + // FIXME: decouple writeback + when(io.writeback.fire) { + state := AccessorState.idle + } + } + } // 'index' is the index of a memory request among the sequence of requests // needed to read a full M-column of A or N-row of B. Its range is [0,m/2) // or [0,n/2), where 2 is the stride can be read in a single request size. require(tilingParams.m == tilingParams.n, "currently only supports square SMEM tile") - val numIndices = tilingParams.m / 2 + val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/ val indexBits = log2Ceil(numIndices) val lastIndex = (1 << indexBits) - 1 @@ -219,9 +194,51 @@ class TensorCoreDecoupled( tagB.index := tagB.index + 1.U } - val genReqA = (state === TensorState.run) - val genReqB = (state === TensorState.run) + // Address generation + // + def addressGen(base: UInt, set: UInt, index: UInt): UInt = { + // note that both A and B are K-major to facilitate bank conflict-free SMEM + // accesses, so that below code applies to both. 
+ // + // (row,col) coordinate of the compute tile + val tileRow = index + val tileCol = set + // (row,col) coordinate of the starting element of the compute tile + val elemRow = index << 1 + val elemCol = tileCol << log2Ceil(tilingParams.kc) + val rowStride = tilingParams.k * wordSize + val rowStrideBits = log2Ceil(rowStride) + val wordStrideBits = log2Ceil(wordSize) + val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits) + base + tileOffset + } + + // FIXME: bogus base address + val addressA = addressGen(0.U, tagA.set, tagA.index) + val addressB = addressGen(0.U, tagB.set, tagB.index) + + val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U) + val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U) + val doneReqA = RegInit(false.B) + val doneReqB = RegInit(false.B) + when (lastReqA && io.reqA.fire) { doneReqA := true.B } + when (lastReqB && io.reqB.fire) { doneReqB := true.B } + val genReqA = (state === AccessorState.access) && !doneReqA + val genReqB = (state === AccessorState.access) && !doneReqA + when (state === AccessorState.finish) { + doneReqA := false.B + doneReqB := false.B + tagA.set := 0.U + tagA.index := 0.U + tagB.set := 0.U + tagB.index := 0.U + } + + allReqsDone := doneReqA && doneReqB + + // Request generation + // val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), @@ -249,34 +266,13 @@ class TensorCoreDecoupled( } } - // only advance to the next step if we fired mem requests for both A and B. - // also consider that B doesn't have to be fired every time due to reuse. - // @perf: too strict? 
should be able to have A and B progress separately - val firedAReg = RegInit(false.B) - val firedBReg = RegInit(false.B) - when (io.reqA.fire) { firedAReg := true.B } - when (io.reqB.fire) { firedBReg := true.B } - val firedANow = io.reqA.fire - val firedBNow = io.reqB.fire - val firedA = firedAReg || firedANow - val firedB = firedBReg || firedBNow - val nextSubstepAccess = firedA && firedB - val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) - // clear out firedABReg every substep - when (nextSubstepAccess) { - firedAReg := false.B - firedBReg := false.B - substepAccess := substepAccess + 1.U - } - require(substepAccess.widthOption.get == 1, "there should be only two substeps") - + // =========================================================================== // Execute stage - // ------------- + // =========================================================================== + // // Backend of the decoupled access/execute pipeline. // - // set and step being currently executed in the acc/ex backend - - val respQueueDepth = 4 // FIXME: parameterize + val respQueueDepth = 8 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) @@ -369,6 +365,7 @@ class TensorCoreDecoupled( // Operand selection // // select the correct 4x4 tile from A operand buffer + val numTilesM = tilingParams.m / tilingParams.mc val numTilesMBits = log2Ceil(numTilesM) def selectOperandA(buf: Vec[UInt]): UInt = { require(buf.length == numIndices) @@ -383,7 +380,7 @@ class TensorCoreDecoupled( dontTouch(operandATag) dontTouch(operandBTag) - // Operand buffer dequeue logic + // Operand buffer logic // // hold A data until the entire set is done val shouldDequeueAMask = ((1 << stepBits) - 1).U @@ -476,8 +473,8 @@ class TensorCoreDecoupled( } io.writeback.bits.data := flattenedDPUOut - // Writeback queues - // ---------------- + // Writeback logic + // // These queues hold metadata needed for writeback in sync with 
the DPU. class TensorComputeTag extends Bundle { @@ -530,28 +527,7 @@ class TensorCoreDecoupled( } } } - sequenceSetStep(setAccess, stepAccess, nextStepAccess) sequenceSetStep(setCompute, stepCompute, nextStepCompute) - - switch(state) { - is(TensorState.idle) { - when(io.initiate.fire) { - state := TensorState.run - } - } - is(TensorState.run) { - when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) { - when (state === TensorState.run) { - state := TensorState.finish - } - } - } - is(TensorState.finish) { - when(io.writeback.fire) { - state := TensorState.idle - } - } - } } // A buffer that collects multiple entries of input data and exposes the From e946403d7863abaec61b4baa92da467617d3fe66 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 22:54:48 -0700 Subject: [PATCH 33/47] tensor: Fix typo, reduce resp queue depth --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index ed241b5..b899ce9 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -225,7 +225,7 @@ class TensorCoreDecoupled( when (lastReqA && io.reqA.fire) { doneReqA := true.B } when (lastReqB && io.reqB.fire) { doneReqB := true.B } val genReqA = (state === AccessorState.access) && !doneReqA - val genReqB = (state === AccessorState.access) && !doneReqA + val genReqB = (state === AccessorState.access) && !doneReqB when (state === AccessorState.finish) { doneReqA := false.B doneReqB := false.B @@ -272,7 +272,7 @@ class TensorCoreDecoupled( // // Backend of the decoupled access/execute pipeline. 
// - val respQueueDepth = 8 // FIXME: parameterize + val respQueueDepth = 2 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) From b3c328b1be7bf924fdddc285fcf5181d8f55c6cf Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 23:11:19 -0700 Subject: [PATCH 34/47] tensor: Assert minimum response queue depth with doc --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index b899ce9..cd3bfa4 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -272,7 +272,13 @@ class TensorCoreDecoupled( // // Backend of the decoupled access/execute pipeline. // - val respQueueDepth = 2 // FIXME: parameterize + val respQueueDepth = 4 // FIXME: parameterize + require(respQueueDepth >= 4, + "respQueueDepth must be at least 4. This is because the B operand buffer " ++ + "is shallower than A's, so the B response queue has to be deep enough to " ++ + "hold younger requests until A operand buffer becomes valid and the first DPU " ++ + "fire can happen. 
FIXME: make operand buffer report per-subtile valid so " ++ + "the first compute can happen earlier.") val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) @@ -547,6 +553,7 @@ class FillBuffer[T <: Data]( val data = Reg(Vec(entries, gen)) val ptr = Counter(entries + 1) + dontTouch(ptr.value) val full = (ptr.value === entries.U) io.enq.ready := !full when (io.enq.fire) { From a98cb32343810994737e60446c7b0c5d975a6f37 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 21 Oct 2024 21:56:36 -0700 Subject: [PATCH 35/47] tensor: Inject stalls to A ram for fuzzing --- .../radiance/core/TensorCoreDecoupled.scala | 26 +++++++++++++++++-- .../scala/radiance/memory/Coalescing.scala | 3 ++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index cd3bfa4..c53ab81 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -216,7 +216,7 @@ class TensorCoreDecoupled( // FIXME: bogus base address val addressA = addressGen(0.U, tagA.set, tagA.index) - val addressB = addressGen(0.U, tagB.set, tagB.index) + val addressB = addressGen(0x400.U, tagB.set, tagB.index) val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U) val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U) @@ -672,14 +672,36 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { beatBytes = 32 // @cleanup: hardcoded )) + val stutter = new TLIdentityNode xbar.node :=* tensor.node - ramA.node := xbar.node + ramA.node := stutter := xbar.node ramB.node := xbar.node + val fuzz = true + lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { tensor.module.io.start := io.start io.finished := tensor.module.io.finished + + val (tlIn, _) = stutter.in(0) + val (tlOut, _) = stutter.out(0) + 
require(stutter.in.length == 1) + require(stutter.out.length == 1) + + // inject stalls for fuzzing + val incr = Wire(Bool()) + val (count, _) = Counter(incr, 0x1000) + def cond(x: UInt) = (x & ((1 << 3) - 1).U) =/= 0.U + val stall = if (fuzz) cond(count) else false.B + + tlOut.a <> tlIn.a + tlIn.d <> tlOut.d + incr := tlIn.a.fire || stall + when (stall) { + tlIn.a.ready := false.B + tlOut.a.valid := false.B + } } } diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index cac5e95..a21daee 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -372,7 +372,8 @@ class SourceGenerator[T <: Data]( outstanding := outstanding + 1.U } }.elsewhen(io.reclaim.valid) { - assert(outstanding > 0.U) + assert(outstanding > 0.U, + "Over-reclaim. Did some responses get dropped?") outstanding := outstanding - 1.U } dontTouch(outstanding) From 408888ae8f0f05364412ddba8246d9adf7502f87 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 21 Oct 2024 22:38:29 -0700 Subject: [PATCH 36/47] tensor: addPath()s for hopper generated chisel FIXME: SourceGenerator has a name-clash. 
--- src/main/scala/radiance/tile/VortexCore.scala | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index d6561e3..26c6989 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -128,7 +128,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) "NUM_THREADS" -> tile.numLsuLanes ) ) - with HasBlackBoxResource { + with HasBlackBoxResource with HasBlackBoxPath { // addResource("/vsrc/vortex/hw/unit_tests/generic_queue/testbench.v") // addResource("/vsrc/vortex/hw/unit_tests/VX_divide_tb.v") // addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0_rtl.v") @@ -398,6 +398,34 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv") // addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh") + def addHopperTensorCore = { + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRawFN.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRecFN.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/DotProductPipe.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer_1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/metadataTable_4x5.sv") + 
addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/MulFullRawFN.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/occupancyTable_4x1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon_1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorMemTag.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue4_TensorMemRespWithTag.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue5_TensorComputeTag.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_4x261.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_5x7.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is26_oe8_os24.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is47_oe8_os24.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundRawFNToRecFN_e8_s24.sv") + 
addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SimpleTimer.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SourceGenerator.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_2.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorCoreDecoupled.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorDotProductUnit.sv") + } + addHopperTensorCore addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv") addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv") From 0fe2b3b07e5a5210cdb1cb5f92f68596b92ff6fb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 21 Oct 2024 22:39:28 -0700 Subject: [PATCH 37/47] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 4dcbc31..0f06afc 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a +Subproject commit 0f06afc3ef7350e82c008f5f25395abf89879213 From e705e8557fda3a3af2765cf8477853e5ca078c92 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 14:32:53 -0700 Subject: [PATCH 38/47] Fake tensor core at RadianceTile for Verilog 
unique-ification --- src/main/scala/radiance/tile/RadianceTile.scala | 10 ++++++++++ src/main/scala/radiance/tile/VortexCore.scala | 3 +-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 36aef41..18ed1d1 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -379,6 +379,12 @@ class RadianceTile private ( tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode } + // Instantiate a fake TensorCoreDecoupled module to force unique-ification of + // module names in the Chisel-generated Verilog. This should be disabled for + // synthesis runs + val tensor = LazyModule(new radiance.core.TensorCoreDecoupledTL) + tlMasterXbar.node :=* tensor.node + /* below are copied from rocket */ val tile_master_blocker = @@ -839,6 +845,10 @@ class RadianceTileModuleImp(outer: RadianceTile) // TODO: generalize for useVxCache if (!outer.radianceParams.useVxCache) {} + // connect io.start and io.finish of the fake TensorCoreDecoupled module to + // prevent optimize-out + outer.tensor.module.io.start := true.B + // // RoCC // if (outer.roccs.size > 0) { // val (respArb, cmdRouter) = { diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index 9ad4be0..a24dc02 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -242,8 +242,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") - addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv") - addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv") @@ -408,6 +406,7 @@ class Vortex(tile: RadianceTile)(implicit p: 
Parameters) // tensor core addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv") + addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv") // addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh") addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv") From c613341a778e1a31ffbb39cef31b79f95825ff70 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 15:02:55 -0700 Subject: [PATCH 39/47] Disable addPath for old verilog; Deassert valid for tensor core There's an uncaught TL source bug when the core is busy, which doesn't really need to be fixed with this. --- src/main/scala/radiance/tile/RadianceTile.scala | 5 ++--- src/main/scala/radiance/tile/VortexCore.scala | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 18ed1d1..0dbc3bd 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -845,9 +845,8 @@ class RadianceTileModuleImp(outer: RadianceTile) // TODO: generalize for useVxCache if (!outer.radianceParams.useVxCache) {} - // connect io.start and io.finish of the fake TensorCoreDecoupled module to - // prevent optimize-out - outer.tensor.module.io.start := true.B + // connect io.start and io.finish of the fake TensorCoreDecoupled module + outer.tensor.module.io.start := false.B // // RoCC // if (outer.roccs.size > 0) { diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index ea0a16c..fccfb88 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -435,7 +435,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) 
addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorCoreDecoupled.sv") addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorDotProductUnit.sv") } - addHopperTensorCore + // addHopperTensorCore addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv") addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv") From 8818fc92034aa4a5bdcdc6380a7e1b00903b46ad Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 16:26:08 -0700 Subject: [PATCH 40/47] tensor: Fix tagWidth for tensor mem io --- src/main/scala/radiance/tile/RadianceTile.scala | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 0dbc3bd..2e235cf 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -739,7 +739,7 @@ class RadianceTileModuleImp(outer: RadianceTile) } } - def connectTc { + def connectTensor = { val tcb0 = new { val addr = core.io.tc_a_bits_address(31, 0) val tag = core.io.tc_a_bits_tag(3, 0) @@ -758,16 +758,18 @@ class RadianceTileModuleImp(outer: RadianceTile) val adapter = Module( new VortexTLAdapter( outer.smemSourceWidth, - new VortexBundleA(tagWidth = 1, dataWidth = 32 * 8), - new VortexBundleD(tagWidth = 1, dataWidth = 32 * 8), + new VortexBundleA(tagWidth = 4, dataWidth = 32 * 8), + new VortexBundleD(tagWidth = 4, dataWidth = 32 * 8), client ) ) + require(adapter.io.inReq.bits.source.widthOption.get == bundle.tag.widthOption.get) + require(adapter.io.inReq.bits.address.widthOption.get == bundle.addr.widthOption.get) adapter.io.inReq.bits <> DontCare adapter.io.inReq.valid := bundle.aValid adapter.io.inReq.bits.address := bundle.addr adapter.io.inReq.bits.source := 
bundle.tag - adapter.io.inReq.bits.size := 5.U + adapter.io.inReq.bits.size := 5.U // 256 bits adapter.io.inReq.bits.opcode := TLMessages.Get adapter.io.inReq.bits.mask := x"ffffffff".U adapter.io.inResp.ready := bundle.dReady @@ -780,6 +782,8 @@ class RadianceTileModuleImp(outer: RadianceTile) core.io.tc_d_valid := Cat(adapters.last.io.inResp.valid, adapters.head.io.inResp.valid) core.io.tc_d_bits_data := Cat(adapters.last.io.inResp.bits.data, adapters.head.io.inResp.bits.data) core.io.tc_d_bits_tag := Cat(adapters.last.io.inResp.bits.source, adapters.head.io.inResp.bits.source) + require(core.io.tc_d_bits_data.widthOption.get == adapters.head.io.inResp.bits.data.widthOption.get * 2) + require(core.io.tc_d_bits_tag.widthOption.get == adapters.head.io.inResp.bits.source.widthOption.get * 2) } def connectBarrier = { @@ -796,7 +800,7 @@ class RadianceTileModuleImp(outer: RadianceTile) outer.barrierMasterNode.out(0)._1.resp.ready := true.B } - def connectAccelerator: Unit = { + def connectAccelerator = { outer.accMasterNode.out.head._1.cmd.bits := core.io.acc_write_out outer.accMasterNode.out.head._1.cmd.valid := core.io.acc_write_en core.io.acc_read_in := outer.accMasterNode.out.head._1.status @@ -837,7 +841,7 @@ class RadianceTileModuleImp(outer: RadianceTile) connectImem connectDmem connectSmem - connectTc + connectTensor connectBarrier connectAccelerator } From 54ce0f7c34083a1a59a88118f74539d16bc68142 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 17:08:38 -0700 Subject: [PATCH 41/47] tensor: Increase numSourceId to 16 to match RadianceTile --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index c53ab81..040fa4a 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -573,20 
+573,20 @@ class FillBuffer[T <: Data]( // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy // graph. class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule { - val numSrcIds = 4 + val numSourceIds = 16 // node with two edges; one for A and one for B matrix val node = TLClientNode(Seq( TLMasterPortParameters.v2( Seq(TLMasterParameters.v2( name = "TensorCoreDecoupledMatrixANode", - sourceId = IdRange(0, numSrcIds) + sourceId = IdRange(0, numSourceIds) )) ), TLMasterPortParameters.v2( Seq(TLMasterParameters.v2( name = "TensorCoreDecoupledMatrixBNode", - sourceId = IdRange(0, numSrcIds) + sourceId = IdRange(0, numSourceIds) )) ) )) @@ -599,7 +599,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) require(outer.node.out.length == 2/*A and B*/) val tensor = Module(new TensorCoreDecoupled( - 8, 8, outer.numSrcIds , TensorTilingParams())) + 8, 8, outer.numSourceIds , TensorTilingParams())) val wordSize = 4 // @cleanup: hardcoded val zip = Seq((outer.node.out(0), tensor.io.reqA), From b566748bcb2b823c380f2101ab65714db15f8dde Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 17:09:21 -0700 Subject: [PATCH 42/47] tensor: Address gen for block-wise contiguous layout Necessary to meet 32B-alignment requirement for SMEM. --- .../radiance/core/TensorCoreDecoupled.scala | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 040fa4a..ff7f94c 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -200,22 +200,30 @@ class TensorCoreDecoupled( // note that both A and B are K-major to facilitate bank conflict-free SMEM // accesses, so that below code applies to both. 
// - // (row,col) coordinate of the compute tile - val tileRow = index - val tileCol = set - // (row,col) coordinate of the starting element of the compute tile - val elemRow = index << 1 - val elemCol = tileCol << log2Ceil(tilingParams.kc) - val rowStride = tilingParams.k * wordSize - val rowStrideBits = log2Ceil(rowStride) - val wordStrideBits = log2Ceil(wordSize) - val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits) + // a "block" is the 4*8 byte-sized contiguous memory that can be read in + // one SMEM request. The A and B matrices are assumed to be stored in + // block-wise "index"-major order (M-major for A, N-major for B) + val blockRow = set + val blockCol = index + val blockIndex = (blockRow << indexBits) + blockCol + val blockSize = numLanes * wordSize + val blockSizeBits = log2Ceil(blockSize) + val byteOffset = blockIndex << blockSizeBits + base + byteOffset - base + tileOffset + // address generation for byte-wise K-major A and B layout + // val elemRow = blockRow << 1 + // val elemCol = blockCol << log2Ceil(tilingParams.kc) + // val rowStride = tilingParams.k * wordSize + // val rowStrideBits = log2Ceil(rowStride) + // val wordStrideBits = log2Ceil(wordSize) + // val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits) + // base + tileOffset } // FIXME: bogus base address val addressA = addressGen(0.U, tagA.set, tagA.index) + // SMEM 256KB, 8 banks: 0x8000B(32KB) per bank val addressB = addressGen(0x400.U, tagB.set, tagB.index) val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U) From 85eb5e334ff54624587b1869dac3dbcb0f0d1b94 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 17:47:54 -0700 Subject: [PATCH 43/47] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 32ccdee..3abaaff 160000 --- a/src/main/resources/vsrc/vortex +++
b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 32ccdeef0154ca9bd747d1a5c2d6d0203e80caf2 +Subproject commit 3abaaff16ffe4deaf7a44043eab6da92e8afe91b From 0a682fb6eff633a638d89c63ce06b786c552b517 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 17:55:14 -0700 Subject: [PATCH 44/47] tensor: dontTouch TensorDPU io Prevents bits.c from being optimized out and set to Z in TensorCoreDecoupled. --- src/main/scala/radiance/core/TensorDPU.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index 515b1bf..db98b36 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -33,6 +33,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar val data = Bits((outFLen).W) }) }) + dontTouch(io) // [IEEE] -> recode() -> unbox() -> [Hardfloat] -> box() -> ieee() -> [IEEE] // make sure recoding/uncoding happens only at the edge, not at every From 072904a82be7b6754a347d5a055a45b1ea477ac7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 22:06:24 -0700 Subject: [PATCH 45/47] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 3abaaff..78df981 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 3abaaff16ffe4deaf7a44043eab6da92e8afe91b +Subproject commit 78df981366778e394e4db62bfdc14c916ddc9f62 From 95ecc5180fae4a462fc4280b82ff83d6e4f9c65e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 22:44:33 -0700 Subject: [PATCH 46/47] tensor: Decouple warp in execute from access This allows the access stage to accept new initiate back-to-back without waiting for the previous writeback to finish. 
--- .../radiance/core/TensorCoreDecoupled.scala | 73 ++++++++++++------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index ff7f94c..ae763c6 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -85,7 +85,7 @@ class TensorCoreDecoupled( val busy = RegInit(false.B) // Holds the warp id the core is currently working on. Note that we only // support one outstanding warp request - val warpReg = RegInit(0.U(numWarpBits.W)) + val warpAccess = RegInit(0.U(numWarpBits.W)) // sets: k iteration val numSets = (tilingParams.k / tilingParams.kc) @@ -101,7 +101,7 @@ class TensorCoreDecoupled( when (io.initiate.fire) { val wid = io.initiate.bits.wid busy := true.B - warpReg := wid + warpAccess := wid when(io.writeback.fire) { assert( io.writeback.bits.wid =/= wid, @@ -170,28 +170,35 @@ class TensorCoreDecoupled( val indexBits = log2Ceil(numIndices) val lastIndex = (1 << indexBits) - 1 + class State extends Bundle { + val set = UInt(setBits.W) + val index = UInt(indexBits.W) + } class TensorMemTag extends Bundle { + val warp = UInt(numWarpBits.W) val set = UInt(setBits.W) val index = UInt(indexBits.W) } - val tagInit = Wire(new TensorMemTag) - tagInit.set := 0.U - tagInit.index := 0.U - val tagA = RegInit(tagInit) - val tagB = RegInit(tagInit) + val stateInit = Wire(new State) + stateInit.set := 0.U + stateInit.index := 0.U + val stateA = RegInit(stateInit) + val stateB = RegInit(stateInit) + dontTouch(stateA) + dontTouch(stateB) when (io.reqA.fire) { - when (tagA.index === lastIndex.U) { - tagA.set := tagA.set + 1.U + when (stateA.index === lastIndex.U) { + stateA.set := stateA.set + 1.U } - tagA.index := tagA.index + 1.U + stateA.index := stateA.index + 1.U } when (io.reqB.fire) { - when (tagB.index === lastIndex.U) { - tagB.set := tagB.set + 1.U + when (stateB.index === 
lastIndex.U) { + stateB.set := stateB.set + 1.U } - tagB.index := tagB.index + 1.U + stateB.index := stateB.index + 1.U } // Address generation @@ -222,12 +229,12 @@ class TensorCoreDecoupled( } // FIXME: bogus base address - val addressA = addressGen(0.U, tagA.set, tagA.index) + val addressA = addressGen(0.U, stateA.set, stateA.index) // SMEM 256KB, 8 banks: 0x8000B(32KB) per bank - val addressB = addressGen(0x400.U, tagB.set, tagB.index) + val addressB = addressGen(0x8000.U, stateB.set, stateB.index) - val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U) - val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U) + val lastReqA = (stateA.set === lastSet.U) && (stateA.index === lastIndex.U) + val lastReqB = (stateB.set === lastSet.U) && (stateB.index === lastIndex.U) val doneReqA = RegInit(false.B) val doneReqB = RegInit(false.B) when (lastReqA && io.reqA.fire) { doneReqA := true.B } @@ -237,16 +244,25 @@ class TensorCoreDecoupled( when (state === AccessorState.finish) { doneReqA := false.B doneReqB := false.B - tagA.set := 0.U - tagA.index := 0.U - tagB.set := 0.U - tagB.index := 0.U + stateA.set := 0.U + stateA.index := 0.U + stateB.set := 0.U + stateB.index := 0.U } allReqsDone := doneReqA && doneReqB // Request generation // + val tagA = Wire(new TensorMemTag) + tagA.warp := warpAccess + tagA.set := stateA.set + tagA.index := stateA.index + val tagB = Wire(new TensorMemTag) + tagB.warp := warpAccess + tagB.set := stateB.set + tagB.index := stateB.index + val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), @@ -422,9 +438,12 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert(fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set, - "A and B operands are pointing to different sets. 
" ++ + assert(operandATag.warp === operandBTag.warp && + operandATag.set === operandBTag.set, + "A and B operands are pointing to different warps and sets. " ++ "This might indicate memory response coming back out-of-order.") + assert(operandATag.set === setCompute, + "Operand arrived from memory is pointing at a different set than the FSM.") } } assertAligned @@ -492,6 +511,7 @@ class TensorCoreDecoupled( // These queues hold metadata needed for writeback in sync with the DPU. class TensorComputeTag extends Bundle { + val warp = UInt(numWarpBits.W) val set = UInt(setBits.W) val step = UInt(stepBits.W) val substep = UInt(1.W) @@ -500,6 +520,7 @@ class TensorCoreDecoupled( val queueDepth = 5 // needs to be at least the DPU latency val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth)) tagQueue.io.enq.valid := dpuFire + tagQueue.io.enq.bits.warp := operandATag.warp tagQueue.io.enq.bits.set := setCompute tagQueue.io.enq.bits.step := stepCompute tagQueue.io.enq.bits.substep := substepCompute @@ -518,12 +539,12 @@ class TensorCoreDecoupled( (step << 1/*2 substeps*/) + substep } + val warpWriteback = tagQueue.io.deq.bits.warp val setWriteback = tagQueue.io.deq.bits.set val stepWriteback = tagQueue.io.deq.bits.step val substepWriteback = tagQueue.io.deq.bits.substep io.writeback.valid := dpuValid - // TODO: decouple wid from frontend - io.writeback.bits.wid := warpReg + io.writeback.bits.wid := warpWriteback io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback) io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) && (substepWriteback === 1.U) @@ -685,7 +706,7 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { ramA.node := stutter := xbar.node ramB.node := xbar.node - val fuzz = true + val fuzz = false lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { From 2a8c488d282ebc118bf1476c597dd6e8640d100a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 
23:10:11 -0700 Subject: [PATCH 47/47] tensor: Reassert initiate.ready as soon as access ready --- .../radiance/core/TensorCoreDecoupled.scala | 116 +++++++----------- 1 file changed, 44 insertions(+), 72 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index ae763c6..c42dc29 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -69,6 +69,11 @@ class TensorCoreDecoupled( val source = UInt(sourceWidth.W) val data = UInt(dataWidth.W) } + class TensorMemTag extends Bundle { + val warp = UInt(numWarpBits.W) + val set = UInt(setBits.W) + val index = UInt(indexBits.W) + } // mem response after translation from TL source to set/step tag class TensorMemRespWithTag( dataWidth: Int @@ -77,15 +82,11 @@ class TensorCoreDecoupled( val data = UInt(dataWidth.W) } - // FSM - // --- - // This drives the overall pipeline of memory requests, dot-product unit - // operations and regfile writeback. - - val busy = RegInit(false.B) - // Holds the warp id the core is currently working on. Note that we only - // support one outstanding warp request - val warpAccess = RegInit(0.U(numWarpBits.W)) + // =========================================================================== + // Access stage + // =========================================================================== + // + // Frontend of the decoupled access/execute pipeline. // sets: k iteration val numSets = (tilingParams.k / tilingParams.kc) @@ -97,39 +98,15 @@ class TensorCoreDecoupled( val lastStep = ((1 << stepBits) - 1) def setDone(set: UInt) = (set === lastSet.U) def stepDone(step: UInt) = (step === lastStep.U) + // 'index' is the index of a memory request among the sequence of requests + // needed to read a full M-column of A or N-row of B. Its range is [0,m/2) + // or [0,n/2), where 2 is the stride that can be read in a single request.
+ require(tilingParams.m == tilingParams.n, + "currently only supports square SMEM tile") + val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/ + val indexBits = log2Ceil(numIndices) + val lastIndex = (1 << indexBits) - 1 - when (io.initiate.fire) { - val wid = io.initiate.bits.wid - busy := true.B - warpAccess := wid - when(io.writeback.fire) { - assert( - io.writeback.bits.wid =/= wid, - "unsupported concurrent initiate and writeback to the same warp" - ) - } - } - - // TODO: @perf: Instead of waiting until the last writeback, release busy as - // soon as the access frontend is complete so that there's a better chance to - // saturate the backend with back-to-back HGMMAs. This would require sending - // the 'wid' register to backend instead of having it shared with the - // frontend. - when(io.writeback.fire && io.writeback.bits.last) { - busy := false.B - } - - // serialize every HGMMA request - io.initiate.ready := !busy - - // =========================================================================== - // Access stage - // =========================================================================== - // - // Frontend of the decoupled access/execute pipeline. 
- - // States - // object AccessorState extends ChiselEnum { val idle = Value(0.U) val access = Value(1.U) @@ -142,6 +119,30 @@ class TensorCoreDecoupled( val allReqsDone = WireInit(false.B) dontTouch(allReqsDone) + val warpAccess = RegInit(0.U(numWarpBits.W)) + + class BlockState extends Bundle { + val set = UInt(setBits.W) + val index = UInt(indexBits.W) + } + val stateInit = Wire(new BlockState) + stateInit.set := 0.U + stateInit.index := 0.U + val stateA = RegInit(stateInit) + val stateB = RegInit(stateInit) + dontTouch(stateA) + dontTouch(stateA.index) + dontTouch(stateB) + dontTouch(stateB.index) + + io.initiate.ready := (state === AccessorState.idle) + when (io.initiate.fire) { + warpAccess := io.initiate.bits.wid + assert(stateA.set === 0.U && stateA.index === 0.U && + stateB.set === 0.U && stateB.index === 0.U, + "stateA and stateB not initialized to zero") + } + switch(state) { is(AccessorState.idle) { when(io.initiate.fire) { @@ -154,40 +155,11 @@ class TensorCoreDecoupled( } } is(AccessorState.finish) { - // FIXME: decouple writeback - when(io.writeback.fire) { - state := AccessorState.idle - } + // FIXME: is finish state needed? + state := AccessorState.idle } } - // 'index' is the index of a memory request among the sequence of requests - // needed to read a full M-column of A or N-row of B. Its range is [0,m/2) - // or [0,n/2), where 2 is the stride can be read in a single request size. 
- require(tilingParams.m == tilingParams.n, - "currently only supports square SMEM tile") - val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/ - val indexBits = log2Ceil(numIndices) - val lastIndex = (1 << indexBits) - 1 - - class State extends Bundle { - val set = UInt(setBits.W) - val index = UInt(indexBits.W) - } - class TensorMemTag extends Bundle { - val warp = UInt(numWarpBits.W) - val set = UInt(setBits.W) - val index = UInt(indexBits.W) - } - - val stateInit = Wire(new State) - stateInit.set := 0.U - stateInit.index := 0.U - val stateA = RegInit(stateInit) - val stateB = RegInit(stateInit) - dontTouch(stateA) - dontTouch(stateB) - when (io.reqA.fire) { when (stateA.index === lastIndex.U) { stateA.set := stateA.set + 1.U