From ab8d3554bb134fd96e622cd3c0a13406adbdff34 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 15:45:52 -0700 Subject: [PATCH 01/27] Bump vortex to tensor-decoupled --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index da54162..4dcbc31 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit da54162241da020807274bd4087844d379d8170e +Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a From 2ca2ee37b0fffeb7225940b03c206f3237f10b85 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 15:45:59 -0700 Subject: [PATCH 02/27] tensor: Fix writeback datawidth --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 617659d..65246f6 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -42,7 +42,7 @@ class TensorCoreDecoupled( val writeback = Decoupled(new Bundle { val last = Bool() val wid = UInt(numWarpBits.W) - val data = Vec(numLanes, UInt(wordSize.W)) + val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W)) }) val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) @@ -135,7 +135,7 @@ class TensorCoreDecoupled( // Execute stage // ------------- - // Execute backend of the decoupled access/execute pipeline. + // Backend of the decoupled access/execute pipeline. // val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(io.respA, respQueueDepth) @@ -144,7 +144,7 @@ class TensorCoreDecoupled( respQueueB.ready := io.writeback.ready // FIXME require(respQueueA.bits.data.widthOption.get == - io.writeback.bits.data.widthOption.get * numLanes, + io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") // FIXME: debug dummy: pipe A directly to writeback From de393115cd97f812ceb91ef94598ca8d46570202 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 16:48:39 -0700 Subject: [PATCH 03/27] tensor: Translate TL response source to set/step tag --- .../radiance/core/TensorCoreDecoupled.scala | 79 +++++++++++++------ 1 file changed, 53 insertions(+), 26 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 65246f6..43dc1ca 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -32,8 +32,8 @@ class TensorCoreDecoupled( ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 - val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 val sourceWidth = log2Ceil(numSourceIds) + val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 val io = IO(new Bundle { val initiate = Flipped(Decoupled(new Bundle { @@ -51,6 +51,27 @@ class TensorCoreDecoupled( }) dontTouch(io) + class TensorMemReq( + sourceWidth: Int + ) extends Bundle { + val source = UInt(sourceWidth.W) + val address = UInt(32.W) + } + class TensorMemResp( + sourceWidth: Int, + dataWidth: Int + ) extends Bundle { + val source = UInt(sourceWidth.W) + val data = UInt(dataWidth.W) + } + // mem response after translation from TL source to set/step tag + class TensorMemRespWithTag( + dataWidth: Int + ) extends Bundle { + val tag = new TensorMemTag + val data = UInt(dataWidth.W) + } + // FSM // --- // This drives the overall pipeline of memory requests, dot-product unit @@ -101,18 +122,39 @@ class TensorCoreDecoupled( // val genReq = (state === TensorState.run) - Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach { - case (req, resp) => { - val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds))) + class TensorMemTag extends Bundle { + val set = UInt(setBits.W) + val step = UInt(stepBits.W) + } + // use concatenation of set/step as the memory request source. This will get + // translated to the actual TL sourcewidth in sourceGen. + val tag = Wire(new TensorMemTag) + tag.set := set + tag.step := step + + val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) + val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) + Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach { + case (req, (resp, respTagged)) => { + val sourceGen = Module(new SourceGenerator( + log2Ceil(numSourceIds), + metadata = Some(tag) + )) sourceGen.io.gen := req.fire - sourceGen.io.meta := DontCare + sourceGen.io.meta := tag req.valid := genReq req.bits.address := 0.U // FIXME req.bits.source := sourceGen.io.id.bits sourceGen.io.reclaim.valid := resp.fire sourceGen.io.reclaim.bits := resp.bits.source + + // translate source + respTagged.valid := resp.valid + respTagged.bits.tag := sourceGen.io.peek + respTagged.bits.data := resp.bits.data + resp.ready := respTagged.ready } } @@ -130,16 +172,13 @@ class TensorCoreDecoupled( firedABReg := Seq(false.B, false.B) } - io.respA.ready := true.B // FIXME - io.respB.ready := true.B // FIXME - // Execute stage // ------------- // Backend of the decoupled access/execute pipeline. // val respQueueDepth = 4 // FIXME: parameterize - val respQueueA = Queue(io.respA, respQueueDepth) - val respQueueB = Queue(io.respB, respQueueDepth) + val respQueueA = Queue(respATagged, respQueueDepth) + val respQueueB = Queue(respBTagged, respQueueDepth) respQueueA.ready := io.writeback.ready // FIXME respQueueB.ready := io.writeback.ready // FIXME @@ -149,9 +188,11 @@ class TensorCoreDecoupled( // FIXME: debug dummy: pipe A directly to writeback io.writeback.valid := respQueueA.valid - val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/) + val groupedRespA = respQueueA.bits.data + .asBools.grouped(wordSize * 8/*bits*/) + .map(VecInit(_).asUInt) (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) => - wb := VecInit(data).asUInt + wb := data } // State transition @@ -204,20 +245,6 @@ class TensorCoreDecoupled( // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) } -class TensorMemReq( - sourceWidth: Int -) extends Bundle { - val source = UInt(sourceWidth.W) - val address = UInt(32.W) -} -class TensorMemResp( - sourceWidth: Int, - dataWidth: Int -) extends Bundle { - val source = UInt(sourceWidth.W) - val data = UInt(dataWidth.W) -} - // synthesizable unit tests // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy From efaf599fbe679f0e5e7ef671522408f34984057e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 17:08:14 -0700 Subject: [PATCH 04/27] tensor: Assert alignment of A and B response queues --- .../radiance/core/TensorCoreDecoupled.scala | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 43dc1ca..4f5ecb3 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -97,15 +97,16 @@ class TensorCoreDecoupled( // steps: i-j iteration val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc) val stepBits = log2Ceil(numSteps) - val set = RegInit(0.U(setBits.W)) - val step = RegInit(0.U(stepBits.W)) + // set and step being currently accessed in the acc/ex frontend + val setAccess = RegInit(0.U(setBits.W)) + val stepAccess = RegInit(0.U(stepBits.W)) when(io.initiate.fire) { val wid = io.initiate.bits.wid busy := true.B warpReg := wid - set := 0.U - step := 0.U + setAccess := 0.U + stepAccess := 0.U when(io.writeback.fire) { assert( io.writeback.bits.wid =/= wid, @@ -129,8 +130,8 @@ class TensorCoreDecoupled( // use concatenation of set/step as the memory request source. This will get // translated to the actual TL sourcewidth in sourceGen. val tag = Wire(new TensorMemTag) - tag.set := set - tag.step := step + tag.set := setAccess + tag.step := stepAccess val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) @@ -176,16 +177,32 @@ class TensorCoreDecoupled( // ------------- // Backend of the decoupled access/execute pipeline. // + // set and step being currently executed in the acc/ex backend + val setExecute = RegInit(0.U(setBits.W)) + val stepExecute = RegInit(0.U(stepBits.W)) + val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) - respQueueA.ready := io.writeback.ready // FIXME - respQueueB.ready := io.writeback.ready // FIXME require(respQueueA.bits.data.widthOption.get == io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") + val bothQueueValid = (respQueueA.valid && respQueueB.valid) + // assume in-order response and that A/B responses are always aligned; this + // might be too strong an assumption depending on the backing memory + when (bothQueueValid) { + assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && + (respQueueA.bits.tag.step === respQueueB.bits.tag.step), + "A and B response queue pointing to different set/steps. " ++ + "This might indicate memory response coming back out-of-order.") + } + // synchronized dequeue + val deqResp = bothQueueValid && io.writeback.ready + respQueueA.ready := deqResp + respQueueB.ready := deqResp + // FIXME: debug dummy: pipe A directly to writeback io.writeback.valid := respQueueA.valid val groupedRespA = respQueueA.bits.data @@ -201,12 +218,12 @@ class TensorCoreDecoupled( // set/step sequencing logic val lastSet = ((1 << setBits) - 1) val lastStep = ((1 << stepBits) - 1) - val setDone = (set === lastSet.U) - val stepDone = (step === lastStep.U) + val setDone = (setAccess === lastSet.U) + val stepDone = (stepAccess === lastStep.U) when (nextStep) { - step := (step + 1.U) & lastStep.U + stepAccess := (stepAccess + 1.U) & lastStep.U when (stepDone) { - set := (set + 1.U) & lastSet.U + setAccess := (setAccess + 1.U) & lastSet.U } } From e2abe1cffdc3a658b0acc5b2cb36a82d5a3450ec Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 15 Oct 2024 19:12:15 -0700 Subject: [PATCH 05/27] tensor: Sequence set/steps in the execute-side --- .../radiance/core/TensorCoreDecoupled.scala | 52 ++++++++++++------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 4f5ecb3..7c07564 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -97,9 +97,16 @@ class TensorCoreDecoupled( // steps: i-j iteration val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc) val stepBits = log2Ceil(numSteps) + val lastSet = ((1 << setBits) - 1) + val lastStep = ((1 << stepBits) - 1) + def setDone(set: UInt) = (set === lastSet.U) + def stepDone(step: UInt) = (step === lastStep.U) + // set and step being currently accessed in the acc/ex frontend val setAccess = RegInit(0.U(setBits.W)) val stepAccess = RegInit(0.U(stepBits.W)) + dontTouch(setAccess) + dontTouch(stepAccess) when(io.initiate.fire) { val wid = io.initiate.bits.wid @@ -118,6 +125,9 @@ class TensorCoreDecoupled( busy := false.B } + // serialize every HGMMA request + io.initiate.ready := !busy + // Memory traffic generation // ------------------------- // @@ -166,10 +176,10 @@ class TensorCoreDecoupled( req.fire }) val firedAB = (firedABNow.asUInt | firedABReg.asUInt) - val nextStep = firedAB.andR + val nextStepAccess = firedAB.andR // clear out firedABReg every step. this will overwrite the previous fired // write upon the last fire out of A and B - when (nextStep) { + when (nextStepAccess) { firedABReg := Seq(false.B, false.B) } @@ -180,6 +190,8 @@ class TensorCoreDecoupled( // set and step being currently executed in the acc/ex backend val setExecute = RegInit(0.U(setBits.W)) val stepExecute = RegInit(0.U(stepBits.W)) + dontTouch(setExecute) + dontTouch(stepExecute) val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) @@ -198,13 +210,19 @@ class TensorCoreDecoupled( "A and B response queue pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } - // synchronized dequeue + // dequeue is synchronized between A and B + // FIXME: this need to change to dpu_ready val deqResp = bothQueueValid && io.writeback.ready respQueueA.ready := deqResp respQueueB.ready := deqResp + // FIXME: this need to change to dpu_fire + val nextStepExecute = io.writeback.fire + + io.writeback.valid := bothQueueValid + io.writeback.bits.wid := warpReg + io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) // FIXME: debug dummy: pipe A directly to writeback - io.writeback.valid := respQueueA.valid val groupedRespA = respQueueA.bits.data .asBools.grouped(wordSize * 8/*bits*/) .map(VecInit(_).asUInt) @@ -216,16 +234,17 @@ class TensorCoreDecoupled( // ---------------- // // set/step sequencing logic - val lastSet = ((1 << setBits) - 1) - val lastStep = ((1 << stepBits) - 1) - val setDone = (setAccess === lastSet.U) - val stepDone = (stepAccess === lastStep.U) - when (nextStep) { - stepAccess := (stepAccess + 1.U) & lastStep.U - when (stepDone) { - setAccess := (setAccess + 1.U) & lastSet.U + + def sequenceSetStep(set: UInt, step: UInt, nextStep: Bool) = { + when (nextStep) { + step := (step + 1.U) & lastStep.U + when (stepDone(step)) { + set := (set + 1.U) & lastSet.U + } } } + sequenceSetStep(setAccess, stepAccess, nextStepAccess) + sequenceSetStep(setExecute, stepExecute, nextStepExecute) switch(state) { is(TensorState.idle) { @@ -234,7 +253,7 @@ class TensorCoreDecoupled( } } is(TensorState.run) { - when (setDone && stepDone && nextStep) { + when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) { when (state === TensorState.run) { state := TensorState.finish } @@ -247,11 +266,6 @@ class TensorCoreDecoupled( } } - io.initiate.ready := !busy - io.writeback.valid := (state === TensorState.finish) - io.writeback.bits.wid := warpReg - io.writeback.bits.last := false.B // TODO - // Writeback queues // ---------------- // These queues hold the metadata necessary for register @@ -328,7 +342,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) tensor.io.initiate.bits.wid := 0.U // FIXME tensor.io.writeback.ready := true.B - io.finished := tensor.io.writeback.valid + io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last } // a minimal Diplomacy graph with a tensor core and a TLRAM From 444dd5d7e1c54ab78111fdcfac9ebf3145809f02 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 14:25:38 -0700 Subject: [PATCH 06/27] tensor: Add destination reg to IO --- .../scala/radiance/core/TensorCoreDecoupled.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 7c07564..92f98b7 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -28,12 +28,14 @@ class TensorCoreDecoupled( val numWarps: Int, val numLanes: Int, val numSourceIds: Int, - val tilingParams: TensorTilingParams + val tilingParams: TensorTilingParams, + val numFPRegs: Int = 32 ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 val sourceWidth = log2Ceil(numSourceIds) val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 + val numFPRegBits = log2Ceil(numFPRegs) val io = IO(new Bundle { val initiate = Flipped(Decoupled(new Bundle { @@ -42,6 +44,7 @@ class TensorCoreDecoupled( val writeback = Decoupled(new Bundle { val last = Bool() val wid = UInt(numWarpBits.W) + val rd = UInt(numFPRegBits.W) val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W)) }) val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) @@ -218,8 +221,17 @@ class TensorCoreDecoupled( // FIXME: this need to change to dpu_fire val nextStepExecute = io.writeback.fire + def rdGen(set: UInt, step: UInt): UInt = { + // each step produces 4x4 output tile, written by 8 threads with 2 regs per + // thread + require(numLanes == 8, "currently assumes 8-wide warps") + (Cat(set, step) >> 1/*2 regs/thread*/) + // FIXME: add substep here + } + io.writeback.valid := bothQueueValid io.writeback.bits.wid := warpReg + io.writeback.bits.rd := rdGen(setExecute, stepExecute) io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) // FIXME: debug dummy: pipe A directly to writeback From 77dae3e1f9941d15c213b19a43cd82bd0e00c81c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 21:21:48 -0700 Subject: [PATCH 07/27] tensor: Write staging pipeline for A tile --- .../radiance/core/TensorCoreDecoupled.scala | 103 ++++++++++++++---- src/main/scala/radiance/core/TensorDPU.scala | 1 + 2 files changed, 83 insertions(+), 21 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 92f98b7..69b84f9 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -108,8 +108,12 @@ class TensorCoreDecoupled( // set and step being currently accessed in the acc/ex frontend val setAccess = RegInit(0.U(setBits.W)) val stepAccess = RegInit(0.U(stepBits.W)) + // we need full 4x4 A tile to fire DPU, but since the memory width is 8 + // words, we need 2 cycles to read A. `substep` tells which cycle we're at. + val substepAccess = RegInit(0.U(1.W)) dontTouch(setAccess) dontTouch(stepAccess) + dontTouch(substepAccess) when(io.initiate.fire) { val wid = io.initiate.bits.wid @@ -139,16 +143,19 @@ class TensorCoreDecoupled( class TensorMemTag extends Bundle { val set = UInt(setBits.W) val step = UInt(stepBits.W) + val substep = UInt(1.W) } // use concatenation of set/step as the memory request source. This will get // translated to the actual TL sourcewidth in sourceGen. val tag = Wire(new TensorMemTag) tag.set := setAccess tag.step := stepAccess + tag.substep := substepAccess val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) - Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach { + Seq((io.reqA, (io.respA, respATagged)), + (io.reqB, (io.respB, respBTagged))).foreach { case (req, (resp, respTagged)) => { val sourceGen = Module(new SourceGenerator( log2Ceil(numSourceIds), @@ -173,18 +180,22 @@ class TensorCoreDecoupled( } // only advance to the next step if we fired mem requests for both A and B + // TODO: @perf: too strict? should be able to have A and B progress + // separately val firedABReg = RegInit(VecInit(false.B, false.B)) val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map { case (req, fired) => { when (req.fire) { fired := true.B } } req.fire }) val firedAB = (firedABNow.asUInt | firedABReg.asUInt) - val nextStepAccess = firedAB.andR - // clear out firedABReg every step. this will overwrite the previous fired - // write upon the last fire out of A and B - when (nextStepAccess) { + val nextSubstepAccess = firedAB.andR + val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) + // clear out firedABReg every substep + when (nextSubstepAccess) { firedABReg := Seq(false.B, false.B) + substepAccess := substepAccess + 1.U } + require(substepAccess.widthOption.get == 1, "there should be only two substeps") // Execute stage // ------------- @@ -204,22 +215,72 @@ class TensorCoreDecoupled( io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") - val bothQueueValid = (respQueueA.valid && respQueueB.valid) - // assume in-order response and that A/B responses are always aligned; this - // might be too strong an assumption depending on the backing memory - when (bothQueueValid) { - assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && - (respQueueA.bits.tag.step === respQueueB.bits.tag.step), - "A and B response queue pointing to different set/steps. " ++ - "This might indicate memory response coming back out-of-order.") - } - // dequeue is synchronized between A and B // FIXME: this need to change to dpu_ready - val deqResp = bothQueueValid && io.writeback.ready - respQueueA.ready := deqResp - respQueueB.ready := deqResp - // FIXME: this need to change to dpu_fire - val nextStepExecute = io.writeback.fire + val dpuReady = io.writeback.ready // FIXME: this need be actual dpu + + val substepExecute = RegInit(0.U(1.W)) + when (respQueueA.fire) { + substepExecute := substepExecute + 1.U + } + dontTouch(substepExecute) + + // note combinationally coupled ready with `pipe` + val halfAQueue = Module(new Queue( + chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true + )) + halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U) + halfAQueue.io.enq.bits := respQueueA.bits.data + + // we need the full data for A because we divide the D tile by half along N; + // for B, the DPU can immediately start computing with a 4x2 tile. + // + // substep == 0 data goes to the LSB + val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits) + val fullAQueue = Module(new Queue( + chiselTypeOf(fullAEnqData), entries = 1, pipe = true + )) + // hold first half A data for the first substep + halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) && + fullAQueue.io.enq.ready + + require(fullAEnqData.widthOption.get == dataWidth * 2, + "assumes 2-cycle read for a full compute tile of A") + fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) && + halfAQueue.io.deq.valid + fullAQueue.io.enq.bits := fullAEnqData + + val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME? + val dpuFire = operandsValid && dpuReady + fullAQueue.io.deq.ready := dpuFire + val nextStepExecute = dpuFire + + // FIXME: need to hold A for two cycles!! + + // make sure to dequeue from response queues only when both A and B valid + respQueueA.ready := MuxCase(false.B, + Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, + (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) + respQueueB.ready := dpuFire + dontTouch(respQueueA) + dontTouch(respQueueB) + + // assert that the A and B response queue heads always point to the same + // set/step/substep + // + // this assumes that memory responses come back in-order. this might be too + // strong an assumption depending on the backing memory + def assertAligned = { + val bothQueueValid = (respQueueA.valid && respQueueB.valid) + when (bothQueueValid && (substepExecute === 0.U)) { + assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && + (respQueueA.bits.tag.step === respQueueB.bits.tag.step), + "A and B response queue pointing to different set/steps. " ++ + "This might indicate memory response coming back out-of-order.") + } + dontTouch(respQueueA.bits.tag) + dontTouch(respQueueB.bits.tag) + } + assertAligned def rdGen(set: UInt, step: UInt): UInt = { // each step produces 4x4 output tile, written by 8 threads with 2 regs per @@ -229,7 +290,7 @@ class TensorCoreDecoupled( // FIXME: add substep here } - io.writeback.valid := bothQueueValid + io.writeback.valid := operandsValid // FIXME: bypass logic io.writeback.bits.wid := warpReg io.writeback.bits.rd := rdGen(setExecute, stepExecute) io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index 4e6cee7..a82bed7 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -27,6 +27,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar val b = Vec(dotProductDim, Bits((inFLen).W)) val c = Bits((outFLen).W) // note C has the out length for accumulation })) + // 'stall' is effectively out.ready, combinationally coupled to in.ready val stall = Input(Bool()) val out = Valid(new Bundle { val data = Bits((outFLen).W) From 6cad8edd1838642cbbb61ef6998c8318d96864e1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 22:01:02 -0700 Subject: [PATCH 08/27] tensor: Fix operand alignment in pipelining --- .../radiance/core/TensorCoreDecoupled.scala | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 69b84f9..0654df3 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -224,37 +224,51 @@ class TensorCoreDecoupled( } dontTouch(substepExecute) + // Do pipelining for the A operand so that we obtain the full 4x4 A tile + // ready for compute. The pipeline is two-stage: + // - stage one (halfAQueue) for assembling the full A tile from half-tiles + // coming from the resp queue, and + // - stage two (fullAQueue) for holding the full A tile until it gets + // matched with two 4x2 B tiles, and compute is complete. + // + // Note that the half-tile assembly is unnecessary for B since the B tile is + // only 4x2. + // Also send the set/step tag along the pipe for alignment check. + // note combinationally coupled ready with `pipe` val halfAQueue = Module(new Queue( - chiselTypeOf(respQueueA.bits.data), entries = 1, pipe = true + chiselTypeOf(respQueueA.bits), entries = 1, pipe = true )) halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U) - halfAQueue.io.enq.bits := respQueueA.bits.data + halfAQueue.io.enq.bits := respQueueA.bits - // we need the full data for A because we divide the D tile by half along N; - // for B, the DPU can immediately start computing with a 4x2 tile. - // // substep == 0 data goes to the LSB - val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits) + val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data) + require(fullAEnqData.widthOption.get == dataWidth * 2, + "assumes 2-cycle read for a full compute tile of A") + // only use the lower halfA's tag. substep will be incorrect. + val fullAEnqTag = halfAQueue.io.deq.bits.tag val fullAQueue = Module(new Queue( - chiselTypeOf(fullAEnqData), entries = 1, pipe = true + new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true )) // hold first half A data for the first substep halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) && fullAQueue.io.enq.ready - - require(fullAEnqData.widthOption.get == dataWidth * 2, - "assumes 2-cycle read for a full compute tile of A") fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) && halfAQueue.io.deq.valid - fullAQueue.io.enq.bits := fullAEnqData + fullAQueue.io.enq.bits.data := fullAEnqData + fullAQueue.io.enq.bits.tag := fullAEnqTag val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME? val dpuFire = operandsValid && dpuReady - fullAQueue.io.deq.ready := dpuFire - val nextStepExecute = dpuFire + val substepCompute = RegInit(0.U(1.W)) + when (dpuFire) { + substepCompute := substepCompute + 1.U + } - // FIXME: need to hold A for two cycles!! + // hold full A until two-cycle compute is done + fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) + val nextStepExecute = dpuFire && (substepCompute === 1.U) // make sure to dequeue from response queues only when both A and B valid respQueueA.ready := MuxCase(false.B, @@ -264,21 +278,17 @@ class TensorCoreDecoupled( dontTouch(respQueueA) dontTouch(respQueueB) - // assert that the A and B response queue heads always point to the same - // set/step/substep + // assert that the DPU is computing with operands of the same set/step // // this assumes that memory responses come back in-order. this might be too // strong an assumption depending on the backing memory def assertAligned = { - val bothQueueValid = (respQueueA.valid && respQueueB.valid) - when (bothQueueValid && (substepExecute === 0.U)) { - assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) && - (respQueueA.bits.tag.step === respQueueB.bits.tag.step), - "A and B response queue pointing to different set/steps. " ++ + when (dpuFire) { + assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) && + (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step), + "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } - dontTouch(respQueueA.bits.tag) - dontTouch(respQueueB.bits.tag) } assertAligned From 23edc34c7ebc28623a5961abc654d7f4049c4864 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 22:15:35 -0700 Subject: [PATCH 09/27] tensor: Add two TLRAM config for full throughput test --- .../radiance/core/TensorCoreDecoupled.scala | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 0654df3..154a3cf 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -155,8 +155,8 @@ class TensorCoreDecoupled( val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), - (io.reqB, (io.respB, respBTagged))).foreach { - case (req, (resp, respTagged)) => { + (io.reqB, (io.respB, respBTagged))).zipWithIndex.foreach { + case ((req, (resp, respTagged)), i) => { val sourceGen = Module(new SourceGenerator( log2Ceil(numSourceIds), metadata = Some(tag) @@ -165,7 +165,9 @@ class TensorCoreDecoupled( sourceGen.io.gen := req.fire sourceGen.io.meta := tag req.valid := genReq - req.bits.address := 0.U // FIXME + // FIXME: bogus address + // req.bits.address := (if (i == 0) 0.U else 0x100.U) // avoids bank conflict for A and B + req.bits.address := 0.U req.bits.source := sourceGen.io.id.bits sourceGen.io.reclaim.valid := resp.fire @@ -270,7 +272,8 @@ class TensorCoreDecoupled( fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) val nextStepExecute = dpuFire && (substepCompute === 1.U) - // make sure to dequeue from response queues only when both A and B valid + // respQueueA output arbitrates to either halfAQueue or fullAQueue depending + // on the substep respQueueA.ready := MuxCase(false.B, Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) @@ -446,10 +449,35 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule { } } +// two separate TLRAMs for A and B for full throughput +class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { + val tensor = LazyModule(new TensorCoreDecoupledTL) + val xbar = LazyModule(new TLXbar) + val ramA = LazyModule(new TLRAM( + address = AddressSet(0x000, 0xfffeff), + beatBytes = 32 // FIXME: hardcoded + )) + val ramB = LazyModule(new TLRAM( + address = AddressSet(0x100, 0xfffeff), + beatBytes = 32 // FIXME: hardcoded + )) + + xbar.node :=* tensor.node + ramA.node := xbar.node + ramB.node := xbar.node + + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with UnitTestModule { + tensor.module.io.start := io.start + io.finished := tensor.module.io.finished + } +} + // unit test harness class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module) + // val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module) + val dut = Module(LazyModule(new TensorCoreDecoupledTwoTLRAM).module) dut.io.start := io.start io.finished := dut.io.finished } From e1e3ac8274bd02954ff4d64ad9462ef4a8bb2f1b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 16 Oct 2024 22:22:27 -0700 Subject: [PATCH 10/27] tensor: Fix busy state --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 154a3cf..652608b 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -128,7 +128,13 @@ class TensorCoreDecoupled( ) } } - when(io.writeback.fire) { + + // TODO: @perf: Instead of waiting until the last writeback, release busy as + // soon as the access frontend is complete so that there's a better chance to + // saturate the backend with back-to-back HGMMAs. This would require sending + // the 'wid' register to backend instead of having it shared with the + // frontend. + when(io.writeback.fire && io.writeback.bits.last) { busy := false.B } From 8847278ad1d54fe3167f01e0b9f70fcd3dd01096 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 14:37:33 -0700 Subject: [PATCH 11/27] tensor: Instantiate actual DPU --- .../radiance/core/TensorCoreDecoupled.scala | 93 +++++++++++++++---- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 652608b..b9695ad 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -33,8 +33,9 @@ class TensorCoreDecoupled( ) extends Module { val numWarpBits = log2Ceil(numWarps) val wordSize = 4 // TODO FP16 + val wordSizeInBits = wordSize * 8 // TODO FP16 val sourceWidth = log2Ceil(numSourceIds) - val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16 + val dataWidth = numLanes * wordSizeInBits // TODO FP16 val numFPRegBits = log2Ceil(numFPRegs) val io = IO(new Bundle { @@ -45,7 +46,7 @@ class TensorCoreDecoupled( val last = Bool() val wid = UInt(numWarpBits.W) val rd = UInt(numFPRegBits.W) - val data = Vec(numLanes, UInt((wordSize * 8/*bits*/).W)) + val data = Vec(numLanes, UInt((wordSizeInBits).W)) }) val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth))) @@ -223,9 +224,6 @@ class TensorCoreDecoupled( io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") - // FIXME: this need to change to dpu_ready - val dpuReady = io.writeback.ready // FIXME: this need be actual dpu - val substepExecute = RegInit(0.U(1.W)) when (respQueueA.fire) { substepExecute := substepExecute + 1.U @@ -267,7 +265,10 @@ class TensorCoreDecoupled( fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag - val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid // FIXME? + val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid + val operandA = fullAQueue.io.deq.bits.data + val operandB = respQueueB.bits.data + val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val substepCompute = RegInit(0.U(1.W)) when (dpuFire) { @@ -301,6 +302,66 @@ class TensorCoreDecoupled( } assertAligned + // Dot-product unit + // + // 4x2 four-element DPUs summing up to 32 MACs in total + val dpus = Seq.fill(4)(Seq.fill(2)( + Module(new TensorDotProductUnit(half = false)) + )) + // operandA is 4x4 in K-major + val operandADimensional = + operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + .grouped(4).toSeq + println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits") + println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}") + assert(operandADimensional.length == tilingParams.mc && + operandADimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") + // operandB is 2x4, i.e. 4x2 in N-major + val operandBDimensional = + operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + .grouped(4).toSeq + println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}") + val ncSubstep = tilingParams.nc / 2 + assert(tilingParams.mc * ncSubstep == numLanes, + "substep tile size doesn't match writeback throughput") + assert(operandBDimensional.length == ncSubstep && + operandBDimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") + + for (m <- 0 until tilingParams.mc) { + for (n <- 0 until ncSubstep) { + dpus(m)(n).io.in.valid := dpuFire + dpus(m)(n).io.in.bits.a := operandADimensional(m) + dpus(m)(n).io.in.bits.b := operandBDimensional(n) + dpus(m)(n).io.in.bits.c := 0.U // FIXME: bogus accum data + // dpu ready couples with writeback backpressure + dpus(m)(n).io.stall := !io.writeback.ready + } + } + dpuReady := !dpus(0)(0).io.stall + dontTouch(dpuFire) + dontTouch(dpuReady) + + val dpuValids = dpus.flatMap(_.map(_.io.out.valid)) + val dpuValid = dpuValids.reduce(_ && _) + def assertDPU = { + val dpuStalls = dpus.flatMap(_.map(_.io.stall)) + assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _), + "stall signals of DPUs went unaligned") + assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _), + "valid signals of DPUs went unaligned") + } + assertDPU + + // flatten DPU output into 1D array in M-major order + val flattenedDPUOut = (0 until ncSubstep).flatMap { n => + (0 until tilingParams.mc).map { m => + dpus(m)(n).io.out.bits.data + } + } + io.writeback.bits.data := flattenedDPUOut + def rdGen(set: UInt, step: UInt): UInt = { // each step produces 4x4 output tile, written by 8 threads with 2 regs per // thread @@ -309,19 +370,11 @@ class TensorCoreDecoupled( // FIXME: add substep here } - io.writeback.valid := operandsValid // FIXME: bypass logic + io.writeback.valid := dpuValid io.writeback.bits.wid := warpReg io.writeback.bits.rd := rdGen(setExecute, stepExecute) io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) - // FIXME: debug dummy: pipe A directly to writeback - val groupedRespA = respQueueA.bits.data - .asBools.grouped(wordSize * 8/*bits*/) - .map(VecInit(_).asUInt) - (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) => - wb := data - } - // State transition // ---------------- // @@ -400,7 +453,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) val tensor = Module(new TensorCoreDecoupled( 8, 8, outer.numSrcIds , TensorTilingParams())) - val wordSize = 4 // FIXME: hardcoded + val wordSize = 4 // @cleanup: hardcoded val zip = Seq((outer.node.out(0), tensor.io.reqA), (outer.node.out(1), tensor.io.reqB)) @@ -431,7 +484,7 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) tlOutB.d.ready := tensor.io.respB.ready tensor.io.initiate.valid := io.start - tensor.io.initiate.bits.wid := 0.U // FIXME + tensor.io.initiate.bits.wid := 0.U // TODO tensor.io.writeback.ready := true.B io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last @@ -443,7 +496,7 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule { val xbar = LazyModule(new TLXbar) val ram = LazyModule(new TLRAM( address = AddressSet(0x0000, 0xffffff), - beatBytes = 32 // FIXME: hardcoded + beatBytes = 32 // @cleanup: hardcoded )) ram.node :=* xbar.node :=* tensor.node @@ -461,11 +514,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { val xbar = LazyModule(new TLXbar) val ramA = LazyModule(new TLRAM( address = AddressSet(0x000, 0xfffeff), - beatBytes = 32 // FIXME: hardcoded + beatBytes = 32 // @cleanup: hardcoded )) val ramB = LazyModule(new TLRAM( address = AddressSet(0x100, 0xfffeff), - beatBytes = 32 // FIXME: hardcoded + beatBytes = 32 // @cleanup: hardcoded )) xbar.node :=* tensor.node From 7de8e86d4f04712f90c4457940c02a341b721f76 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 15:18:47 -0700 Subject: [PATCH 12/27] tensor: Sync rd with DPU using a queue --- .../radiance/core/TensorCoreDecoupled.scala | 44 ++++++++++++------- src/main/scala/radiance/core/TensorDPU.scala | 2 +- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index b9695ad..92a6596 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -270,6 +270,8 @@ class TensorCoreDecoupled( val operandB = respQueueB.bits.data val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady + val setCompute = fullAQueue.io.deq.bits.tag.set + val stepCompute = fullAQueue.io.deq.bits.tag.step val substepCompute = RegInit(0.U(1.W)) when (dpuFire) { substepCompute := substepCompute + 1.U @@ -348,9 +350,9 @@ class TensorCoreDecoupled( def assertDPU = { val dpuStalls = dpus.flatMap(_.map(_.io.stall)) assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _), - "stall signals of DPUs went unaligned") + "stall signals of DPUs went out of sync") assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _), - "valid signals of DPUs went unaligned") + "valid signals of DPUs went out of sync") } assertDPU @@ -362,17 +364,36 @@ class TensorCoreDecoupled( } io.writeback.bits.data := flattenedDPUOut - def rdGen(set: UInt, step: UInt): UInt = { + // Writeback queues + // ---------------- + // These queues hold metadata needed for writeback in sync with the DPU. + + val queueDepth = 4 // needs to be at least the DPU latency + val rdQueue = Module(new Queue( + chiselTypeOf(io.writeback.bits.rd), queueDepth + )) + rdQueue.io.enq.valid := dpuFire + rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute) + rdQueue.io.deq.ready := io.writeback.fire + assert(rdQueue.io.enq.ready === true.B, + "rd queue full, throttling DPU operation") + assert(!dpuValid || rdQueue.io.deq.valid, + "rd queue and DPU went out of sync") + + // TODO: decouple wid from frontend + // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) + + // note rd is independent to sets + def rdGen(step: UInt, substep: UInt): UInt = { // each step produces 4x4 output tile, written by 8 threads with 2 regs per // thread - require(numLanes == 8, "currently assumes 8-wide warps") - (Cat(set, step) >> 1/*2 regs/thread*/) - // FIXME: add substep here + (step << 1/*2 substeps*/) + substep } io.writeback.valid := dpuValid io.writeback.bits.wid := warpReg - io.writeback.bits.rd := rdGen(setExecute, stepExecute) + io.writeback.bits.rd := rdQueue.io.deq.bits + // FIXME: look at set/step of dpu output not setExecute io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) // State transition @@ -410,15 +431,6 @@ class TensorCoreDecoupled( } } } - - // Writeback queues - // ---------------- - // These queues hold the metadata necessary for register - // writeback. - - // val queueDepth = 2 - // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) - // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) } // synthesizable unit tests diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index a82bed7..515b1bf 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -53,7 +53,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar io.out.bits.data := ieee(box(dpu.io.out.bits.data, S)) } -// Copied from chisel3.util.Pipe. +// An implementation of chisel3.util.Pipe that supports stalls. class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module { /** A non-ambiguous name of this `StallingPipe` for use in generated Verilog * names. Includes the latency cycle count in the name as well as the From 2741af0b2b36026cfe57ca227eb469d6643d4c12 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 15:43:44 -0700 Subject: [PATCH 13/27] tensor: Keep set/step in the tag writeback queue --- .../radiance/core/TensorCoreDecoupled.scala | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 92a6596..3d00c35 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -267,6 +267,7 @@ class TensorCoreDecoupled( val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid val operandA = fullAQueue.io.deq.bits.data + val operandATag = fullAQueue.io.deq.bits.tag val operandB = respQueueB.bits.data val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady @@ -314,8 +315,6 @@ class TensorCoreDecoupled( val operandADimensional = operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4).toSeq - println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits") - println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}") assert(operandADimensional.length == tilingParams.mc && operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") @@ -323,7 +322,6 @@ class TensorCoreDecoupled( val operandBDimensional = operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4).toSeq - println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}") val ncSubstep = tilingParams.nc / 2 assert(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") @@ -369,18 +367,20 @@ class TensorCoreDecoupled( // These queues hold metadata needed for writeback in sync with the DPU. val queueDepth = 4 // needs to be at least the DPU latency - val rdQueue = Module(new Queue( - chiselTypeOf(io.writeback.bits.rd), queueDepth + val tagQueue = Module(new Queue( + chiselTypeOf(operandATag), queueDepth )) - rdQueue.io.enq.valid := dpuFire - rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute) - rdQueue.io.deq.ready := io.writeback.fire - assert(rdQueue.io.enq.ready === true.B, - "rd queue full, throttling DPU operation") - assert(!dpuValid || rdQueue.io.deq.valid, - "rd queue and DPU went out of sync") + tagQueue.io.enq.valid := dpuFire + // A and B should have the same tags + tagQueue.io.enq.bits := operandATag + // @cleanup: awkward + tagQueue.io.enq.bits.substep := substepCompute + tagQueue.io.deq.ready := io.writeback.fire + assert(tagQueue.io.enq.ready === true.B, + "tag queue full, DPU operation might be throttled") + assert(!dpuValid || tagQueue.io.deq.valid, + "tag queue and DPU went out of sync") - // TODO: decouple wid from frontend // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) // note rd is independent to sets @@ -390,11 +390,14 @@ class TensorCoreDecoupled( (step << 1/*2 substeps*/) + substep } + val setWriteback = tagQueue.io.deq.bits.set + val stepWriteback = tagQueue.io.deq.bits.step + val substepWriteback = tagQueue.io.deq.bits.substep io.writeback.valid := dpuValid + // TODO: decouple wid from frontend io.writeback.bits.wid := warpReg - io.writeback.bits.rd := rdQueue.io.deq.bits - // FIXME: look at set/step of dpu output not setExecute - io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) + io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback) + io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) // State transition // ---------------- @@ -500,6 +503,10 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) tensor.io.writeback.ready := true.B io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last + when (io.finished) { + // might be too strong + assert(tensor.io.writeback.bits.rd === 31.U) + } } // a minimal Diplomacy graph with a tensor core and a TLRAM From a2519da58fe1397a7570656a4726f42693e8d845 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 17 Oct 2024 16:36:18 -0700 Subject: [PATCH 14/27] tensor: SMEM address generation --- .../radiance/core/TensorCoreDecoupled.scala | 51 +++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 3d00c35..f7c8547 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -159,6 +159,48 @@ class TensorCoreDecoupled( tag.step := stepAccess tag.substep := substepAccess + // @cleanup: generalize in terms of M/N/K-majorness? + def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt) + : (UInt/*A*/, UInt/*B*/) = { + // note that step iterates along N first, then M + val numComputeTilesM = tilingParams.m / tilingParams.mc + val numComputeTilesN = tilingParams.n / tilingParams.nc + val tileM = step % numComputeTilesM.U + val tileN = step / numComputeTilesM.U + val mcSubstep = tilingParams.mc / 2 + val ncSubstep = tilingParams.nc / 2 + + // note that both A and B are K-major to facilitate bank conflict-free SMEM + // accesses + // + // (row,col) coordinate of the compute tile + val tileRowA = tileM // M + val tileColA = set // K + val tileRowB = tileN // N + val tileColB = set // K + // (row,col) coordinate of the starting element of the compute tile + val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) + + (substep << log2Ceil(mcSubstep)) + val elemColA = tileColA << log2Ceil(tilingParams.kc) + val elemRowB = tileRowB << log2Ceil(tilingParams.nc) + (substep << log2Ceil(ncSubstep)) + val elemColB = tileColB << log2Ceil(tilingParams.kc) + val rowStrideA = wordSize * tilingParams.k + val rowStrideABits = log2Ceil(rowStrideA) + val rowStrideB = wordSize * tilingParams.k + val rowStrideBBits = log2Ceil(rowStrideB) + val wordStrideBits = log2Ceil(wordSize) + + val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits) + val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits) + + (baseA + tileOffsetA, baseB + tileOffsetB) + } + + // FIXME: bogus base address + val (addressA, addressB) = + addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), @@ -172,9 +214,7 @@ class TensorCoreDecoupled( sourceGen.io.gen := req.fire sourceGen.io.meta := tag req.valid := genReq - // FIXME: bogus address - // req.bits.address := (if (i == 0) 0.U else 0x100.U) // avoids bank conflict for A and B - req.bits.address := 0.U + req.bits.address := (if (i == 0) addressA else addressB) req.bits.source := sourceGen.io.id.bits sourceGen.io.reclaim.valid := resp.fire @@ -366,7 +406,7 @@ class TensorCoreDecoupled( // ---------------- // These queues hold metadata needed for writeback in sync with the DPU. - val queueDepth = 4 // needs to be at least the DPU latency + val queueDepth = 6 // needs to be at least the DPU latency val tagQueue = Module(new Queue( chiselTypeOf(operandATag), queueDepth )) @@ -397,7 +437,8 @@ class TensorCoreDecoupled( // TODO: decouple wid from frontend io.writeback.bits.wid := warpReg io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback) - io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) + io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) && + (substepWriteback === 1.U) // State transition // ---------------- From 64ea48ace3681e0a74a732fb4da006717e62b873 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 13:46:04 -0700 Subject: [PATCH 15/27] tensor: Consider data reuse for B memory request B is reused every 4 steps because of the k->i->j iteration order. --- .../radiance/core/TensorCoreDecoupled.scala | 111 ++++++++++-------- 1 file changed, 62 insertions(+), 49 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index f7c8547..897edb2 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -145,8 +145,6 @@ class TensorCoreDecoupled( // Memory traffic generation // ------------------------- // - val genReq = (state === TensorState.run) - class TensorMemTag extends Bundle { val set = UInt(setBits.W) val step = UInt(stepBits.W) @@ -159,16 +157,14 @@ class TensorCoreDecoupled( tag.step := stepAccess tag.substep := substepAccess + val numTilesM = tilingParams.m / tilingParams.mc + val numTilesN = tilingParams.n / tilingParams.nc // @cleanup: generalize in terms of M/N/K-majorness? def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt) : (UInt/*A*/, UInt/*B*/) = { // note that step iterates along N first, then M - val numComputeTilesM = tilingParams.m / tilingParams.mc - val numComputeTilesN = tilingParams.n / tilingParams.nc - val tileM = step % numComputeTilesM.U - val tileN = step / numComputeTilesM.U - val mcSubstep = tilingParams.mc / 2 - val ncSubstep = tilingParams.nc / 2 + val tileM = step % numTilesM.U + val tileN = step / numTilesM.U // note that both A and B are K-major to facilitate bank conflict-free SMEM // accesses @@ -180,11 +176,11 @@ class TensorCoreDecoupled( val tileColB = set // K // (row,col) coordinate of the starting element of the compute tile val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) + - (substep << log2Ceil(mcSubstep)) - val elemColA = tileColA << log2Ceil(tilingParams.kc) - val elemRowB = tileRowB << log2Ceil(tilingParams.nc) - (substep << log2Ceil(ncSubstep)) - val elemColB = tileColB << log2Ceil(tilingParams.kc) + (substep << log2Ceil(tilingParams.mc / 2)) + val elemColA = tileColA << log2Ceil(tilingParams.kc) + val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) + + (substep << log2Ceil(tilingParams.nc / 2)) + val elemColB = tileColB << log2Ceil(tilingParams.kc) val rowStrideA = wordSize * tilingParams.k val rowStrideABits = log2Ceil(rowStrideA) val rowStrideB = wordSize * tilingParams.k @@ -201,6 +197,13 @@ class TensorCoreDecoupled( val (addressA, addressB) = addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + val genReqA = (state === TensorState.run) + val numTilesMBits = log2Ceil(numTilesM) + // generate B request at every 4 steps. B achieves reuse through outer + // product so it doesn't require access at every step + val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U + val genReqB = (state === TensorState.run) && shouldFireB + val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), @@ -213,7 +216,7 @@ class TensorCoreDecoupled( sourceGen.io.gen := req.fire sourceGen.io.meta := tag - req.valid := genReq + req.valid := (if (i == 0) genReqA else genReqB) req.bits.address := (if (i == 0) addressA else addressB) req.bits.source := sourceGen.io.id.bits @@ -228,23 +231,27 @@ class TensorCoreDecoupled( } } - // only advance to the next step if we fired mem requests for both A and B - // TODO: @perf: too strict? should be able to have A and B progress - // separately - val firedABReg = RegInit(VecInit(false.B, false.B)) - val firedABNow = VecInit((Seq(io.reqA, io.reqB) zip firedABReg).map { - case (req, fired) => { when (req.fire) { fired := true.B } } - req.fire - }) - val firedAB = (firedABNow.asUInt | firedABReg.asUInt) - val nextSubstepAccess = firedAB.andR + // only advance to the next step if we fired mem requests for both A and B. + // also consider that B doesn't have to be fired every time due to reuse. + // @perf: too strict? should be able to have A and B progress separately + val firedAReg = RegInit(false.B) + val firedBReg = RegInit(false.B) + when (io.reqA.fire) { firedAReg := true.B } + when (io.reqB.fire) { firedBReg := true.B } + val firedANow = io.reqA.fire + val firedBNow = io.reqB.fire + val firedA = firedAReg || firedANow + val firedB = firedBReg || firedBNow + val nextSubstepAccess = firedA && (!shouldFireB || firedB) val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) // clear out firedABReg every substep when (nextSubstepAccess) { - firedABReg := Seq(false.B, false.B) + firedAReg := false.B + firedBReg := false.B substepAccess := substepAccess + 1.U } require(substepAccess.widthOption.get == 1, "there should be only two substeps") + dontTouch(shouldFireB) // Execute stage // ------------- @@ -327,18 +334,26 @@ class TensorCoreDecoupled( respQueueA.ready := MuxCase(false.B, Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) - respQueueB.ready := dpuFire + // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when + // we fully iterated a column (M-dimension). + val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U + val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask + respQueueB.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) + dontTouch(shouldDequeueB) - // assert that the DPU is computing with operands of the same set/step + // Assert that the DPU is computing with operands of the same set/step. Note + // that the B resp will only have step values multiple of 4 due to reuse. // - // this assumes that memory responses come back in-order. this might be too - // strong an assumption depending on the backing memory + // This check assumes that memory responses come back in-order. Might be too + // strong of an assumption depending on the backing memory. def assertAligned = { + val stepMask = (1 << numTilesMBits).U when (dpuFire) { assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) && - (fullAQueue.io.deq.bits.tag.step === respQueueB.bits.tag.step), + ((fullAQueue.io.deq.bits.tag.step & stepMask) === + (respQueueB.bits.tag.step & stepMask)), "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } @@ -348,26 +363,26 @@ class TensorCoreDecoupled( // Dot-product unit // // 4x2 four-element DPUs summing up to 32 MACs in total - val dpus = Seq.fill(4)(Seq.fill(2)( + val ncSubstep = tilingParams.nc / 2 + val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)( Module(new TensorDotProductUnit(half = false)) )) // operandA is 4x4 in K-major val operandADimensional = operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4).toSeq - assert(operandADimensional.length == tilingParams.mc && - operandADimensional(0).length == tilingParams.kc, - "operand width doesn't agree with tiling parameter") - // operandB is 2x4, i.e. 4x2 in N-major + .grouped(4/*k-dim*/).toSeq + require(operandADimensional.length == tilingParams.mc && + operandADimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") + // operandB is 2x4 in K-major val operandBDimensional = operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4).toSeq - val ncSubstep = tilingParams.nc / 2 - assert(tilingParams.mc * ncSubstep == numLanes, - "substep tile size doesn't match writeback throughput") - assert(operandBDimensional.length == ncSubstep && - operandBDimensional(0).length == tilingParams.kc, - "operand width doesn't agree with tiling parameter") + .grouped(4/*k-dim*/).toSeq + require(tilingParams.mc * ncSubstep == numLanes, + "substep tile size doesn't match writeback throughput") + require(operandBDimensional.length == ncSubstep && + operandBDimensional(0).length == tilingParams.kc, + "operand width doesn't agree with tiling parameter") for (m <- 0 until tilingParams.mc) { for (n <- 0 until ncSubstep) { @@ -406,10 +421,8 @@ class TensorCoreDecoupled( // ---------------- // These queues hold metadata needed for writeback in sync with the DPU. - val queueDepth = 6 // needs to be at least the DPU latency - val tagQueue = Module(new Queue( - chiselTypeOf(operandATag), queueDepth - )) + val queueDepth = 5 // needs to be at least the DPU latency + val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth)) tagQueue.io.enq.valid := dpuFire // A and B should have the same tags tagQueue.io.enq.bits := operandATag @@ -573,11 +586,11 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { val tensor = LazyModule(new TensorCoreDecoupledTL) val xbar = LazyModule(new TLXbar) val ramA = LazyModule(new TLRAM( - address = AddressSet(0x000, 0xfffeff), + address = AddressSet(0x000, 0xfffbff), beatBytes = 32 // @cleanup: hardcoded )) val ramB = LazyModule(new TLRAM( - address = AddressSet(0x100, 0xfffeff), + address = AddressSet(0x400, 0xfffbff), beatBytes = 32 // @cleanup: hardcoded )) From c2f39f74749df7fac8ba63d8900d6651eea72f71 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 16:21:43 -0700 Subject: [PATCH 16/27] tensor: Rename substepExecute --- .../radiance/core/TensorCoreDecoupled.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 897edb2..f7c6c63 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -269,13 +269,13 @@ class TensorCoreDecoupled( require(respQueueA.bits.data.widthOption.get == io.writeback.bits.data.widthOption.get, - "response data width does not match the writeback data width") + "response data width does not match the writeback data width") - val substepExecute = RegInit(0.U(1.W)) + val substepDeqA = RegInit(0.U(1.W)) when (respQueueA.fire) { - substepExecute := substepExecute + 1.U + substepDeqA := substepDeqA + 1.U } - dontTouch(substepExecute) + dontTouch(substepDeqA) // Do pipelining for the A operand so that we obtain the full 4x4 A tile // ready for compute. The pipeline is two-stage: @@ -292,7 +292,7 @@ class TensorCoreDecoupled( val halfAQueue = Module(new Queue( chiselTypeOf(respQueueA.bits), entries = 1, pipe = true )) - halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U) + halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U) halfAQueue.io.enq.bits := respQueueA.bits // substep == 0 data goes to the LSB @@ -305,9 +305,9 @@ class TensorCoreDecoupled( new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true )) // hold first half A data for the first substep - halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) && + halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) && fullAQueue.io.enq.ready - fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) && + fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) && halfAQueue.io.deq.valid fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag @@ -332,8 +332,8 @@ class TensorCoreDecoupled( // respQueueA output arbitrates to either halfAQueue or fullAQueue depending // on the substep respQueueA.ready := MuxCase(false.B, - Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready, - (substepExecute === 1.U) -> fullAQueue.io.enq.ready)) + Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready, + (substepDeqA === 1.U) -> fullAQueue.io.enq.ready)) // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U From 91d9897c277a1f0ab4e678bdd91f21eef7ac380d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 17:17:41 -0700 Subject: [PATCH 17/27] tensor: Write FillBuffer for tile buffering --- .../radiance/core/TensorCoreDecoupled.scala | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index f7c6c63..e70e59f 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -5,6 +5,7 @@ package radiance.core import chisel3._ import chisel3.util._ +import chisel3.experimental.requireIsChiselType import org.chipsalliance.cde.config.Parameters import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink._ @@ -312,10 +313,17 @@ class TensorCoreDecoupled( fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag - val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid + val fillBufB = Module(new FillBuffer( + chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ + )) + fillBufB.io.enq.valid := respQueueB.valid + fillBufB.io.enq.bits := respQueueB.bits.data + respQueueB.ready := fillBufB.io.enq.ready + + val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid val operandA = fullAQueue.io.deq.bits.data val operandATag = fullAQueue.io.deq.bits.tag - val operandB = respQueueB.bits.data + val operandB = fillBufB.io.deq.bits val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val setCompute = fullAQueue.io.deq.bits.tag.set @@ -338,7 +346,7 @@ class TensorCoreDecoupled( // we fully iterated a column (M-dimension). val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask - respQueueB.ready := dpuFire && shouldDequeueB + fillBufB.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) dontTouch(shouldDequeueB) @@ -375,8 +383,11 @@ class TensorCoreDecoupled( operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") // operandB is 2x4 in K-major + // val operandBDimensional = + // operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + // .grouped(4/*k-dim*/).toSeq val operandBDimensional = - operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq require(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") @@ -490,6 +501,37 @@ class TensorCoreDecoupled( } } +// A buffer that collects multiple entries of input data and exposes the +// coalesced data as output. Effectively acts as a width-widening +// chisel.util.Pipe. +class FillBuffer[T <: Data]( + gen: T, + entries: Int +) extends Module { + require(entries > 0, "FillBuffer must have a positive number of entries") + requireIsChiselType(gen) + + val io = IO(new Bundle { + val enq = Flipped(Decoupled(gen)) + val deq = Decoupled(Vec(entries, gen)) + }) + + val data = Reg(Vec(entries, gen)) + val ptr = Counter(entries + 1) + val full = (ptr.value === entries.U) + io.enq.ready := !full + when (io.enq.fire) { + data(ptr.value) := io.enq.bits + ptr.inc() + } + io.deq.valid := full + (io.deq.bits zip data).foreach { case (io, d) => io := d } + when (io.deq.fire) { + assert(ptr.value === entries.U, "FillBuffer fired before buffer was full") + ptr.reset() + } +} + // synthesizable unit tests // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy From 7fab6f89ad3e99de20e4aa5972be745c720b1e70 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 17:33:55 -0700 Subject: [PATCH 18/27] tensor: Properly route FillBuffer to DPU --- .../radiance/core/TensorCoreDecoupled.scala | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index e70e59f..206250e 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -313,17 +313,24 @@ class TensorCoreDecoupled( fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag - val fillBufB = Module(new FillBuffer( + // serialize every two B responses into one full 4x4 B tile + // FIXME: do the same for A + val fullB = Module(new FillBuffer( chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ )) - fillBufB.io.enq.valid := respQueueB.valid - fillBufB.io.enq.bits := respQueueB.bits.data - respQueueB.ready := fillBufB.io.enq.ready + fullB.io.enq.valid := respQueueB.valid + fullB.io.enq.bits := respQueueB.bits.data + respQueueB.ready := fullB.io.enq.ready + val fullBTag = Module(new Queue( + new TensorMemTag, entries = 1, pipe = true + )) + fullBTag.io.enq.valid := respQueueB.valid + fullBTag.io.enq.bits := respQueueB.bits.tag - val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid + val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid val operandA = fullAQueue.io.deq.bits.data val operandATag = fullAQueue.io.deq.bits.tag - val operandB = fillBufB.io.deq.bits + val operandB = fullB.io.deq.bits val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val setCompute = fullAQueue.io.deq.bits.tag.set @@ -333,10 +340,6 @@ class TensorCoreDecoupled( substepCompute := substepCompute + 1.U } - // hold full A until two-cycle compute is done - fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) - val nextStepExecute = dpuFire && (substepCompute === 1.U) - // respQueueA output arbitrates to either halfAQueue or fullAQueue depending // on the substep respQueueA.ready := MuxCase(false.B, @@ -345,12 +348,19 @@ class TensorCoreDecoupled( // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U - val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask - fillBufB.io.deq.ready := dpuFire && shouldDequeueB + val shouldDequeueB = + ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) && + (substepCompute === 1.U) + fullB.io.deq.ready := dpuFire && shouldDequeueB + fullBTag.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) dontTouch(shouldDequeueB) + // hold full A until two-cycle compute is done + fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) + val nextStepExecute = dpuFire && (substepCompute === 1.U) + // Assert that the DPU is computing with operands of the same set/step. Note // that the B resp will only have step values multiple of 4 due to reuse. // @@ -359,9 +369,9 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullAQueue.io.deq.bits.tag.set === respQueueB.bits.tag.set) && + assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && ((fullAQueue.io.deq.bits.tag.step & stepMask) === - (respQueueB.bits.tag.step & stepMask)), + (fullBTag.io.deq.bits.step & stepMask)), "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } @@ -387,7 +397,7 @@ class TensorCoreDecoupled( // operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq // .grouped(4/*k-dim*/).toSeq val operandBDimensional = - operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq require(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") From c4b5a11fdefbbfbe73b765bb1feece25d2a1d3f1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 19:54:20 -0700 Subject: [PATCH 19/27] tensor: Replace staging logic for A with FillBuffer --- .../radiance/core/TensorCoreDecoupled.scala | 77 ++++++++----------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 206250e..deb4dc1 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -272,46 +272,41 @@ class TensorCoreDecoupled( io.writeback.bits.data.widthOption.get, "response data width does not match the writeback data width") + // FIXME: unnecessary val substepDeqA = RegInit(0.U(1.W)) when (respQueueA.fire) { substepDeqA := substepDeqA + 1.U } dontTouch(substepDeqA) - // Do pipelining for the A operand so that we obtain the full 4x4 A tile - // ready for compute. The pipeline is two-stage: - // - stage one (halfAQueue) for assembling the full A tile from half-tiles - // coming from the resp queue, and - // - stage two (fullAQueue) for holding the full A tile until it gets - // matched with two 4x2 B tiles, and compute is complete. - // - // Note that the half-tile assembly is unnecessary for B since the B tile is - // only 4x2. - // Also send the set/step tag along the pipe for alignment check. + // Stage the operands in a pipeline so that we obtain the full 4x4 tiles + // ready for compute. Also send the set/step tag along the pipe for + // alignment check. - // note combinationally coupled ready with `pipe` - val halfAQueue = Module(new Queue( - chiselTypeOf(respQueueA.bits), entries = 1, pipe = true + val fullA = Module(new FillBuffer( + chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ )) - halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U) - halfAQueue.io.enq.bits := respQueueA.bits + fullA.io.enq.valid := respQueueA.valid + fullA.io.enq.bits := respQueueA.bits.data + respQueueA.ready := fullA.io.enq.ready + // `pipe` combinationally couples enq-deq ready + val fullATag = Module(new Queue( + new TensorMemTag, entries = 1, pipe = true + )) + fullATag.io.enq.valid := respQueueA.valid + fullATag.io.enq.bits := respQueueA.bits.tag - // substep == 0 data goes to the LSB - val fullAEnqData = Cat(respQueueA.bits.data, halfAQueue.io.deq.bits.data) - require(fullAEnqData.widthOption.get == dataWidth * 2, - "assumes 2-cycle read for a full compute tile of A") - // only use the lower halfA's tag. substep will be incorrect. - val fullAEnqTag = halfAQueue.io.deq.bits.tag - val fullAQueue = Module(new Queue( + // stage the full A tile once more so that FillBuffer can be filled up in the + // background while the tile is being used for compute. This does come with + // capacity overhead. + val fullABuf = Module(new Queue( new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true )) - // hold first half A data for the first substep - halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) && - fullAQueue.io.enq.ready - fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) && - halfAQueue.io.deq.valid - fullAQueue.io.enq.bits.data := fullAEnqData - fullAQueue.io.enq.bits.tag := fullAEnqTag + fullABuf.io.enq.valid := fullA.io.deq.valid + fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt + fullABuf.io.enq.bits.tag := fullATag.io.deq.bits + fullA.io.deq.ready := fullABuf.io.enq.ready + fullATag.io.deq.ready := fullABuf.io.enq.ready // serialize every two B responses into one full 4x4 B tile // FIXME: do the same for A @@ -327,29 +322,24 @@ class TensorCoreDecoupled( fullBTag.io.enq.valid := respQueueB.valid fullBTag.io.enq.bits := respQueueB.bits.tag - val operandsValid = fullAQueue.io.deq.valid && fullB.io.deq.valid - val operandA = fullAQueue.io.deq.bits.data - val operandATag = fullAQueue.io.deq.bits.tag + val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid + val operandA = fullABuf.io.deq.bits.data + val operandATag = fullABuf.io.deq.bits.tag val operandB = fullB.io.deq.bits val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady - val setCompute = fullAQueue.io.deq.bits.tag.set - val stepCompute = fullAQueue.io.deq.bits.tag.step + val setCompute = fullABuf.io.deq.bits.tag.set + val stepCompute = fullABuf.io.deq.bits.tag.step val substepCompute = RegInit(0.U(1.W)) when (dpuFire) { substepCompute := substepCompute + 1.U } - // respQueueA output arbitrates to either halfAQueue or fullAQueue depending - // on the substep - respQueueA.ready := MuxCase(false.B, - Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready, - (substepDeqA === 1.U) -> fullAQueue.io.enq.ready)) // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = - ((stepExecute & shouldDequeueBMask) === shouldDequeueBMask) && + ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) && (substepCompute === 1.U) fullB.io.deq.ready := dpuFire && shouldDequeueB fullBTag.io.deq.ready := dpuFire && shouldDequeueB @@ -358,7 +348,8 @@ class TensorCoreDecoupled( dontTouch(shouldDequeueB) // hold full A until two-cycle compute is done - fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U) + fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) + // FIXME: this should be nextStepCompute val nextStepExecute = dpuFire && (substepCompute === 1.U) // Assert that the DPU is computing with operands of the same set/step. Note @@ -369,8 +360,8 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullAQueue.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && - ((fullAQueue.io.deq.bits.tag.step & stepMask) === + assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && + ((fullABuf.io.deq.bits.tag.step & stepMask) === (fullBTag.io.deq.bits.step & stepMask)), "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") From 93c9bcc32f5b516f3bd51990ff60e22e0348f409 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 20:12:15 -0700 Subject: [PATCH 20/27] tensor: Stage B as well for full throughput --- .../radiance/core/TensorCoreDecoupled.scala | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index deb4dc1..90cb785 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -300,10 +300,13 @@ class TensorCoreDecoupled( // background while the tile is being used for compute. This does come with // capacity overhead. val fullABuf = Module(new Queue( - new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true + new Bundle { + val data = chiselTypeOf(fullA.io.deq.bits) + val tag = new TensorMemTag + }, entries = 1, pipe = true )) fullABuf.io.enq.valid := fullA.io.deq.valid - fullABuf.io.enq.bits.data := fullA.io.deq.bits.asUInt + fullABuf.io.enq.bits.data := fullA.io.deq.bits fullABuf.io.enq.bits.tag := fullATag.io.deq.bits fullA.io.deq.ready := fullABuf.io.enq.ready fullATag.io.deq.ready := fullABuf.io.enq.ready @@ -322,10 +325,22 @@ class TensorCoreDecoupled( fullBTag.io.enq.valid := respQueueB.valid fullBTag.io.enq.bits := respQueueB.bits.tag - val operandsValid = fullABuf.io.deq.valid && fullB.io.deq.valid + val fullBBuf = Module(new Queue( + new Bundle { + val data = chiselTypeOf(fullB.io.deq.bits) + val tag = new TensorMemTag + }, entries = 1, pipe = true + )) + fullBBuf.io.enq.valid := fullB.io.deq.valid + fullBBuf.io.enq.bits.data := fullB.io.deq.bits + fullBBuf.io.enq.bits.tag := fullBTag.io.deq.bits + fullB.io.deq.ready := fullBBuf.io.enq.ready + fullBTag.io.deq.ready := fullBBuf.io.enq.ready + + val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid val operandA = fullABuf.io.deq.bits.data val operandATag = fullABuf.io.deq.bits.tag - val operandB = fullB.io.deq.bits + val operandB = fullBBuf.io.deq.bits.data val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val setCompute = fullABuf.io.deq.bits.tag.set @@ -335,20 +350,19 @@ class TensorCoreDecoupled( substepCompute := substepCompute + 1.U } + // hold full A until two-cycle compute is done + fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when // we fully iterated a column (M-dimension). val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) && (substepCompute === 1.U) - fullB.io.deq.ready := dpuFire && shouldDequeueB - fullBTag.io.deq.ready := dpuFire && shouldDequeueB + fullBBuf.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) dontTouch(shouldDequeueB) - // hold full A until two-cycle compute is done - fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) // FIXME: this should be nextStepCompute val nextStepExecute = dpuFire && (substepCompute === 1.U) @@ -360,9 +374,9 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullABuf.io.deq.bits.tag.set === fullBTag.io.deq.bits.set) && + assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) && ((fullABuf.io.deq.bits.tag.step & stepMask) === - (fullBTag.io.deq.bits.step & stepMask)), + (fullBBuf.io.deq.bits.tag.step & stepMask)), "A and B operands are pointing to different set/steps. " ++ "This might indicate memory response coming back out-of-order.") } @@ -378,15 +392,12 @@ class TensorCoreDecoupled( )) // operandA is 4x4 in K-major val operandADimensional = - operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq require(operandADimensional.length == tilingParams.mc && operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") - // operandB is 2x4 in K-major - // val operandBDimensional = - // operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - // .grouped(4/*k-dim*/).toSeq + // select 2x4 subtile out of operandB that is 4x4 in K-major val operandBDimensional = operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq From c0292dd0aa97a8cec3d034b20e2a167f78e54af8 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 21:51:34 -0700 Subject: [PATCH 21/27] tensor: Enlarge operand buffer for A for better SMEM reuse --- .../radiance/core/TensorCoreDecoupled.scala | 159 +++++++++++------- 1 file changed, 100 insertions(+), 59 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 90cb785..fa3f6e9 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -146,18 +146,6 @@ class TensorCoreDecoupled( // Memory traffic generation // ------------------------- // - class TensorMemTag extends Bundle { - val set = UInt(setBits.W) - val step = UInt(stepBits.W) - val substep = UInt(1.W) - } - // use concatenation of set/step as the memory request source. This will get - // translated to the actual TL sourcewidth in sourceGen. - val tag = Wire(new TensorMemTag) - tag.set := setAccess - tag.step := stepAccess - tag.substep := substepAccess - val numTilesM = tilingParams.m / tilingParams.mc val numTilesN = tilingParams.n / tilingParams.nc // @cleanup: generalize in terms of M/N/K-majorness? @@ -198,12 +186,41 @@ class TensorCoreDecoupled( val (addressA, addressB) = addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + // 'index' is the index of a memory request among the sequence of requests + // needed to read a full M-column of A or N-row of B. Its range is [0,m/2) + // or [0,n/2), where 2 is the stride can be read in a single request size. + require(tilingParams.m == tilingParams.n, + "currently only supports square SMEM tile") + val numIndices = tilingParams.m / 2 + val indexBits = log2Ceil(numIndices) + val lastIndex = (1 << indexBits) - 1 + + class TensorMemTag extends Bundle { + val set = UInt(setBits.W) + val index = UInt(indexBits.W) + } + + val tagInit = Wire(new TensorMemTag) + tagInit.set := 0.U + tagInit.index := 0.U + val tagA = RegInit(tagInit) + val tagB = RegInit(tagInit) + + when (io.reqA.fire) { + when (tagA.index === lastIndex.U) { + tagA.set := tagA.set + 1.U + } + tagA.index := tagA.index + 1.U + } + when (io.reqB.fire) { + when (tagB.index === lastIndex.U) { + tagB.set := tagB.set + 1.U + } + tagB.index := tagB.index + 1.U + } + val genReqA = (state === TensorState.run) - val numTilesMBits = log2Ceil(numTilesM) - // generate B request at every 4 steps. B achieves reuse through outer - // product so it doesn't require access at every step - val shouldFireB = (stepAccess & ((1 << numTilesMBits) - 1).U) === 0.U - val genReqB = (state === TensorState.run) && shouldFireB + val genReqB = (state === TensorState.run) val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) @@ -212,11 +229,11 @@ class TensorCoreDecoupled( case ((req, (resp, respTagged)), i) => { val sourceGen = Module(new SourceGenerator( log2Ceil(numSourceIds), - metadata = Some(tag) + metadata = Some(new TensorMemTag) )) sourceGen.io.gen := req.fire - sourceGen.io.meta := tag + sourceGen.io.meta := (if (i == 0) tagA else tagB) req.valid := (if (i == 0) genReqA else genReqB) req.bits.address := (if (i == 0) addressA else addressB) req.bits.source := sourceGen.io.id.bits @@ -243,7 +260,7 @@ class TensorCoreDecoupled( val firedBNow = io.reqB.fire val firedA = firedAReg || firedANow val firedB = firedBReg || firedBNow - val nextSubstepAccess = firedA && (!shouldFireB || firedB) + val nextSubstepAccess = firedA && firedB val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) // clear out firedABReg every substep when (nextSubstepAccess) { @@ -252,17 +269,12 @@ class TensorCoreDecoupled( substepAccess := substepAccess + 1.U } require(substepAccess.widthOption.get == 1, "there should be only two substeps") - dontTouch(shouldFireB) // Execute stage // ------------- // Backend of the decoupled access/execute pipeline. // // set and step being currently executed in the acc/ex backend - val setExecute = RegInit(0.U(setBits.W)) - val stepExecute = RegInit(0.U(stepBits.W)) - dontTouch(setExecute) - dontTouch(stepExecute) val respQueueDepth = 4 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) @@ -283,8 +295,10 @@ class TensorCoreDecoupled( // ready for compute. Also send the set/step tag along the pipe for // alignment check. + // @cleanup: dedup A and B below + val fullA = Module(new FillBuffer( - chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ + chiselTypeOf(respQueueB.bits.data), numIndices )) fullA.io.enq.valid := respQueueA.valid fullA.io.enq.bits := respQueueA.bits.data @@ -337,23 +351,48 @@ class TensorCoreDecoupled( fullB.io.deq.ready := fullBBuf.io.enq.ready fullBTag.io.deq.ready := fullBBuf.io.enq.ready - val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid - val operandA = fullABuf.io.deq.bits.data - val operandATag = fullABuf.io.deq.bits.tag - val operandB = fullBBuf.io.deq.bits.data val dpuReady = Wire(Bool()) + val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid val dpuFire = operandsValid && dpuReady - val setCompute = fullABuf.io.deq.bits.tag.set - val stepCompute = fullABuf.io.deq.bits.tag.step + + val setCompute = RegInit(0.U(setBits.W)) + val stepCompute = RegInit(0.U(stepBits.W)) val substepCompute = RegInit(0.U(1.W)) + val nextStepCompute = dpuFire && (substepCompute === 1.U) + dontTouch(setCompute) + dontTouch(stepCompute) + dontTouch(substepCompute) when (dpuFire) { substepCompute := substepCompute + 1.U } - // hold full A until two-cycle compute is done - fullABuf.io.deq.ready := dpuFire && (substepCompute === 1.U) - // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when - // we fully iterated a column (M-dimension). + // Operand selection + // + // select the correct 4x4 tile from A operand buffer + val numTilesMBits = log2Ceil(numTilesM) + def selectOperandA(buf: Vec[UInt]): UInt = { + require(buf.length == numIndices) + val stepM = stepCompute & ((1 << numTilesMBits) - 1).U + Cat(buf((stepM << 1) + 1.U), buf(stepM << 1)) + } + val operandA = selectOperandA(fullABuf.io.deq.bits.data) + val operandATag = fullABuf.io.deq.bits.tag + // select the correct 2x4 tile from B operand buffer + val operandB = fullBBuf.io.deq.bits.data(substepCompute) + val operandBTag = fullBBuf.io.deq.bits.tag + dontTouch(operandATag) + dontTouch(operandBTag) + + // Operand buffer dequeue logic + // + // hold A data until the entire set is done + val shouldDequeueAMask = ((1 << stepBits) - 1).U + val shouldDequeueA = + ((stepCompute & shouldDequeueAMask) === shouldDequeueAMask) && + (substepCompute === 1.U) + fullABuf.io.deq.ready := dpuFire && shouldDequeueA + // hold B tile at respQueueB for multiple steps for reuse, only dequeue when + // we fully iterated a column (M-dimension) val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) && @@ -361,11 +400,9 @@ class TensorCoreDecoupled( fullBBuf.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) + dontTouch(shouldDequeueA) dontTouch(shouldDequeueB) - // FIXME: this should be nextStepCompute - val nextStepExecute = dpuFire && (substepCompute === 1.U) - // Assert that the DPU is computing with operands of the same set/step. Note // that the B resp will only have step values multiple of 4 due to reuse. // @@ -374,11 +411,9 @@ class TensorCoreDecoupled( def assertAligned = { val stepMask = (1 << numTilesMBits).U when (dpuFire) { - assert((fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set) && - ((fullABuf.io.deq.bits.tag.step & stepMask) === - (fullBBuf.io.deq.bits.tag.step & stepMask)), - "A and B operands are pointing to different set/steps. " ++ - "This might indicate memory response coming back out-of-order.") + assert(fullABuf.io.deq.bits.tag.set === fullBBuf.io.deq.bits.tag.set, + "A and B operands are pointing to different sets. " ++ + "This might indicate memory response coming back out-of-order.") } } assertAligned @@ -386,23 +421,24 @@ class TensorCoreDecoupled( // Dot-product unit // // 4x2 four-element DPUs summing up to 32 MACs in total + // val ncSubstep = tilingParams.nc / 2 + require(tilingParams.mc * ncSubstep == numLanes, + "substep tile size doesn't match writeback throughput") val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)( Module(new TensorDotProductUnit(half = false)) )) - // operandA is 4x4 in K-major - val operandADimensional = - operandA.asUInt.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4/*k-dim*/).toSeq + + // reshape operands for easier routing to DPU + def reshapeByFourWords(x: UInt): Seq[Seq[UInt]] = { + x.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + .grouped(4/*k-dim*/).toSeq + } + val operandADimensional = reshapeByFourWords(operandA) require(operandADimensional.length == tilingParams.mc && operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") - // select 2x4 subtile out of operandB that is 4x4 in K-major - val operandBDimensional = - operandB(substepCompute).asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq - .grouped(4/*k-dim*/).toSeq - require(tilingParams.mc * ncSubstep == numLanes, - "substep tile size doesn't match writeback throughput") + val operandBDimensional = reshapeByFourWords(operandB) require(operandBDimensional.length == ncSubstep && operandBDimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") @@ -444,12 +480,17 @@ class TensorCoreDecoupled( // ---------------- // These queues hold metadata needed for writeback in sync with the DPU. + class TensorComputeTag extends Bundle { + val set = UInt(setBits.W) + val step = UInt(stepBits.W) + val substep = UInt(1.W) + } + val queueDepth = 5 // needs to be at least the DPU latency - val tagQueue = Module(new Queue(chiselTypeOf(operandATag), queueDepth)) + val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth)) tagQueue.io.enq.valid := dpuFire - // A and B should have the same tags - tagQueue.io.enq.bits := operandATag - // @cleanup: awkward + tagQueue.io.enq.bits.set := setCompute + tagQueue.io.enq.bits.step := stepCompute tagQueue.io.enq.bits.substep := substepCompute tagQueue.io.deq.ready := io.writeback.fire assert(tagQueue.io.enq.ready === true.B, @@ -490,7 +531,7 @@ class TensorCoreDecoupled( } } sequenceSetStep(setAccess, stepAccess, nextStepAccess) - sequenceSetStep(setExecute, stepExecute, nextStepExecute) + sequenceSetStep(setCompute, stepCompute, nextStepCompute) switch(state) { is(TensorState.idle) { From 0aadc6074ad32fccb13118f8e0915d8b76f2a267 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 22:42:41 -0700 Subject: [PATCH 22/27] tensor: Decouple A and B access states Get rid of set/stepAccess states and let A and B access progress independently. --- .../radiance/core/TensorCoreDecoupled.scala | 200 ++++++++---------- 1 file changed, 88 insertions(+), 112 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index fa3f6e9..ed241b5 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -82,15 +82,6 @@ class TensorCoreDecoupled( // This drives the overall pipeline of memory requests, dot-product unit // operations and regfile writeback. - object TensorState extends ChiselEnum { - val idle = Value(0.U) - val run = Value(1.U) - // All set/step sequencing is complete and the tensor core is holding the - // result data until downstream writeback is ready. - // FIXME: is this necessary if writeback is decoupled with queues? - val finish = Value(2.U) - } - val state = RegInit(TensorState.idle) val busy = RegInit(false.B) // Holds the warp id the core is currently working on. Note that we only // support one outstanding warp request @@ -107,22 +98,10 @@ class TensorCoreDecoupled( def setDone(set: UInt) = (set === lastSet.U) def stepDone(step: UInt) = (step === lastStep.U) - // set and step being currently accessed in the acc/ex frontend - val setAccess = RegInit(0.U(setBits.W)) - val stepAccess = RegInit(0.U(stepBits.W)) - // we need full 4x4 A tile to fire DPU, but since the memory width is 8 - // words, we need 2 cycles to read A. `substep` tells which cycle we're at. - val substepAccess = RegInit(0.U(1.W)) - dontTouch(setAccess) - dontTouch(stepAccess) - dontTouch(substepAccess) - - when(io.initiate.fire) { + when (io.initiate.fire) { val wid = io.initiate.bits.wid busy := true.B warpReg := wid - setAccess := 0.U - stepAccess := 0.U when(io.writeback.fire) { assert( io.writeback.bits.wid =/= wid, @@ -143,55 +122,51 @@ class TensorCoreDecoupled( // serialize every HGMMA request io.initiate.ready := !busy - // Memory traffic generation - // ------------------------- + // =========================================================================== + // Access stage + // =========================================================================== // - val numTilesM = tilingParams.m / tilingParams.mc - val numTilesN = tilingParams.n / tilingParams.nc - // @cleanup: generalize in terms of M/N/K-majorness? - def addressGen(baseA: UInt, baseB: UInt, set: UInt, step: UInt, substep: UInt) - : (UInt/*A*/, UInt/*B*/) = { - // note that step iterates along N first, then M - val tileM = step % numTilesM.U - val tileN = step / numTilesM.U + // Frontend of the decoupled access/execute pipeline. - // note that both A and B are K-major to facilitate bank conflict-free SMEM - // accesses - // - // (row,col) coordinate of the compute tile - val tileRowA = tileM // M - val tileColA = set // K - val tileRowB = tileN // N - val tileColB = set // K - // (row,col) coordinate of the starting element of the compute tile - val elemRowA = (tileRowA << log2Ceil(tilingParams.mc)) + - (substep << log2Ceil(tilingParams.mc / 2)) - val elemColA = tileColA << log2Ceil(tilingParams.kc) - val elemRowB = (tileRowB << log2Ceil(tilingParams.nc)) + - (substep << log2Ceil(tilingParams.nc / 2)) - val elemColB = tileColB << log2Ceil(tilingParams.kc) - val rowStrideA = wordSize * tilingParams.k - val rowStrideABits = log2Ceil(rowStrideA) - val rowStrideB = wordSize * tilingParams.k - val rowStrideBBits = log2Ceil(rowStrideB) - val wordStrideBits = log2Ceil(wordSize) - - val tileOffsetA = (elemRowA << rowStrideABits) + (elemColA << wordStrideBits) - val tileOffsetB = (elemRowB << rowStrideBBits) + (elemColB << wordStrideBits) - - (baseA + tileOffsetA, baseB + tileOffsetB) + // States + // + object AccessorState extends ChiselEnum { + val idle = Value(0.U) + val access = Value(1.U) + // All set/step sequencing is complete and the tensor core is holding the + // result data until downstream writeback is ready. + // FIXME: is this necessary if writeback is decoupled with queues? + val finish = Value(2.U) } + val state = RegInit(AccessorState.idle) + val allReqsDone = WireInit(false.B) + dontTouch(allReqsDone) - // FIXME: bogus base address - val (addressA, addressB) = - addressGen(0.U, 0.U, setAccess, stepAccess, substepAccess) + switch(state) { + is(AccessorState.idle) { + when(io.initiate.fire) { + state := AccessorState.access + } + } + is(AccessorState.access) { + when (allReqsDone) { + state := AccessorState.finish + } + } + is(AccessorState.finish) { + // FIXME: decouple writeback + when(io.writeback.fire) { + state := AccessorState.idle + } + } + } // 'index' is the index of a memory request among the sequence of requests // needed to read a full M-column of A or N-row of B. Its range is [0,m/2) // or [0,n/2), where 2 is the stride can be read in a single request size. require(tilingParams.m == tilingParams.n, "currently only supports square SMEM tile") - val numIndices = tilingParams.m / 2 + val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/ val indexBits = log2Ceil(numIndices) val lastIndex = (1 << indexBits) - 1 @@ -219,9 +194,51 @@ class TensorCoreDecoupled( tagB.index := tagB.index + 1.U } - val genReqA = (state === TensorState.run) - val genReqB = (state === TensorState.run) + // Address generation + // + def addressGen(base: UInt, set: UInt, index: UInt): UInt = { + // note that both A and B are K-major to facilitate bank conflict-free SMEM + // accesses, so that below code applies to both. + // + // (row,col) coordinate of the compute tile + val tileRow = index + val tileCol = set + // (row,col) coordinate of the starting element of the compute tile + val elemRow = index << 1 + val elemCol = tileCol << log2Ceil(tilingParams.kc) + val rowStride = tilingParams.k * wordSize + val rowStrideBits = log2Ceil(rowStride) + val wordStrideBits = log2Ceil(wordSize) + val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits) + base + tileOffset + } + + // FIXME: bogus base address + val addressA = addressGen(0.U, tagA.set, tagA.index) + val addressB = addressGen(0.U, tagB.set, tagB.index) + + val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U) + val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U) + val doneReqA = RegInit(false.B) + val doneReqB = RegInit(false.B) + when (lastReqA && io.reqA.fire) { doneReqA := true.B } + when (lastReqB && io.reqB.fire) { doneReqB := true.B } + val genReqA = (state === AccessorState.access) && !doneReqA + val genReqB = (state === AccessorState.access) && !doneReqA + when (state === AccessorState.finish) { + doneReqA := false.B + doneReqB := false.B + tagA.set := 0.U + tagA.index := 0.U + tagB.set := 0.U + tagB.index := 0.U + } + + allReqsDone := doneReqA && doneReqB + + // Request generation + // val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth))) Seq((io.reqA, (io.respA, respATagged)), @@ -249,34 +266,13 @@ class TensorCoreDecoupled( } } - // only advance to the next step if we fired mem requests for both A and B. - // also consider that B doesn't have to be fired every time due to reuse. - // @perf: too strict? should be able to have A and B progress separately - val firedAReg = RegInit(false.B) - val firedBReg = RegInit(false.B) - when (io.reqA.fire) { firedAReg := true.B } - when (io.reqB.fire) { firedBReg := true.B } - val firedANow = io.reqA.fire - val firedBNow = io.reqB.fire - val firedA = firedAReg || firedANow - val firedB = firedBReg || firedBNow - val nextSubstepAccess = firedA && firedB - val nextStepAccess = nextSubstepAccess && (substepAccess === 1.U) - // clear out firedABReg every substep - when (nextSubstepAccess) { - firedAReg := false.B - firedBReg := false.B - substepAccess := substepAccess + 1.U - } - require(substepAccess.widthOption.get == 1, "there should be only two substeps") - + // =========================================================================== // Execute stage - // ------------- + // =========================================================================== + // // Backend of the decoupled access/execute pipeline. // - // set and step being currently executed in the acc/ex backend - - val respQueueDepth = 4 // FIXME: parameterize + val respQueueDepth = 8 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) @@ -369,6 +365,7 @@ class TensorCoreDecoupled( // Operand selection // // select the correct 4x4 tile from A operand buffer + val numTilesM = tilingParams.m / tilingParams.mc val numTilesMBits = log2Ceil(numTilesM) def selectOperandA(buf: Vec[UInt]): UInt = { require(buf.length == numIndices) @@ -383,7 +380,7 @@ class TensorCoreDecoupled( dontTouch(operandATag) dontTouch(operandBTag) - // Operand buffer dequeue logic + // Operand buffer logic // // hold A data until the entire set is done val shouldDequeueAMask = ((1 << stepBits) - 1).U @@ -476,8 +473,8 @@ class TensorCoreDecoupled( } io.writeback.bits.data := flattenedDPUOut - // Writeback queues - // ---------------- + // Writeback logic + // // These queues hold metadata needed for writeback in sync with the DPU. class TensorComputeTag extends Bundle { @@ -530,28 +527,7 @@ class TensorCoreDecoupled( } } } - sequenceSetStep(setAccess, stepAccess, nextStepAccess) sequenceSetStep(setCompute, stepCompute, nextStepCompute) - - switch(state) { - is(TensorState.idle) { - when(io.initiate.fire) { - state := TensorState.run - } - } - is(TensorState.run) { - when (setDone(setAccess) && stepDone(stepAccess) && nextStepAccess) { - when (state === TensorState.run) { - state := TensorState.finish - } - } - } - is(TensorState.finish) { - when(io.writeback.fire) { - state := TensorState.idle - } - } - } } // A buffer that collects multiple entries of input data and exposes the From e946403d7863abaec61b4baa92da467617d3fe66 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 22:54:48 -0700 Subject: [PATCH 23/27] tensor: Fix typo, reduce resp queue depth --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index ed241b5..b899ce9 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -225,7 +225,7 @@ class TensorCoreDecoupled( when (lastReqA && io.reqA.fire) { doneReqA := true.B } when (lastReqB && io.reqB.fire) { doneReqB := true.B } val genReqA = (state === AccessorState.access) && !doneReqA - val genReqB = (state === AccessorState.access) && !doneReqA + val genReqB = (state === AccessorState.access) && !doneReqB when (state === AccessorState.finish) { doneReqA := false.B doneReqB := false.B @@ -272,7 +272,7 @@ class TensorCoreDecoupled( // // Backend of the decoupled access/execute pipeline. // - val respQueueDepth = 8 // FIXME: parameterize + val respQueueDepth = 2 // FIXME: parameterize val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) From b3c328b1be7bf924fdddc285fcf5181d8f55c6cf Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 18 Oct 2024 23:11:19 -0700 Subject: [PATCH 24/27] tensor: Assert minimum response queue depth with doc --- src/main/scala/radiance/core/TensorCoreDecoupled.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index b899ce9..cd3bfa4 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -272,7 +272,13 @@ class TensorCoreDecoupled( // // Backend of the decoupled access/execute pipeline. // - val respQueueDepth = 2 // FIXME: parameterize + val respQueueDepth = 4 // FIXME: parameterize + require(respQueueDepth >= 4, + "respQueueDepth must be at least 4. This is because the B operand buffer " ++ + "is shallower than A's, so the B response queue has to be deep enough to " ++ + "hold younger requests until A operand buffer becomes valid and the first DPU " ++ + "fire can happen. FIXME: make operand buffer report per-subtile valid so " ++ + "the first compute can happen earlier.") val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) @@ -547,6 +553,7 @@ class FillBuffer[T <: Data]( val data = Reg(Vec(entries, gen)) val ptr = Counter(entries + 1) + dontTouch(ptr.value) val full = (ptr.value === entries.U) io.enq.ready := !full when (io.enq.fire) { From a98cb32343810994737e60446c7b0c5d975a6f37 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 21 Oct 2024 21:56:36 -0700 Subject: [PATCH 25/27] tensor: Inject stalls to A ram for fuzzing --- .../radiance/core/TensorCoreDecoupled.scala | 26 +++++++++++++++++-- .../scala/radiance/memory/Coalescing.scala | 3 ++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index cd3bfa4..c53ab81 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -216,7 +216,7 @@ class TensorCoreDecoupled( // FIXME: bogus base address val addressA = addressGen(0.U, tagA.set, tagA.index) - val addressB = addressGen(0.U, tagB.set, tagB.index) + val addressB = addressGen(0x400.U, tagB.set, tagB.index) val lastReqA = (tagA.set === lastSet.U) && (tagA.index === lastIndex.U) val lastReqB = (tagB.set === lastSet.U) && (tagB.index === lastIndex.U) @@ -672,14 +672,36 @@ class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule { beatBytes = 32 // @cleanup: hardcoded )) + val stutter = new TLIdentityNode xbar.node :=* tensor.node - ramA.node := xbar.node + ramA.node := stutter := xbar.node ramB.node := xbar.node + val fuzz = true + lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { tensor.module.io.start := io.start io.finished := tensor.module.io.finished + + val (tlIn, _) = stutter.in(0) + val (tlOut, _) = stutter.out(0) + require(stutter.in.length == 1) + require(stutter.out.length == 1) + + // inject stalls for fuzzing + val incr = Wire(Bool()) + val (count, _) = Counter(incr, 0x1000) + def cond(x: UInt) = (x & ((1 << 3) - 1).U) =/= 0.U + val stall = if (fuzz) cond(count) else false.B + + tlOut.a <> tlIn.a + tlIn.d <> tlOut.d + incr := tlIn.a.fire || stall + when (stall) { + tlIn.a.ready := false.B + tlOut.a.valid := false.B + } } } diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index cac5e95..a21daee 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -372,7 +372,8 @@ class SourceGenerator[T <: Data]( outstanding := outstanding + 1.U } }.elsewhen(io.reclaim.valid) { - assert(outstanding > 0.U) + assert(outstanding > 0.U, + "Over-reclaim. Did some responses get dropped?") outstanding := outstanding - 1.U } dontTouch(outstanding) From 408888ae8f0f05364412ddba8246d9adf7502f87 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 21 Oct 2024 22:38:29 -0700 Subject: [PATCH 26/27] tensor: addPath()s for hopper generated chisel FIXME: SourceGenerator has a name-clash. --- src/main/scala/radiance/tile/VortexCore.scala | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index d6561e3..26c6989 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -128,7 +128,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) "NUM_THREADS" -> tile.numLsuLanes ) ) - with HasBlackBoxResource { + with HasBlackBoxResource with HasBlackBoxPath { // addResource("/vsrc/vortex/hw/unit_tests/generic_queue/testbench.v") // addResource("/vsrc/vortex/hw/unit_tests/VX_divide_tb.v") // addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0_rtl.v") @@ -398,6 +398,34 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv") // addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh") + def addHopperTensorCore = { + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRawFN.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRecFN.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/DotProductPipe.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer_1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/metadataTable_4x5.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/MulFullRawFN.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/occupancyTable_4x1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon_1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorMemTag.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue4_TensorMemRespWithTag.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue5_TensorComputeTag.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_4x261.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_5x7.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is26_oe8_os24.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is47_oe8_os24.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundRawFNToRecFN_e8_s24.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SimpleTimer.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SourceGenerator.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_1.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_2.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorCoreDecoupled.sv") + addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorDotProductUnit.sv") + } + addHopperTensorCore addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv") addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv") From 0fe2b3b07e5a5210cdb1cb5f92f68596b92ff6fb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 21 Oct 2024 22:39:28 -0700 Subject: [PATCH 27/27] Bump vortex --- src/main/resources/vsrc/vortex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 4dcbc31..0f06afc 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 4dcbc31a88915fff35ccefd00c6e753fa5ef135a +Subproject commit 0f06afc3ef7350e82c008f5f25395abf89879213