diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index f7c6c63..e70e59f 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -5,6 +5,7 @@ package radiance.core import chisel3._ import chisel3.util._ +import chisel3.experimental.requireIsChiselType import org.chipsalliance.cde.config.Parameters import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink._ @@ -312,10 +313,17 @@ class TensorCoreDecoupled( fullAQueue.io.enq.bits.data := fullAEnqData fullAQueue.io.enq.bits.tag := fullAEnqTag - val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid + val fillBufB = Module(new FillBuffer( + chiselTypeOf(respQueueB.bits.data), 2/*substeps*/ + )) + fillBufB.io.enq.valid := respQueueB.valid + fillBufB.io.enq.bits := respQueueB.bits.data + respQueueB.ready := fillBufB.io.enq.ready + + val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid val operandA = fullAQueue.io.deq.bits.data val operandATag = fullAQueue.io.deq.bits.tag - val operandB = respQueueB.bits.data + val operandB = fillBufB.io.deq.bits val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady val setCompute = fullAQueue.io.deq.bits.tag.set @@ -338,7 +346,7 @@ class TensorCoreDecoupled( // we fully iterated a column (M-dimension). val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask - respQueueB.ready := dpuFire && shouldDequeueB + fillBufB.io.deq.ready := dpuFire && shouldDequeueB dontTouch(respQueueA) dontTouch(respQueueB) dontTouch(shouldDequeueB) @@ -375,8 +383,11 @@ class TensorCoreDecoupled( operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") // operandB is 2x4 in K-major + // val operandBDimensional = + // operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + // .grouped(4/*k-dim*/).toSeq val operandBDimensional = - operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq + operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4/*k-dim*/).toSeq require(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") @@ -490,6 +501,37 @@ class TensorCoreDecoupled( } } +// A buffer that collects multiple entries of input data and exposes the +// coalesced data as output. Effectively acts as a width-widening +// chisel.util.Pipe. +class FillBuffer[T <: Data]( + gen: T, + entries: Int +) extends Module { + require(entries > 0, "FillBuffer must have a positive number of entries") + requireIsChiselType(gen) + + val io = IO(new Bundle { + val enq = Flipped(Decoupled(gen)) + val deq = Decoupled(Vec(entries, gen)) + }) + + val data = Reg(Vec(entries, gen)) + val ptr = Counter(entries + 1) + val full = (ptr.value === entries.U) + io.enq.ready := !full + when (io.enq.fire) { + data(ptr.value) := io.enq.bits + ptr.inc() + } + io.deq.valid := full + (io.deq.bits zip data).foreach { case (io, d) => io := d } + when (io.deq.fire) { + assert(ptr.value === entries.U, "FillBuffer fired before buffer was full") + ptr.reset() + } +} + // synthesizable unit tests // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy