tensor: Write FillBuffer for tile buffering

This commit is contained in:
Hansung Kim
2024-10-18 17:17:41 -07:00
parent c2f39f7474
commit 91d9897c27

View File

@@ -5,6 +5,7 @@ package radiance.core
import chisel3._
import chisel3.util._
import chisel3.experimental.requireIsChiselType
import org.chipsalliance.cde.config.Parameters
import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
import freechips.rocketchip.tilelink._
@@ -312,10 +313,17 @@ class TensorCoreDecoupled(
fullAQueue.io.enq.bits.data := fullAEnqData
fullAQueue.io.enq.bits.tag := fullAEnqTag
val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid
val fillBufB = Module(new FillBuffer(
chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
))
fillBufB.io.enq.valid := respQueueB.valid
fillBufB.io.enq.bits := respQueueB.bits.data
respQueueB.ready := fillBufB.io.enq.ready
val operandsValid = fullAQueue.io.deq.valid && fillBufB.io.deq.valid
val operandA = fullAQueue.io.deq.bits.data
val operandATag = fullAQueue.io.deq.bits.tag
val operandB = respQueueB.bits.data
val operandB = fillBufB.io.deq.bits
val dpuReady = Wire(Bool())
val dpuFire = operandsValid && dpuReady
val setCompute = fullAQueue.io.deq.bits.tag.set
@@ -338,7 +346,7 @@ class TensorCoreDecoupled(
// we fully iterated a column (M-dimension).
val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
val shouldDequeueB = (stepExecute & shouldDequeueBMask) === shouldDequeueBMask
respQueueB.ready := dpuFire && shouldDequeueB
fillBufB.io.deq.ready := dpuFire && shouldDequeueB
dontTouch(respQueueA)
dontTouch(respQueueB)
dontTouch(shouldDequeueB)
@@ -375,8 +383,11 @@ class TensorCoreDecoupled(
operandADimensional(0).length == tilingParams.kc,
"operand width doesn't agree with tiling parameter")
// operandB is 2x4 in K-major
// val operandBDimensional =
// operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
// .grouped(4/*k-dim*/).toSeq
val operandBDimensional =
operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
operandB(0)/*FIXME!*/.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
.grouped(4/*k-dim*/).toSeq
require(tilingParams.mc * ncSubstep == numLanes,
"substep tile size doesn't match writeback throughput")
@@ -490,6 +501,37 @@ class TensorCoreDecoupled(
}
}
/** Width-widening buffer: gathers `entries` successive beats from `enq` and
  * presents them together on `deq` as one `Vec`.
  *
  * Beats are stored in arrival order: the first beat accepted after reset (or
  * after the previous dequeue) lands in `deq.bits(0)`, the next in
  * `deq.bits(1)`, and so on.
  *
  * Handshake behavior:
  *  - `enq.ready` is high only while the buffer is not yet full.
  *  - `deq.valid` is high only while the buffer is full.
  *  - A dequeue rewinds the fill pointer to zero; the buffer accepts new
  *    input again starting from the following cycle.
  *
  * @param gen     Chisel type (not hardware) of a single input beat
  * @param entries number of beats collected per output; must be positive
  */
class FillBuffer[T <: Data](
  gen: T,
  entries: Int
) extends Module {
  require(entries > 0, "FillBuffer must have a positive number of entries")
  requireIsChiselType(gen)

  val io = IO(new Bundle {
    val enq = Flipped(Decoupled(gen))
    val deq = Decoupled(Vec(entries, gen))
  })

  // Storage for the collected beats. Deliberately left without a reset value:
  // the contents are only exposed as valid once every slot has been written.
  val data = Reg(Vec(entries, gen))

  // Fill pointer. It counts 0..entries inclusive, hence the `entries + 1`
  // states: the value `entries` itself encodes "buffer is full".
  val ptr = Counter(entries + 1)
  val full = ptr.value === entries.U

  // ---- dequeue side: expose the whole buffer once it has filled up ----
  io.deq.valid := full
  io.deq.bits := data

  // ---- enqueue side: accept beats until the buffer is full ----
  io.enq.ready := !full
  when (io.enq.fire) {
    // enq can only fire while !full, so ptr.value is a valid slot index here.
    data(ptr.value) := io.enq.bits
    ptr.inc()
  }

  // Draining the buffer rewinds the pointer. This `when` is placed after the
  // enqueue `when` so that the rewind is the last connection to the pointer.
  when (io.deq.fire) {
    assert(full, "FillBuffer fired before buffer was full")
    ptr.reset()
  }
}
// synthesizable unit tests
// wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy