4 Commits

Author SHA1 Message Date
Zhongdi LUO
007350fd5a feat: include vortex fexp RTL 2026-07-02 07:25:32 +00:00
Zhongdi LUO
47d6585896 Wire scalar TMEM through Radiance tile 2026-06-24 06:25:10 +00:00
Zhongdi LUO
f88085331e Save pre-TMEM-bank Radiance changes 2026-06-21 08:20:21 +00:00
Zhongdi LUO
1e78574113 Fix Blackwell SMEM fragment alignment 2026-05-27 08:43:36 +00:00
3 changed files with 254 additions and 73 deletions

View File

@@ -21,7 +21,7 @@ import midas.targetutils.SynthesizePrintf
import org.chipsalliance.cde.config._
import radiance.core._
import radiance.memory._
import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceSimArgs}
import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceSharedMemKey, RadianceSimArgs}
/** For determining radiance core id. This may be different from
* RadianceTileParams.tileId, when a cluster contains non-core tiles */
@@ -289,6 +289,11 @@ class RadianceTile private (
}
val tcSmemSize = numLsuLanes * 4
val tcSmemLineSize = p(RadianceSharedMemKey)
.map(k => k.numWords * k.wordSize)
.getOrElse(tcSmemSize)
val tcSmemClientMaxSize =
if (radianceParams.core.tensorCoreBlackwell) math.max(tcSmemSize, tcSmemLineSize) else tcSmemSize
val numTensorWarps = radianceParams.core.numTensorWarps
val numScalarWarps = numWarps - numTensorWarps
require(numTensorWarps > 0 && numTensorWarps < numWarps,
@@ -300,6 +305,8 @@ class RadianceTile private (
require(numLsuLanes == 4 || numLsuLanes == 8,
s"Wu Blackwell binding supports 4 or 8 lanes, got ${numLsuLanes}")
require(numTensorCores == numTensorWarps, "Wu Blackwell binding requires one Tensor Core per Tensor warp")
require(isPow2(tcSmemLineSize) && tcSmemLineSize >= tcSmemSize && (tcSmemLineSize % tcSmemSize) == 0,
s"Wu Blackwell SMEM line size (${tcSmemLineSize}) must be a power-of-two multiple of TC fragment size (${tcSmemSize})")
}
val tensorUsesAsyncMem = radianceParams.core.tensorCoreDecoupled || radianceParams.core.tensorCoreBlackwell
val tcSmemNodeCount = if (radianceParams.core.tensorCoreDecoupled) 2 else if (radianceParams.core.tensorCoreBlackwell) numTensorCores else 0
@@ -309,9 +316,9 @@ class RadianceTile private (
name = s"rad_tc_${radianceParams.coreId}_$i",
sourceId = IdRange(0, 1 << smemSourceWidth),
supports = TLSlaveToMasterTransferSizes(
probe = TransferSizes(1, tcSmemSize),
get = TransferSizes(1, tcSmemSize),
putFull = TransferSizes(1, tcSmemSize),
probe = TransferSizes(1, tcSmemClientMaxSize),
get = TransferSizes(1, tcSmemClientMaxSize),
putFull = TransferSizes(1, tcSmemClientMaxSize),
),
requestFifo = true
))
@@ -844,6 +851,9 @@ class RadianceTileModuleImp(outer: RadianceTile)
core.io.tc_tmem_C_rready := DontCare
core.io.tc_tmem_C_rdata := DontCare
core.io.tc_tmem_C_wready := DontCare
core.io.sc_tmem_rready := DontCare
core.io.sc_tmem_rdata := DontCare
core.io.sc_tmem_wready := DontCare
}
def connectTensorBlackwell = {
@@ -854,103 +864,232 @@ class RadianceTileModuleImp(outer: RadianceTile)
val nTC = outer.numTensorCores
val tcPorts = 3
val tcCoreDataBits = 32 * 8
val tcDataBits = outer.tcSmemSize * 8
val tcSmemLineBits = outer.tcSmemLineSize * 8
val tmemAddrBits = 9
val tmemDataBits = tcDataBits
val tmemMaskBits = outer.tcSmemSize
val tcTlSize = log2Ceil(outer.tcSmemSize)
val tcSmemLineTlSize = log2Ceil(outer.tcSmemLineSize)
def slice(u: UInt, width: Int, idx: Int): UInt = u(width * (idx + 1) - 1, width * idx)
def port(tc: Int, p: Int): Int = tc * tcPorts + p
def padToCoreData(u: UInt): UInt = {
if (u.getWidth == tcCoreDataBits) u else Cat(0.U((tcCoreDataBits - u.getWidth).W), u)
}
val tcAReady = Wire(Vec(nTC * tcPorts, Bool()))
val tcDValid = Wire(Vec(nTC * tcPorts, Bool()))
val tcDData = Wire(Vec(nTC * tcPorts, UInt(tcDataBits.W)))
val tcDData = Wire(Vec(nTC * tcPorts, UInt(tcCoreDataBits.W)))
val tcDTag = Wire(Vec(nTC * tcPorts, UInt(outer.tensorTagWidth.W)))
tcAReady.foreach(_ := false.B)
tcDValid.foreach(_ := false.B)
tcDData.foreach(_ := 0.U)
tcDTag.foreach(_ := 0.U)
// TMEM matrix: one shared 2R1W SRAM. read0 is operand A, read1 is C.
// TMEM matrix: four banked 2R1W SRAMs. Tensor A/C reads and scalar
// reads can proceed together when bank placement avoids conflicts.
// Each warp owns 2KB: A tile and C tile are 1KB each. The row count
// scales with the physical fragment width (16B for 4 lanes, 32B for 8).
val tmemBytesPerWarp = 2048
val tmemDepth = outer.numWarps * (tmemBytesPerWarp / outer.tcSmemSize)
val tmem = Module(new radiance.memory.TwoReadOneWriteSyncMem(
tmemDepth, UInt((outer.tcSmemSize * 8).W)))
val tmemBanks = 4
val tmemBankBits = log2Ceil(tmemBanks)
val tmemBankDepth = tmemDepth / tmemBanks
require(isPow2(tmemBanks))
require(tmemDepth % tmemBanks == 0)
val tmem = Seq.fill(tmemBanks) {
Module(new radiance.memory.TwoReadOneWriteSyncMem(
tmemBankDepth, UInt((outer.tcSmemSize * 8).W)))
}
val aReadArb = Module(new RRArbiter(UInt(tmemAddrBits.W), nTC))
val cReadArb = Module(new RRArbiter(UInt(tmemAddrBits.W), nTC))
class TmemReadReq extends Bundle {
val addr = UInt(tmemAddrBits.W)
val src = UInt(2.W)
val tc = UInt(log2Ceil(nTC max 2).W)
}
class TmemWriteReq extends Bundle {
val addr = UInt(tmemAddrBits.W)
val data = UInt(tmemDataBits.W)
val mask = UInt(tmemMaskBits.W)
}
val cWriteArb = Module(new RRArbiter(new TmemWriteReq, nTC))
(0 until nTC).foreach { tc =>
aReadArb.io.in(tc).valid := core.io.tc_tmem_A_ren(tc)
aReadArb.io.in(tc).bits := slice(core.io.tc_tmem_A_raddr, tmemAddrBits, tc)
cReadArb.io.in(tc).valid := core.io.tc_tmem_C_ren(tc)
cReadArb.io.in(tc).bits := slice(core.io.tc_tmem_C_raddr, tmemAddrBits, tc)
cWriteArb.io.in(tc).valid := core.io.tc_tmem_C_wen(tc)
cWriteArb.io.in(tc).bits.addr := slice(core.io.tc_tmem_C_waddr, tmemAddrBits, tc)
cWriteArb.io.in(tc).bits.data := slice(core.io.tc_tmem_C_wdata, tmemDataBits, tc)
cWriteArb.io.in(tc).bits.mask := slice(core.io.tc_tmem_C_mask, tmemMaskBits, tc)
val src = UInt(1.W)
val tc = UInt(log2Ceil(nTC max 2).W)
}
aReadArb.io.out.ready := true.B
cReadArb.io.out.ready := true.B
cWriteArb.io.out.ready := true.B
def bank(addr: UInt): UInt = addr(tmemBankBits - 1, 0)
def row(addr: UInt): UInt = addr(tmemAddrBits - 1, tmemBankBits)
tmem.io.ren0 := aReadArb.io.out.fire
tmem.io.raddr0 := aReadArb.io.out.bits
tmem.io.ren1 := cReadArb.io.out.fire
tmem.io.raddr1 := cReadArb.io.out.bits
tmem.io.wen := cWriteArb.io.out.fire
tmem.io.waddr := cWriteArb.io.out.bits.addr
tmem.io.wdata := cWriteArb.io.out.bits.data
tmem.io.mask := cWriteArb.io.out.bits.mask
val aReady = Wire(Vec(nTC, Bool()))
val cReady = Wire(Vec(nTC, Bool()))
val wReady = Wire(Vec(nTC, Bool()))
val scReadReady = Wire(Bool())
val scWriteReady = Wire(Bool())
aReady.foreach(_ := false.B)
cReady.foreach(_ := false.B)
wReady.foreach(_ := false.B)
scReadReady := false.B
scWriteReady := false.B
val aReadGrant = RegNext(Mux(aReadArb.io.out.fire, UIntToOH(aReadArb.io.chosen, nTC), 0.U(nTC.W)))
val cReadGrant = RegNext(Mux(cReadArb.io.out.fire, UIntToOH(cReadArb.io.chosen, nTC), 0.U(nTC.W)))
core.io.tc_tmem_A_rready := VecInit(aReadArb.io.in.map(_.fire)).asUInt
core.io.tc_tmem_C_rready := VecInit(cReadArb.io.in.map(_.fire)).asUInt
core.io.tc_tmem_C_wready := VecInit(cWriteArb.io.in.map(_.fire)).asUInt
val read0Grant = Wire(Vec(tmemBanks, new TmemReadReq))
val read1Grant = Wire(Vec(tmemBanks, new TmemReadReq))
val read0Valid = Wire(Vec(tmemBanks, Bool()))
val read1Valid = Wire(Vec(tmemBanks, Bool()))
val writeGrant = Wire(Vec(tmemBanks, new TmemWriteReq))
val writeValid = Wire(Vec(tmemBanks, Bool()))
read0Grant.foreach(_ := 0.U.asTypeOf(new TmemReadReq))
read1Grant.foreach(_ := 0.U.asTypeOf(new TmemReadReq))
read0Valid.foreach(_ := false.B)
read1Valid.foreach(_ := false.B)
writeGrant.foreach(_ := 0.U.asTypeOf(new TmemWriteReq))
writeValid.foreach(_ := false.B)
(0 until tmemBanks).foreach { b =>
val requests = (0 until nTC).flatMap { tc =>
val aAddr = slice(core.io.tc_tmem_A_raddr, tmemAddrBits, tc)
val cAddr = slice(core.io.tc_tmem_C_raddr, tmemAddrBits, tc)
Seq(
(core.io.tc_tmem_A_ren(tc).asBool && bank(aAddr) === b.U, aAddr, 0.U(2.W), tc.U),
(core.io.tc_tmem_C_ren(tc).asBool && bank(cAddr) === b.U, cAddr, 1.U(2.W), tc.U)
)
} ++ Seq(
(core.io.sc_tmem_ren.asBool && bank(core.io.sc_tmem_raddr) === b.U,
core.io.sc_tmem_raddr, 2.U(2.W), 0.U)
)
var used0 = false.B
var used1 = false.B
requests.foreach { case (valid, addr, src, tc) =>
val grant0 = valid && !used0
val grant1 = valid && used0 && !used1
when(grant0) {
read0Grant(b).addr := addr
read0Grant(b).src := src
read0Grant(b).tc := tc
}
when(grant1) {
read1Grant(b).addr := addr
read1Grant(b).src := src
read1Grant(b).tc := tc
}
used0 = used0 || grant0
used1 = used1 || grant1
when(grant0 || grant1) {
when(src === 0.U) { aReady(tc) := true.B }
when(src === 1.U) { cReady(tc) := true.B }
when(src === 2.U) { scReadReady := true.B }
}
}
read0Valid(b) := used0
read1Valid(b) := used1
var writeUsed = false.B
(0 until nTC).foreach { tc =>
val addr = slice(core.io.tc_tmem_C_waddr, tmemAddrBits, tc)
val valid = core.io.tc_tmem_C_wen(tc).asBool && bank(addr) === b.U
val grant = valid && !writeUsed
when(grant) {
writeValid(b) := true.B
writeGrant(b).addr := addr
writeGrant(b).data := slice(core.io.tc_tmem_C_wdata, tmemDataBits, tc)
writeGrant(b).mask := slice(core.io.tc_tmem_C_mask, tmemMaskBits, tc)
writeGrant(b).src := 0.U
writeGrant(b).tc := tc.U
wReady(tc) := true.B
}
writeUsed = writeUsed || grant
}
val scWValid = core.io.sc_tmem_wen.asBool && bank(core.io.sc_tmem_waddr) === b.U
val scWGrant = scWValid && !writeUsed
when(scWGrant) {
writeValid(b) := true.B
writeGrant(b).addr := core.io.sc_tmem_waddr
writeGrant(b).data := core.io.sc_tmem_wdata
writeGrant(b).mask := core.io.sc_tmem_mask
writeGrant(b).src := 1.U
writeGrant(b).tc := 0.U
scWriteReady := true.B
}
tmem(b).io.ren0 := read0Valid(b)
tmem(b).io.raddr0 := row(read0Grant(b).addr)
tmem(b).io.ren1 := read1Valid(b)
tmem(b).io.raddr1 := row(read1Grant(b).addr)
tmem(b).io.wen := writeValid(b)
tmem(b).io.waddr := row(writeGrant(b).addr)
tmem(b).io.wdata := writeGrant(b).data
tmem(b).io.mask := writeGrant(b).mask
}
val read0GrantReg = RegNext(read0Grant)
val read1GrantReg = RegNext(read1Grant)
val read0ValidReg = RegNext(read0Valid)
val read1ValidReg = RegNext(read1Valid)
core.io.tc_tmem_A_rready := aReady.asUInt
core.io.tc_tmem_C_rready := cReady.asUInt
core.io.tc_tmem_C_wready := wReady.asUInt
core.io.sc_tmem_rready := scReadReady.asUInt
core.io.sc_tmem_wready := scWriteReady.asUInt
core.io.tc_tmem_A_rdata := VecInit((0 until nTC).map { tc =>
Mux(aReadGrant(tc), tmem.io.rdata0, 0.U(tmemDataBits.W))
VecInit((0 until tmemBanks).map { b =>
Mux(read0ValidReg(b) && read0GrantReg(b).src === 0.U && read0GrantReg(b).tc === tc.U, tmem(b).io.rdata0,
Mux(read1ValidReg(b) && read1GrantReg(b).src === 0.U && read1GrantReg(b).tc === tc.U, tmem(b).io.rdata1, 0.U(tmemDataBits.W)))
}).reduce(_ | _)
}).asUInt
core.io.tc_tmem_C_rdata := VecInit((0 until nTC).map { tc =>
Mux(cReadGrant(tc), tmem.io.rdata1, 0.U(tmemDataBits.W))
VecInit((0 until tmemBanks).map { b =>
Mux(read0ValidReg(b) && read0GrantReg(b).src === 1.U && read0GrantReg(b).tc === tc.U, tmem(b).io.rdata0,
Mux(read1ValidReg(b) && read1GrantReg(b).src === 1.U && read1GrantReg(b).tc === tc.U, tmem(b).io.rdata1, 0.U(tmemDataBits.W)))
}).reduce(_ | _)
}).asUInt
core.io.sc_tmem_rdata := VecInit((0 until tmemBanks).map { b =>
Mux(read0ValidReg(b) && read0GrantReg(b).src === 2.U, tmem(b).io.rdata0,
Mux(read1ValidReg(b) && read1GrantReg(b).src === 2.U, tmem(b).io.rdata1, 0.U(tmemDataBits.W)))
}).reduce(_ | _)
// port 2: SMEM B, one TL client per tensor core. RadianceSharedMem arbitrates them.
(0 until nTC).foreach { tc =>
val p2 = port(tc, 2)
val client = outer.tcSmemNodes(tc).out.head
val rawAddress = slice(core.io.tc_a_bits_address, 32, p2)
val lineAddress = rawAddress & (~((outer.tcSmemLineSize - 1).U(32.W))).asUInt
val adapter = Module(new VortexTLAdapter(
outer.smemSourceWidth,
new VortexBundleA(tagWidth = outer.tensorTagWidth, dataWidth = tcDataBits),
new VortexBundleD(tagWidth = outer.tensorTagWidth, dataWidth = tcDataBits),
new VortexBundleA(tagWidth = outer.tensorTagWidth, dataWidth = tcSmemLineBits),
new VortexBundleD(tagWidth = outer.tensorTagWidth, dataWidth = tcSmemLineBits),
client
))
adapter.io.inReq.bits <> DontCare
adapter.io.inReq.valid := core.io.tc_a_valid(p2)
adapter.io.inReq.bits.address := slice(core.io.tc_a_bits_address, 32, p2)
adapter.io.inReq.bits.address := lineAddress
adapter.io.inReq.bits.source := slice(core.io.tc_a_bits_tag, outer.tensorTagWidth, p2)
adapter.io.inReq.bits.size := tcTlSize.U
adapter.io.inReq.bits.size := tcSmemLineTlSize.U
adapter.io.inReq.bits.opcode := Mux(core.io.tc_a_bits_write(p2).asBool, TLMessages.PutFullData, TLMessages.Get)
adapter.io.inReq.bits.mask := slice(core.io.tc_a_bits_mask, 32, p2)
adapter.io.inReq.bits.data := slice(core.io.tc_a_bits_data, tcDataBits, p2)
adapter.io.inReq.bits.mask := Fill(outer.tcSmemLineSize, 1.U(1.W))
adapter.io.inReq.bits.data := slice(core.io.tc_a_bits_data, tcCoreDataBits, p2)(tcSmemLineBits - 1, 0)
adapter.io.inResp.ready := core.io.tc_d_ready(p2)
client._1.a <> adapter.io.outReq
adapter.io.outResp <> client._1.d
val lineData = adapter.io.inResp.bits.data
val fragmentData = if (outer.tcSmemLineSize == outer.tcSmemSize) {
lineData
} else {
val fragmentsPerLine = outer.tcSmemLineSize / outer.tcSmemSize
val fragmentIndex = RegInit(0.U(log2Ceil(fragmentsPerLine).W))
val requestFragmentIndex = ((rawAddress & (outer.tcSmemLineSize - 1).U) >>
log2Ceil(outer.tcSmemSize)).asUInt
val lineFragments = lineData.asTypeOf(Vec(fragmentsPerLine, UInt(tcDataBits.W)))
when(adapter.io.inReq.fire) {
fragmentIndex := requestFragmentIndex
}
lineFragments(fragmentIndex)
}
tcAReady(p2) := adapter.io.inReq.ready
tcDValid(p2) := adapter.io.inResp.valid
tcDData(p2) := adapter.io.inResp.bits.data
tcDData(p2) := padToCoreData(fragmentData)
tcDTag(p2) := adapter.io.inResp.bits.source
}
@@ -970,15 +1109,15 @@ class RadianceTileModuleImp(outer: RadianceTile)
gmemAdapter.io.inReq.bits.source := slice(core.io.tc_a_bits_tag, outer.tensorTagWidth, p0)
gmemAdapter.io.inReq.bits.size := tcTlSize.U
gmemAdapter.io.inReq.bits.opcode := Mux(core.io.tc_a_bits_write(p0).asBool, TLMessages.PutFullData, TLMessages.Get)
gmemAdapter.io.inReq.bits.mask := slice(core.io.tc_a_bits_mask, 32, p0)
gmemAdapter.io.inReq.bits.data := slice(core.io.tc_a_bits_data, tcDataBits, p0)
gmemAdapter.io.inReq.bits.mask := slice(core.io.tc_a_bits_mask, 32, p0)(outer.tcSmemSize - 1, 0)
gmemAdapter.io.inReq.bits.data := slice(core.io.tc_a_bits_data, tcCoreDataBits, p0)(tcDataBits - 1, 0)
gmemAdapter.io.inResp.ready := core.io.tc_d_ready(p0)
gmemClient._1.a <> gmemAdapter.io.outReq
gmemAdapter.io.outResp <> gmemClient._1.d
tcAReady(p0) := gmemAdapter.io.inReq.ready
tcDValid(p0) := gmemAdapter.io.inResp.valid
tcDData(p0) := gmemAdapter.io.inResp.bits.data
tcDData(p0) := padToCoreData(gmemAdapter.io.inResp.bits.data)
tcDTag(p0) := gmemAdapter.io.inResp.bits.source
}
@@ -996,6 +1135,9 @@ class RadianceTileModuleImp(outer: RadianceTile)
core.io.tc_tmem_C_rready := DontCare
core.io.tc_tmem_C_rdata := DontCare
core.io.tc_tmem_C_wready := DontCare
core.io.sc_tmem_rready := DontCare
core.io.sc_tmem_rdata := DontCare
core.io.sc_tmem_wready := DontCare
}
}
@@ -1168,18 +1310,6 @@ class VortexTLAdapter(
val outResp = chiselTypeOf(outTL._1.d)
})
val (bundle, edge) = outTL
val sourceGen = Module(
new SourceGenerator(
newSourceWidth,
Some(inReqT.source),
ignoreInUse = false
)
)
sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created
sourceGen.io.reclaim.valid := io.outResp.fire
sourceGen.io.reclaim.bits := io.outResp.bits.source
sourceGen.io.meta := io.inReq.bits.source
// io passthrough logic
// TLBundleA <> VortexBundleA
io.outReq.valid := io.inReq.valid
@@ -1188,29 +1318,70 @@ class VortexTLAdapter(
io.outReq.bits.size := io.inReq.bits.size
io.outReq.bits.source := io.inReq.bits.source
io.outReq.bits.address := io.inReq.bits.address
// Get requires contiguous mask; only copy core's potentially-partial mask
// when writing
val outMaskWidth = io.outReq.bits.mask.getWidth
val inMaskWidth = io.inReq.bits.mask.getWidth
val outDataWidth = io.outReq.bits.data.getWidth
val inDataWidth = io.inReq.bits.data.getWidth
val byteOffset = io.inReq.bits.address(log2Ceil(outMaskWidth) - 1, 0)
val responseOffsetWidth = log2Ceil(outMaskWidth)
val responseSourceWidth = inReqT.source.getWidth
val sourceGen = Module(
new SourceGenerator(
newSourceWidth,
Some(UInt((responseSourceWidth + responseOffsetWidth).W)),
ignoreInUse = false
)
)
sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created
sourceGen.io.reclaim.valid := io.outResp.fire
sourceGen.io.reclaim.bits := io.outResp.bits.source
sourceGen.io.meta := Cat(byteOffset, io.inReq.bits.source)
val alignedMask = Wire(UInt(outMaskWidth.W))
val alignedData = Wire(UInt(outDataWidth.W))
if (outMaskWidth == inMaskWidth) {
alignedMask := io.inReq.bits.mask
} else {
val paddedMask = Wire(UInt(outMaskWidth.W))
paddedMask := io.inReq.bits.mask
alignedMask := (paddedMask << byteOffset)(outMaskWidth - 1, 0)
}
if (outDataWidth == inDataWidth) {
alignedData := io.inReq.bits.data
} else {
val paddedData = Wire(UInt(outDataWidth.W))
paddedData := io.inReq.bits.data
alignedData := (paddedData << (byteOffset << 3))(outDataWidth - 1, 0)
}
// PutFull requires the TL-canonical full mask for address+size; PutPartial
// can carry the core-provided byte mask.
io.outReq.bits.mask := Mux(
edge.hasData(io.outReq.bits),
io.inReq.bits.mask,
// generate TL-correct mask
io.outReq.bits.opcode === TLMessages.PutPartialData,
alignedMask,
edge.mask(io.inReq.bits.address, io.inReq.bits.size)
)
io.outReq.bits.data := io.inReq.bits.data
io.outReq.bits.data := alignedData
io.outReq.bits.corrupt := 0.U
io.inReq.ready := io.outReq.ready
// VortexBundleD <> TLBundleD
io.inResp.valid := io.outResp.valid
io.inResp.bits.opcode := io.outResp.bits.opcode
io.inResp.bits.size := io.outResp.bits.size
io.inResp.bits.source := io.outResp.bits.source
io.inResp.bits.data := io.outResp.bits.data
val responseMeta = sourceGen.io.peek.asUInt
val responseSource = responseMeta(responseSourceWidth - 1, 0)
val responseByteOffset =
responseMeta(responseSourceWidth + responseOffsetWidth - 1, responseSourceWidth)
io.inResp.bits.source := responseSource
if (outDataWidth == inDataWidth) {
io.inResp.bits.data := io.outResp.bits.data
} else {
io.inResp.bits.data := (io.outResp.bits.data >> (responseByteOffset << 3))(inDataWidth - 1, 0)
}
io.outResp.ready := io.inResp.ready
// "man-in-the-middle"
io.inReq.ready := io.outReq.ready && sourceGen.io.id.valid
io.outReq.valid := io.inReq.valid && sourceGen.io.id.valid
io.outReq.bits.source := sourceGen.io.id.bits
// translate upstream response back to its old sourceId
io.inResp.bits.source := sourceGen.io.peek
}

View File

@@ -120,6 +120,15 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl
val tc_tmem_C_waddr = Output(UInt((numTensorCores * 9).W))
val tc_tmem_C_wdata = Output(UInt((numTensorCores * numLanes * 32).W))
val tc_tmem_C_mask = Output(UInt((numTensorCores * numLanes * 4).W))
val sc_tmem_ren = Output(UInt(1.W))
val sc_tmem_rready = Input(UInt(1.W))
val sc_tmem_raddr = Output(UInt(9.W))
val sc_tmem_rdata = Input(UInt((numLanes * 32).W))
val sc_tmem_wen = Output(UInt(1.W))
val sc_tmem_wready = Input(UInt(1.W))
val sc_tmem_waddr = Output(UInt(9.W))
val sc_tmem_wdata = Output(UInt((numLanes * 32).W))
val sc_tmem_mask = Output(UInt((numLanes * 4).W))
// FIXME: hardcoded
val barrierIdBits = tile.barrierMasterNode.out(0)._2.barrierIdBits
@@ -351,6 +360,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_div.sv")
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_dpi.sv")
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_dsp.sv")
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_exp.sv")
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_fma.sv")
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_ncomp.sv")
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_rounding.sv")