Distinguish LSU lanes from SIMD lanes and elaborate tag width logic

This commit is contained in:
Hansung Kim
2024-01-16 22:20:16 -08:00
parent 263f00baed
commit 132742ea88
2 changed files with 46 additions and 44 deletions

View File

@@ -45,11 +45,11 @@ class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle
val a = Decoupled(new VortexBundleA(tagWidth = tile.imemTagWidth, dataWidth = 32))
val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.imemTagWidth, dataWidth = 32)))
})) else None
val dmem = if (!tile.vortexParams.useVxCache) Some(Vec(tile.numLanes, new Bundle {
val dmem = if (!tile.vortexParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle {
// val a = Decoupled(new VortexBundleA(tagWidth = tile.dmemTagWidth, dataWidth = 32))
// val d = Flipped(Decoupled(new VortexBundleD(tagWidth = dmemTagWidth, dataWidth = 32)))
})) else None
val smem = if (!tile.vortexParams.useVxCache) Some(Vec(tile.numLanes, new Bundle {
val smem = if (!tile.vortexParams.useVxCache) Some(Vec(tile.numLsuLanes, new Bundle {
// val a = Decoupled(new VortexBundleA(tagWidth = tile.smemTagWidth, dataWidth = 32))
// val d = Flipped(Decoupled(new VortexBundleD(tagWidth = tile.smemTagWidth, dataWidth = 32)))
})) else None
@@ -61,40 +61,40 @@ class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle
}) else None
// Chisel doesn't support 2-D array in BlackBox interface to Verilog, so
// everything needs to be 1-D flattened UInt with their widths configurable by numLanes.
// everything needs to be 1-D flattened UInt with their widths configurable by numLsuLanes.
//
// FIXME: hardcoded bitwidths
val dmem_a_ready = Input(UInt((tile.numLanes * 1).W))
val dmem_a_valid = Output(UInt((tile.numLanes * 1).W))
val dmem_a_bits_opcode = Output(UInt((tile.numLanes * 3).W))
val dmem_a_bits_size = Output(UInt((tile.numLanes * 4).W))
val dmem_a_bits_source = Output(UInt((tile.numLanes * tile.dmemTagWidth).W))
val dmem_a_bits_address = Output(UInt((tile.numLanes * 32).W))
val dmem_a_bits_mask = Output(UInt((tile.numLanes * 4).W))
val dmem_a_bits_data = Output(UInt((tile.numLanes * 32).W))
val dmem_a_ready = Input(UInt((tile.numLsuLanes * 1).W))
val dmem_a_valid = Output(UInt((tile.numLsuLanes * 1).W))
val dmem_a_bits_opcode = Output(UInt((tile.numLsuLanes * 3).W))
val dmem_a_bits_size = Output(UInt((tile.numLsuLanes * 4).W))
val dmem_a_bits_source = Output(UInt((tile.numLsuLanes * tile.dmemTagWidth).W))
val dmem_a_bits_address = Output(UInt((tile.numLsuLanes * 32).W))
val dmem_a_bits_mask = Output(UInt((tile.numLsuLanes * 4).W))
val dmem_a_bits_data = Output(UInt((tile.numLsuLanes * 32).W))
val dmem_d_valid = Input(UInt((tile.numLanes * 1).W))
val dmem_d_bits_opcode = Input(UInt((tile.numLanes * 3).W))
val dmem_d_bits_size = Input(UInt((tile.numLanes * 4).W))
val dmem_d_bits_source = Input(UInt((tile.numLanes * tile.dmemTagWidth).W))
val dmem_d_bits_data = Input(UInt((tile.numLanes * 32).W))
val dmem_d_ready = Output(UInt((tile.numLanes * 1).W))
val dmem_d_valid = Input(UInt((tile.numLsuLanes * 1).W))
val dmem_d_bits_opcode = Input(UInt((tile.numLsuLanes * 3).W))
val dmem_d_bits_size = Input(UInt((tile.numLsuLanes * 4).W))
val dmem_d_bits_source = Input(UInt((tile.numLsuLanes * tile.dmemTagWidth).W))
val dmem_d_bits_data = Input(UInt((tile.numLsuLanes * 32).W))
val dmem_d_ready = Output(UInt((tile.numLsuLanes * 1).W))
val smem_a_ready = Input(UInt((tile.numLanes * 1).W))
val smem_a_valid = Output(UInt((tile.numLanes * 1).W))
val smem_a_bits_opcode = Output(UInt((tile.numLanes * 3).W))
val smem_a_bits_size = Output(UInt((tile.numLanes * 4).W))
val smem_a_bits_source = Output(UInt((tile.numLanes * tile.smemTagWidth).W))
val smem_a_bits_address = Output(UInt((tile.numLanes * 32).W))
val smem_a_bits_mask = Output(UInt((tile.numLanes * 4).W))
val smem_a_bits_data = Output(UInt((tile.numLanes * 32).W))
val smem_a_ready = Input(UInt((tile.numLsuLanes * 1).W))
val smem_a_valid = Output(UInt((tile.numLsuLanes * 1).W))
val smem_a_bits_opcode = Output(UInt((tile.numLsuLanes * 3).W))
val smem_a_bits_size = Output(UInt((tile.numLsuLanes * 4).W))
val smem_a_bits_source = Output(UInt((tile.numLsuLanes * tile.smemTagWidth).W))
val smem_a_bits_address = Output(UInt((tile.numLsuLanes * 32).W))
val smem_a_bits_mask = Output(UInt((tile.numLsuLanes * 4).W))
val smem_a_bits_data = Output(UInt((tile.numLsuLanes * 32).W))
val smem_d_valid = Input(UInt((tile.numLanes * 1).W))
val smem_d_bits_opcode = Input(UInt((tile.numLanes * 3).W))
val smem_d_bits_size = Input(UInt((tile.numLanes * 4).W))
val smem_d_bits_source = Input(UInt((tile.numLanes * tile.smemTagWidth).W))
val smem_d_bits_data = Input(UInt((tile.numLanes * 32).W))
val smem_d_ready = Output(UInt((tile.numLanes * 1).W))
val smem_d_valid = Input(UInt((tile.numLsuLanes * 1).W))
val smem_d_bits_opcode = Input(UInt((tile.numLsuLanes * 3).W))
val smem_d_bits_size = Input(UInt((tile.numLsuLanes * 4).W))
val smem_d_bits_source = Input(UInt((tile.numLsuLanes * tile.smemTagWidth).W))
val smem_d_bits_data = Input(UInt((tile.numLsuLanes * 32).W))
val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W))
// val fpu = Flipped(new FPUCoreIO())
//val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs))
@@ -114,7 +114,7 @@ class Vortex(tile: VortexTile)(implicit p: Parameters)
"CORE_ID" -> tile.tileParams.hartId,
// TODO: can we get this as a parameter?
"BOOTROM_HANG100" -> 0x10100,
"NUM_THREADS" -> tile.numLanes
"NUM_THREADS" -> tile.numLsuLanes
)
)
with HasBlackBoxResource {

View File

@@ -189,12 +189,14 @@ class VortexTile private (
val smemSourceWidth = 4 // FIXME: hardcoded
// TODO: parametrize
val numWarps = 4
val numWarps = 4 // TODO: parametrize
val NW_WIDTH = (if (numWarps == 1) 1 else log2Ceil(numWarps))
val UUID_WIDTH = 44
val imemTagWidth = UUID_WIDTH + NW_WIDTH
val LSUQ_TAG_BITS = 4
val numLsuLanes = 4
// see VX_gpu_pkg.sv
val LSUQ_SIZE = 8 * (numLanes / numLsuLanes)
val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
// dmem and smem share the same tag width, DCACHE_NOSM_TAG_WIDTH
val smemTagWidth = dmemTagWidth
@@ -218,7 +220,7 @@ class VortexTile private (
)
}
val dmemNodes = Seq.tabulate(numLanes) { i =>
val dmemNodes = Seq.tabulate(numLsuLanes) { i =>
TLClientNode(
Seq(
TLMasterPortParameters.v1(
@@ -241,7 +243,7 @@ class VortexTile private (
)
}
val smemNodes = Seq.tabulate(numLanes) { i =>
val smemNodes = Seq.tabulate(numLsuLanes) { i =>
TLClientNode(
Seq(
TLMasterPortParameters.v1(
@@ -337,12 +339,12 @@ class VortexTile private (
//
// Instantiate the same number of banks as there are LSU lanes.
// TODO: parametrize
val smemBanks = Seq.tabulate(numLanes) { bankId =>
val smemBanks = Seq.tabulate(numLsuLanes) { bankId =>
// Banked-by-word (4 bytes)
// base for bank 1: ff...000000|01|00
// mask for bank 1: 00...111111|00|11
val base = 0xff000000L | (bankId * 4 /*wordSize*/ )
val mask = 0x00ffffffL ^ ((numLanes - 1) * 4 /*wordSize*/ )
val mask = 0x00ffffffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ )
LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = 4 /*wordSize*/ ))
}
// smem lanes-to-banks crossbar
@@ -531,7 +533,7 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
// @perf: this would duplicate SourceGenerator table for every lane and eat
// up some area
val dmemTLBundles = outer.dmemNodes.map(_.out.head._1)
val dmemTLAdapters = Seq.tabulate(outer.numLanes) { _ =>
val dmemTLAdapters = Seq.tabulate(outer.numLsuLanes) { _ =>
Module(
new VortexTLAdapter(
outer.dmemSourceWidth,
@@ -565,7 +567,7 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
new RRArbiter(
// FIXME: should really be source on D channel
new VortexBundleA(tagWidth = outer.dmemTagWidth, dataWidth = 32).source.cloneType,
outer.numLanes
outer.numLsuLanes
)
)
arb.io.out.ready := true.B
@@ -574,7 +576,7 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
arbIn.valid := vxDmem.valid
arbIn.bits := vxDmem.bits.source
}
val matchingSources = Wire(UInt(outer.numLanes.W))
val matchingSources = Wire(UInt(outer.numLsuLanes.W))
matchingSources := dmemBundles
.map(b =>
// If there is no valid response pending across all lanes,
@@ -609,7 +611,7 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
core.io.dmem_d_bits_data := dmemTLAdapters.map(_.io.inResp.bits.data).asUInt
// override response channel with matchingSources
val dmem_d_valid_vec = Wire(Vec(outer.numLanes, Bool()))
val dmem_d_valid_vec = Wire(Vec(outer.numLsuLanes, Bool()))
dmemTLAdapters.zipWithIndex.foreach {
case (tlAdapter, i) =>
dmem_d_valid_vec(i) := tlAdapter.io.inResp.valid && matchingSources(i)
@@ -632,7 +634,7 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
// @perf: this would duplicate SourceGenerator table for every lane and eat
// up some area
val smemTLBundles = outer.smemNodes.map(_.out.head._1)
val smemTLAdapters = Seq.tabulate(outer.numLanes) { _ =>
val smemTLAdapters = Seq.tabulate(outer.numLsuLanes) { _ =>
Module(
new VortexTLAdapter(
outer.smemSourceWidth,