diff --git a/src/main/scala/rocket/VortexCore.scala b/src/main/scala/rocket/VortexCore.scala index 75195f7..6e93633 100644 --- a/src/main/scala/rocket/VortexCore.scala +++ b/src/main/scala/rocket/VortexCore.scala @@ -47,6 +47,8 @@ class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle val imemTagWidth = UUID_WIDTH + NW_WIDTH val LSUQ_TAG_BITS = 4 val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS + // dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH + val smemTagWidth = dmemTagWidth // conditionally instantiate ports depending on whether we want to use VX_cache or not val imem = if (!tile.vortexParams.useVxCache) Some(Vec(1, new Bundle { @@ -57,6 +59,10 @@ class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle val a = Decoupled(new VortexBundleA(tagWidth = dmemTagWidth, dataWidth = 32)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = dmemTagWidth, dataWidth = 32))) })) else None + val smem = if (!tile.vortexParams.useVxCache) Some(Vec(tile.numLanes, new Bundle { + val a = Decoupled(new VortexBundleA(tagWidth = smemTagWidth, dataWidth = 32)) + val d = Flipped(Decoupled(new VortexBundleD(tagWidth = smemTagWidth, dataWidth = 32))) + })) else None val mem = if (tile.vortexParams.useVxCache) Some(new Bundle { val a = Decoupled(new VortexBundleA(tagWidth = 15, dataWidth = 128)) val d = Flipped(Decoupled(new VortexBundleD(tagWidth = 15, dataWidth = 128))) @@ -103,7 +109,6 @@ class Vortex(tile: VortexTile)(implicit p: Parameters) // addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpc/rf2_32x128_wm1/vsim/rf2_32x128_wm1_tb.v") // addResource("/vsrc/vortex/hw/syn/modelsim/vortex_tb.v") - addResource("/vsrc/vortex/hw/rtl/VX_gpu_pkg.sv") // addResource("/vsrc/vortex/hw/rtl/VX_cluster.sv") @@ -341,6 +346,5 @@ class Vortex(tile: VortexTile)(implicit p: Parameters) } val nTotalRoCCCSRs = 0 - val coreBundle = new VortexBundle(tile) - val io = IO(coreBundle) + val io = IO(new VortexBundle(tile)) } diff --git a/src/main/scala/tile/VortexTile.scala b/src/main/scala/tile/VortexTile.scala index e10e88e..03ad530 100644 --- a/src/main/scala/tile/VortexTile.scala +++ b/src/main/scala/tile/VortexTile.scala @@ -259,9 +259,8 @@ class VortexTile private ( // NOTE: We need TLWidthWidget here because there might be a data width // mismatch between Vortex's per-lane response and the system bus when we // don't instantiate either L1 or the coalescer. This _should_ be optimized - // out when we instantiate coalescer which should handle data width conversion - // internally (which it does by... using TLWidthWidget), but probably not - // the cleanest way to do this. + // out when we instantiate either which should handle data width conversion + // internally (which it does by... using TLWidthWidget). val dmemAggregateNode = TLIdentityNode() dmemNodes.foreach { dmemAggregateNode := TLWidthWidget(4) := _ } @@ -326,7 +325,8 @@ class VortexTile private ( // Instantiate sharedmem // TODO: parametrize - val sharedmem = LazyModule(new TLRAM(AddressSet(0xff000000L, 0x00ffffffL), beatBytes = 4 /*FIXME*/)) + // FIXME: beatBytes should be wordSize + val sharedmem = LazyModule(new TLRAM(AddressSet(0xff000000L, 0x00ffffffL), beatBytes = 4)) val smemXbar = LazyModule(new TLXbar) smemNodes.foreach(smemXbar.node := _) sharedmem.node :=* smemXbar.node @@ -492,95 +492,128 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) { outer.memNode.out(0)._1.a <> memTLAdapter.io.outReq memTLAdapter.io.outResp <> outer.memNode.out(0)._1.d } else { - val imemTLAdapter = Module( - new VortexTLAdapter( - outer.imemSourceWidth, - chiselTypeOf(core.io.imem.get(0).a.bits), - chiselTypeOf(core.io.imem.get(0).d.bits), - outer.imemNodes.head.out.head - ) - ) - // TODO: make imemNodes not a vector - imemTLAdapter.io.inReq <> core.io.imem.get(0).a - core.io.imem.get(0).d <> imemTLAdapter.io.inResp - outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq - imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d - - // @perf: this would duplicate SourceGenerator table for every lane and eat - // up some area - val dmemTLBundles = outer.dmemNodes.map(_.out.head._1) - val dmemTLAdapters = Seq.tabulate(outer.numLanes) { _ => - Module( + def connectImem = { + val imemTLAdapter = Module( new VortexTLAdapter( - outer.dmemSourceWidth, - chiselTypeOf(core.io.dmem.get(0).a.bits), - chiselTypeOf(core.io.dmem.get(0).d.bits), - outer.dmemNodes(0).out.head + outer.imemSourceWidth, + chiselTypeOf(core.io.imem.get(0).a.bits), + chiselTypeOf(core.io.imem.get(0).d.bits), + outer.imemNodes.head.out.head ) ) + // TODO: make imemNodes not a vector + imemTLAdapter.io.inReq <> core.io.imem.get(0).a + core.io.imem.get(0).d <> imemTLAdapter.io.inResp + outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq + imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d } - // Since the individual per-lane TL requests might come back out-of-sync between - // the lanes, but Vortex core expects the per-lane responses to be synced, - // we need to selectively fire responses that have the same source, and - // delay others. - // - // In order to do that, we pick a source from one of the valid lanes using e.g. - // an arbiter. Then using the chosen source id, we - // - lie to core that response is not valid if source doesn't match picked, and - // - lie to downstream that core is not ready if source doesn't match picked. - // - // Note that we cannot do this filtering logic using TileLink source ID, because - // we're allocating source for each lane independently. In that case, it's - // possible that lane 0's source matches lane 1/2/3's source by chance, - // even when they originated from different warps. Using Vortex's dcache req tag - // solves this issue because they use a UUID that is unique across all requests - // in the program. - // - // TODO: A cleaner solution would be to simply do a synchronized allocation - // of a same source id for all lanes. - val arb = Module( - new RRArbiter( - core.io.dmem.get.head.d.bits.source.cloneType, - outer.numLanes + def connectDmem = { + // @perf: this would duplicate SourceGenerator table for every lane and eat + // up some area + val dmemTLBundles = outer.dmemNodes.map(_.out.head._1) + val dmemTLAdapters = Seq.tabulate(outer.numLanes) { _ => + Module( + new VortexTLAdapter( + outer.dmemSourceWidth, + chiselTypeOf(core.io.dmem.get(0).a.bits), + chiselTypeOf(core.io.dmem.get(0).d.bits), + outer.dmemNodes(0).out.head + ) + ) + } + + // Since the individual per-lane TL requests might come back out-of-sync between + // the lanes, but Vortex core expects the per-lane responses to be synced, + // we need to selectively fire responses that have the same source, and + // delay others. + // + // In order to do that, we pick a source from one of the valid lanes using e.g. + // an arbiter. Then using the chosen source id, we + // - lie to core that response is not valid if source doesn't match picked, and + // - lie to downstream that core is not ready if source doesn't match picked. + // + // Note that we cannot do this filtering logic using TileLink source ID, because + // we're allocating source for each lane independently. In that case, it's + // possible that lane 0's source matches lane 1/2/3's source by chance, + // even when they originated from different warps. Using Vortex's dcache req tag + // solves this issue because they use a UUID that is unique across all requests + // in the program. + // + // TODO: A cleaner solution would be to simply do a synchronized allocation + // of a same source id for all lanes. + val arb = Module( + new RRArbiter( + core.io.dmem.get.head.d.bits.source.cloneType, + outer.numLanes + ) ) - ) - arb.io.out.ready := true.B - val dmemBundles = dmemTLAdapters.map(_.io.inResp) - (arb.io.in zip dmemBundles).foreach { case (arbIn, vxDmem) => - arbIn.valid := vxDmem.valid - arbIn.bits := vxDmem.bits.source - } - val matchingSources = Wire(UInt(outer.numLanes.W)) - matchingSources := dmemBundles - .map(b => - // If there is no valid response pending across all lanes, - // matchingSources should not filter out upstream ready signals, so - // set it to all-1 - !arb.io.out.valid || (b.bits.source === arb.io.out.bits) - ) - .asUInt + arb.io.out.ready := true.B + val dmemBundles = dmemTLAdapters.map(_.io.inResp) + (arb.io.in zip dmemBundles).foreach { case (arbIn, vxDmem) => + arbIn.valid := vxDmem.valid + arbIn.bits := vxDmem.bits.source + } + val matchingSources = Wire(UInt(outer.numLanes.W)) + matchingSources := dmemBundles + .map(b => + // If there is no valid response pending across all lanes, + // matchingSources should not filter out upstream ready signals, so + // set it to all-1 + !arb.io.out.valid || (b.bits.source === arb.io.out.bits) + ) + .asUInt - // make connection: - // VortexBundle <--> sourceId filter <--> VortexTLAdapter <--> dmemNodes - (core.io.dmem.get zip dmemTLAdapters) foreach { case (coreMem, tlAdapter) => - tlAdapter.io.inReq <> coreMem.a - coreMem.d <> tlAdapter.io.inResp - } - (core.io.dmem.get zip dmemTLAdapters).zipWithIndex.foreach { - case ((coreMem, tlAdapter), i) => - coreMem.d.valid := tlAdapter.io.inResp.valid && matchingSources(i) - tlAdapter.io.inResp.ready := coreMem.d.ready && matchingSources(i) - } - (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) => - tlOut.a <> tlAdapter.io.outReq - tlAdapter.io.outResp <> tlOut.d + // make connection: + // VortexBundle <--> sourceId filter <--> VortexTLAdapter <--> dmemNodes + (core.io.dmem.get zip dmemTLAdapters) foreach { case (coreMem, tlAdapter) => + tlAdapter.io.inReq <> coreMem.a + coreMem.d <> tlAdapter.io.inResp + } + // override response channel with matchingSources + (core.io.dmem.get zip dmemTLAdapters).zipWithIndex.foreach { + case ((coreMem, tlAdapter), i) => + coreMem.d.valid := tlAdapter.io.inResp.valid && matchingSources(i) + tlAdapter.io.inResp.ready := coreMem.d.ready && matchingSources(i) + } + (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) => + tlOut.a <> tlAdapter.io.outReq + tlAdapter.io.outResp <> tlOut.d + } + + outer.dmemAggregateNode.out.foreach { bo => + dontTouch(bo._1.a) + dontTouch(bo._1.d) + } } - outer.dmemAggregateNode.out.foreach { bo => - dontTouch(bo._1.a) - dontTouch(bo._1.d) + def connectSmem = { + // @perf: this would duplicate SourceGenerator table for every lane and eat + // up some area + val smemTLBundles = outer.smemNodes.map(_.out.head._1) + val smemTLAdapters = Seq.tabulate(outer.numLanes) { _ => + Module( + new VortexTLAdapter( + outer.smemSourceWidth, + chiselTypeOf(core.io.smem.get(0).a.bits), + chiselTypeOf(core.io.smem.get(0).d.bits), + outer.smemNodes(0).out.head + ) + ) + } + (core.io.smem.get zip smemTLAdapters) foreach { case (coreMem, tlAdapter) => + tlAdapter.io.inReq <> coreMem.a + coreMem.d <> tlAdapter.io.inResp + } + (smemTLAdapters zip smemTLBundles) foreach { case (tlAdapter, tlOut) => + tlOut.a <> tlAdapter.io.outReq + tlAdapter.io.outResp <> tlOut.d + } } + + connectImem + connectDmem + connectSmem } // TODO: generalize for useVxCache