Connect smem core IO to TL with translation

2024-01-01 02:24:57 -08:00
parent 15c3c55cb6
commit 95e05f5457
2 changed files with 123 additions and 86 deletions
--- a/src/main/scala/rocket/VortexCore.scala
+++ b/src/main/scala/rocket/VortexCore.scala
@@ -47,6 +47,8 @@ class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle
  val imemTagWidth = UUID_WIDTH + NW_WIDTH
  val LSUQ_TAG_BITS = 4
  val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
  // dmem and smem share the same tag width, DCACHE_NOSM_TAG_WIDTH
  val smemTagWidth = dmemTagWidth
  // conditionally instantiate ports depending on whether we want to use VX_cache or not
  val imem = if (!tile.vortexParams.useVxCache) Some(Vec(1, new Bundle {
@@ -57,6 +59,10 @@ class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle
    val a = Decoupled(new VortexBundleA(tagWidth = dmemTagWidth, dataWidth = 32))
    val d = Flipped(Decoupled(new VortexBundleD(tagWidth = dmemTagWidth, dataWidth = 32)))
  })) else None
  val smem = if (!tile.vortexParams.useVxCache) Some(Vec(tile.numLanes, new Bundle {
    val a = Decoupled(new VortexBundleA(tagWidth = smemTagWidth, dataWidth = 32))
    val d = Flipped(Decoupled(new VortexBundleD(tagWidth = smemTagWidth, dataWidth = 32)))
  })) else None
  val mem = if (tile.vortexParams.useVxCache) Some(new Bundle { 
    val a = Decoupled(new VortexBundleA(tagWidth = 15, dataWidth = 128))
    val d = Flipped(Decoupled(new VortexBundleD(tagWidth = 15, dataWidth = 128)))
@@ -103,7 +109,6 @@ class Vortex(tile: VortexTile)(implicit p: Parameters)
  // addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpc/rf2_32x128_wm1/vsim/rf2_32x128_wm1_tb.v")
  // addResource("/vsrc/vortex/hw/syn/modelsim/vortex_tb.v")
  addResource("/vsrc/vortex/hw/rtl/VX_gpu_pkg.sv")
  // addResource("/vsrc/vortex/hw/rtl/VX_cluster.sv")
@@ -341,6 +346,5 @@ class Vortex(tile: VortexTile)(implicit p: Parameters)
  }
  val nTotalRoCCCSRs = 0
-  val coreBundle = new VortexBundle(tile)
+  val io = IO(new VortexBundle(tile))
  val io = IO(coreBundle)
 }
--- a/src/main/scala/tile/VortexTile.scala
+++ b/src/main/scala/tile/VortexTile.scala
@@ -259,9 +259,8 @@ class VortexTile private (
  // NOTE: We need TLWidthWidget here because there might be a data width
  // mismatch between Vortex's per-lane response and the system bus when we
  // don't instantiate either L1 or the coalescer.  This _should_ be optimized
-  // out when we instantiate coalescer which should handle data width conversion
+  // out when we instantiate either which should handle data width conversion
-  // internally (which it does by... using TLWidthWidget), but probably not
+  // internally (which it does by... using TLWidthWidget).
  // the cleanest way to do this.
  val dmemAggregateNode = TLIdentityNode()
  dmemNodes.foreach { dmemAggregateNode := TLWidthWidget(4) := _ }
@@ -326,7 +325,8 @@ class VortexTile private (
  // Instantiate sharedmem
  // TODO: parametrize
-  val sharedmem = LazyModule(new TLRAM(AddressSet(0xff000000L, 0x00ffffffL), beatBytes = 4 /*FIXME*/))
+  // FIXME: beatBytes should be wordSize
  val sharedmem = LazyModule(new TLRAM(AddressSet(0xff000000L, 0x00ffffffL), beatBytes = 4))
  val smemXbar = LazyModule(new TLXbar)
  smemNodes.foreach(smemXbar.node := _)
  sharedmem.node :=* smemXbar.node
@@ -492,95 +492,128 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
    outer.memNode.out(0)._1.a <> memTLAdapter.io.outReq
    memTLAdapter.io.outResp <> outer.memNode.out(0)._1.d
  } else {
-    val imemTLAdapter = Module(
+    def connectImem = {
-      new VortexTLAdapter(
+      val imemTLAdapter = Module(
        outer.imemSourceWidth,
        chiselTypeOf(core.io.imem.get(0).a.bits),
        chiselTypeOf(core.io.imem.get(0).d.bits),
        outer.imemNodes.head.out.head
      )
    )
    // TODO: make imemNodes not a vector
    imemTLAdapter.io.inReq <> core.io.imem.get(0).a
    core.io.imem.get(0).d <> imemTLAdapter.io.inResp
    outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq
    imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d
    // @perf: this would duplicate SourceGenerator table for every lane and eat
    // up some area
    val dmemTLBundles = outer.dmemNodes.map(_.out.head._1)
    val dmemTLAdapters = Seq.tabulate(outer.numLanes) { _ =>
      Module(
        new VortexTLAdapter(
-          outer.dmemSourceWidth,
+          outer.imemSourceWidth,
-          chiselTypeOf(core.io.dmem.get(0).a.bits),
+          chiselTypeOf(core.io.imem.get(0).a.bits),
-          chiselTypeOf(core.io.dmem.get(0).d.bits),
+          chiselTypeOf(core.io.imem.get(0).d.bits),
-          outer.dmemNodes(0).out.head
+          outer.imemNodes.head.out.head
        )
      )
      // TODO: make imemNodes not a vector
      imemTLAdapter.io.inReq <> core.io.imem.get(0).a
      core.io.imem.get(0).d <> imemTLAdapter.io.inResp
      outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq
      imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d
    }
-    // Since the individual per-lane TL requests might come back out-of-sync between
+    def connectDmem = {
-    // the lanes, but Vortex core expects the per-lane responses to be synced,
+      // @perf: this would duplicate SourceGenerator table for every lane and eat
-    // we need to selectively fire responses that have the same source, and
+      // up some area
-    // delay others.
+      val dmemTLBundles = outer.dmemNodes.map(_.out.head._1)
-    //
+      val dmemTLAdapters = Seq.tabulate(outer.numLanes) { _ =>
-    // In order to do that, we pick a source from one of the valid lanes using e.g.
+        Module(
-    // an arbiter.  Then using the chosen source id, we
+          new VortexTLAdapter(
-    // - lie to core that response is not valid if source doesn't match picked, and
+            outer.dmemSourceWidth,
-    // - lie to downstream that core is not ready if source doesn't match picked.
+            chiselTypeOf(core.io.dmem.get(0).a.bits),
-    //
+            chiselTypeOf(core.io.dmem.get(0).d.bits),
-    // Note that we cannot do this filtering logic using TileLink source ID, because
+            outer.dmemNodes(0).out.head
-    // we're allocating source for each lane independently.  In that case, it's
+          )
-    // possible that lane 0's source matches lane 1/2/3's source by chance,
+        )
-    // even when they originated from different warps.  Using Vortex's dcache req tag
+      }
-    // solves this issue because they use a UUID that is unique across all requests
+
-    // in the program.
+      // Since the individual per-lane TL requests might come back out-of-sync between
-    //
+      // the lanes, but Vortex core expects the per-lane responses to be synced,
-    // TODO: A cleaner solution would be to simply do a synchronized allocation
+      // we need to selectively fire responses that have the same source, and
-    // of a same source id for all lanes.
+      // delay others.
-    val arb = Module(
+      //
-      new RRArbiter(
+      // In order to do that, we pick a source from one of the valid lanes using e.g.
-        core.io.dmem.get.head.d.bits.source.cloneType,
+      // an arbiter.  Then using the chosen source id, we
-        outer.numLanes
+      // - lie to core that response is not valid if source doesn't match picked, and
      // - lie to downstream that core is not ready if source doesn't match picked.
      //
      // Note that we cannot do this filtering using the TileLink source ID, because
      // we allocate a source for each lane independently.  In that case, it's
      // possible that lane 0's source matches lane 1/2/3's source by chance,
      // even when they originated from different warps.  Using Vortex's dcache req tag
      // solves this issue because the tag carries a UUID that is unique across all
      // requests in the program.
      //
      // TODO: A cleaner solution would be to simply do a synchronized allocation
      // of the same source id for all lanes.
      val arb = Module(
        new RRArbiter(
          core.io.dmem.get.head.d.bits.source.cloneType,
          outer.numLanes
        )
      )
-    )
+      arb.io.out.ready := true.B
-    arb.io.out.ready := true.B
+      val dmemBundles = dmemTLAdapters.map(_.io.inResp)
-    val dmemBundles = dmemTLAdapters.map(_.io.inResp)
+      (arb.io.in zip dmemBundles).foreach { case (arbIn, vxDmem) =>
-    (arb.io.in zip dmemBundles).foreach { case (arbIn, vxDmem) =>
+        arbIn.valid := vxDmem.valid
-      arbIn.valid := vxDmem.valid
+        arbIn.bits := vxDmem.bits.source
-      arbIn.bits := vxDmem.bits.source
+      }
-    }
+      val matchingSources = Wire(UInt(outer.numLanes.W))
-    val matchingSources = Wire(UInt(outer.numLanes.W))
+      matchingSources := dmemBundles
-    matchingSources := dmemBundles
+        .map(b =>
-      .map(b =>
+            // If there is no valid response pending across all lanes,
-        // If there is no valid response pending across all lanes,
+            // matchingSources should not filter out upstream ready signals, so
-        // matchingSources should not filter out upstream ready signals, so
+            // set it to all-1
-        // set it to all-1
+            !arb.io.out.valid || (b.bits.source === arb.io.out.bits)
-        !arb.io.out.valid || (b.bits.source === arb.io.out.bits)
+            )
-      )
+        .asUInt
      .asUInt
-    // make connection:
+      // make connection:
-    // VortexBundle <--> sourceId filter <--> VortexTLAdapter <--> dmemNodes
+      // VortexBundle <--> sourceId filter <--> VortexTLAdapter <--> dmemNodes
-    (core.io.dmem.get zip dmemTLAdapters) foreach { case (coreMem, tlAdapter) =>
+      (core.io.dmem.get zip dmemTLAdapters) foreach { case (coreMem, tlAdapter) =>
-      tlAdapter.io.inReq <> coreMem.a
+        tlAdapter.io.inReq <> coreMem.a
-      coreMem.d <> tlAdapter.io.inResp
+        coreMem.d <> tlAdapter.io.inResp
-    }
+      }
-    (core.io.dmem.get zip dmemTLAdapters).zipWithIndex.foreach {
+      // override response channel with matchingSources
-      case ((coreMem, tlAdapter), i) =>
+      (core.io.dmem.get zip dmemTLAdapters).zipWithIndex.foreach {
-        coreMem.d.valid := tlAdapter.io.inResp.valid && matchingSources(i)
+        case ((coreMem, tlAdapter), i) =>
-        tlAdapter.io.inResp.ready := coreMem.d.ready && matchingSources(i)
+          coreMem.d.valid := tlAdapter.io.inResp.valid && matchingSources(i)
-    }
+          tlAdapter.io.inResp.ready := coreMem.d.ready && matchingSources(i)
-    (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) =>
+      }
-      tlOut.a <> tlAdapter.io.outReq
+      (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) =>
-      tlAdapter.io.outResp <> tlOut.d
+        tlOut.a <> tlAdapter.io.outReq
        tlAdapter.io.outResp <> tlOut.d
      }
      outer.dmemAggregateNode.out.foreach { bo =>
        dontTouch(bo._1.a)
        dontTouch(bo._1.d)
      }
    }
-    outer.dmemAggregateNode.out.foreach { bo =>
+    def connectSmem = {
-      dontTouch(bo._1.a)
+      // @perf: this would duplicate SourceGenerator table for every lane and eat
-      dontTouch(bo._1.d)
+      // up some area
      val smemTLBundles = outer.smemNodes.map(_.out.head._1)
      val smemTLAdapters = Seq.tabulate(outer.numLanes) { _ =>
        Module(
          new VortexTLAdapter(
            outer.smemSourceWidth,
            chiselTypeOf(core.io.smem.get(0).a.bits),
            chiselTypeOf(core.io.smem.get(0).d.bits),
            outer.smemNodes(0).out.head
          )
        )
      }
      (core.io.smem.get zip smemTLAdapters) foreach { case (coreMem, tlAdapter) =>
        tlAdapter.io.inReq <> coreMem.a
        coreMem.d <> tlAdapter.io.inResp
      }
      (smemTLAdapters zip smemTLBundles) foreach { case (tlAdapter, tlOut) =>
        tlOut.a <> tlAdapter.io.outReq
        tlAdapter.io.outResp <> tlOut.d
      }
    }
    connectImem
    connectDmem
    connectSmem
  }
  // TODO: generalize for useVxCache