radiance/src/main/scala/radiance/tile/VortexTile.scala

// See LICENSE.SiFive for license details.
// See LICENSE.Berkeley for license details.

package radiance.tile

import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config._
import freechips.rocketchip.devices.tilelink._
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.interrupts._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.rocket._
import freechips.rocketchip.subsystem.HierarchicalElementCrossingParamsLike
import freechips.rocketchip.util._
import freechips.rocketchip.prci.ClockSinkParameters
import freechips.rocketchip.regmapper.RegField
import freechips.rocketchip.tile._
import radiance.memory._
import gemmini.{Gemmini, GemminiCustomConfigs}

/** Tile-level parameters for a [[VortexTile]].
  *
  * `instantiate` constructs the tile from these parameters, and `uniqueName`
  * is `baseName` suffixed with `_<tileId>`.
  */
case class VortexTileParams(
    core: VortexCoreParams = VortexCoreParams(),
    useVxCache: Boolean = false,
    // TODO: want to use ICache/DCacheParams as well.  Both are left undefined
    // by default; the candidate defaults would be Some(ICacheParams()) and
    // Some(DCacheParams()).
    icache: Option[ICacheParams] = None,
    dcache: Option[DCacheParams] = None,
    btb: Option[BTBParams] = None, // Some(BTBParams()),
    dataScratchpadBytes: Int = 0,
    name: Option[String] = Some("vortex_tile"),
    tileId: Int = 0,
    beuAddr: Option[BigInt] = None,
    blockerCtrlAddr: Option[BigInt] = None,
    clockSinkParams: ClockSinkParameters = ClockSinkParameters(),
    boundaryBuffers: Option[RocketTileBoundaryBufferParams] = None
) extends InstantiableTileParams[VortexTile] {
  // require(icache.isDefined)
  // require(dcache.isDefined)

  // The fallback is only used when `name` is explicitly set to None; note that
  // it differs from the default value of `name` ("vortex_tile").
  val baseName: String = name.getOrElse("radiance_tile")
  val uniqueName: String = s"${baseName}_${tileId}"

  /** Builds the [[VortexTile]] described by these parameters. */
  def instantiate(
      crossing: HierarchicalElementCrossingParamsLike,
      lookup: LookupByHartIdImpl
  )(implicit p: Parameters): VortexTile =
    new VortexTile(this, crossing, lookup)
}

// TODO: move to VortexCore
// VortexTileParams extends TileParams, which requires a `core: CoreParams`
// field, so VortexCoreParams needs to extend CoreParams as well, requiring
// all of these fields to be initialized.  Most of this is unnecessary,
// though. TODO: see how BOOM does that
/** Core parameters handed to the tile through `VortexTileParams.core`.
  *
  * The constructor fields carry the values required by `CoreParams`; the
  * members in the body are derived from them or fixed.
  */
case class VortexCoreParams(
    bootFreqHz: BigInt = 0,
    useVM: Boolean = true,
    useUser: Boolean = false,
    useSupervisor: Boolean = false,
    useHypervisor: Boolean = false,
    useDebug: Boolean = true,
    useAtomics: Boolean = true,
    useAtomicsOnlyForIO: Boolean = false,
    useCompressed: Boolean = true,
    useRVE: Boolean = false,
    useConditionalZero: Boolean = false,
    nLocalInterrupts: Int = 0,
    useNMI: Boolean = false,
    nBreakpoints: Int = 1,
    useBPWatch: Boolean = false,
    mcontextWidth: Int = 0,
    scontextWidth: Int = 0,
    nPMPs: Int = 8,
    nPerfCounters: Int = 0,
    haveBasicCounters: Boolean = true,
    haveCFlush: Boolean = false,
    misaWritable: Boolean = true,
    nL2TLBEntries: Int = 0,
    nL2TLBWays: Int = 1,
    nPTECacheEntries: Int = 8,
    mtvecInit: Option[BigInt] = Some(BigInt(0)),
    mtvecWritable: Boolean = true,
    fastLoadWord: Boolean = true,
    fastLoadByte: Boolean = false,
    branchPredictionModeCSR: Boolean = false,
    clockGate: Boolean = false,
    mvendorid: Int = 0, // 0 means non-commercial implementation
    mimpid: Int = 0x20181004, // release date in BCD
    mulDiv: Option[MulDivParams] = Some(MulDivParams()),
    fpu: Option[FPUParams] = Some(FPUParams()),
    debugROB: Boolean = false, // if enabled, uses a C++ debug ROB to generate trace-with-wdata
    haveCease: Boolean = true, // non-standard CEASE instruction
    haveSimTimeout: Boolean = true // add plusarg for simulation timeout
) extends CoreParams {
  // Fixed values
  val haveFSDirty: Boolean = false
  val retireWidth: Int = 1
  val lrscCycles: Int = 80 // worst case is 14 mispredicted branches + slop
  val traceHasWdata: Boolean = false // ooo wb, so no wdata in trace

  // Values that follow from the constructor fields
  val pmpGranularity: Int = if (useHypervisor) 4096 else 4
  val instBits: Int = if (useCompressed) 16 else 32
  val fetchWidth: Int = if (useCompressed) 2 else 1
  // Evaluates to 1 in both cases: the fetch width is divided by the number of
  // parcels per instruction (2 with compressed support, 1 without).
  val decodeWidth: Int = if (useCompressed) fetchWidth / 2 else fetchWidth
}

/** Diplomacy-side description of a tile built around a Vortex core.
  *
  * This class only declares and connects TileLink nodes; the hardware is
  * created by [[VortexTileModuleImp]] (see `module` below).  It sets up:
  *   - a memory-mapped register node in a per-tile 4 KiB window,
  *   - client nodes for instruction memory (one), data memory (one per LSU
  *     lane) and shared memory (one per LSU lane), plus a single `memNode`
  *     that is connected to the master crossbar only when `useVxCache` is set,
  *   - an optional coalescer (`CoalescerKey`) and optional L1 caches
  *     (`VortexL1Key`) between the core-side nodes and the master crossbar,
  *   - shared-memory banks (`TLRAM`) behind a lanes-to-banks crossbar,
  *   - a Gemmini RoCC accelerator.
  *
  * The primary constructor is private; use the secondary constructor that
  * takes the implicit `Parameters`.
  */
class VortexTile private (
    val vortexParams: VortexTileParams,
    crossing: ClockCrossingType,
    lookup: LookupByHartIdImpl,
    q: Parameters
) extends BaseTile(vortexParams, crossing, lookup, q)
    with SinksExternalInterrupts
    with SourcesExternalNotifications {
  // Private constructor ensures altered LazyModule.p is used implicitly
  def this(
      params: VortexTileParams,
      crossing: HierarchicalElementCrossingParamsLike,
      lookup: LookupByHartIdImpl
  )(implicit p: Parameters) =
    this(params, crossing.crossingType, lookup, p)

  // No outward interrupt node (the bus error unit further below is commented
  // out).
  val intOutwardNode = None
  val slaveNode = TLIdentityNode()
  val masterNode = visibilityNode

  // Memory-mapped region for HTIF communication
  // We use fixed addresses instead of tohost/fromhost
  //
  // Each tile gets its own 4 KiB window at 0x7c000000 + 0x1000 * tileId.
  val regDevice =
    new SimpleDevice("vortex-reg", Seq(s"vortex-reg${tileParams.tileId}"))
  val regNode = TLRegisterNode(
    address = Seq(AddressSet(0x7c000000 + 0x1000 * tileParams.tileId, 0xfff)),
    device = regDevice,
    beatBytes = 4,
    concurrency = 1
  )

  // The register node hangs off the tile's slave crossbar through a
  // TLFragmenter.
  regNode := TLFragmenter(4, 64) := tlSlaveXbar.node

  require(
    p(SIMTCoreKey).isDefined,
    "SIMTCoreKey not defined; make sure to use WithSimtLanes when using VortexTile"
  )
  // NOTE: the `None` case is effectively unreachable because of the require
  // above; it is only there to make the match exhaustive.
  val numLanes = p(SIMTCoreKey) match {
    case Some(simtParam) => simtParam.nLanes
    case None            => 4
  }

  // CAUTION: imemSourceWidth is dependent on the ibuffer size.  We have to
  // make sure (1 << imemSourceWidth) is smaller than the per-warp ibuffer
  // size; otherwise, more requests than what ibuffer can accommodate can fire,
  // and responses might stall in the downstream.  This might cause issues when
  // there are also outstanding dmem responses that might get blocked from
  // going back to the core by a previous imem response due to serialization at
  // the narrow tile<->sbus port, leading to a deadlock.
  //
  // This condition should ideally be asserted at elaboration time, but since
  // ibuffer size is set as a hardcoded macro IBUF_SIZE that's uncontrollable
  // from Chisel, there's no easy solution.  We at least don't expose this as a
  // Parameter and leave as a hardcoded value here.
  val imemSourceWidth = 4 // 1 << imemSourceWidth == IBUF_SIZE

  // Number of TileLink source-id bits per dmem lane, derived from the
  // configured number of source ids.  As with numLanes, the `None` case is
  // effectively unreachable after the require above.
  val dmemSourceWidth = p(SIMTCoreKey) match {
    // TODO: respect coalescer newSrcIds
    case Some(simtParam) => log2Ceil(simtParam.nSrcIds)
    case None            => 4
  }
  // require(
  //   dmemSourceWidth >= 4,
  //   "Setting a small number of sourceIds may cause correctness bug inside " +
  //     "Vortex core due to synchronization issues in vx_wspawn. " +
  //     "We recommend setting nSrcIds to at least 16."
  // )

  val smemSourceWidth = 4 // FIXME: hardcoded

  // Widths of the tags carried by the core's memory requests.  These are
  // computed here from hardcoded constants and must agree with the values
  // used on the Verilog side (see VX_gpu_pkg.sv).
  val numWarps = 4 // TODO: parametrize
  // At least one bit is used for the warp index, even with a single warp.
  val NW_WIDTH = (if (numWarps == 1) 1 else log2Ceil(numWarps))
  val UUID_WIDTH = 44
  val imemTagWidth = UUID_WIDTH + NW_WIDTH
  val numLsuLanes = 4
  // see VX_gpu_pkg.sv
  val LSUQ_SIZE = 8 * (numLanes / numLsuLanes)
  val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
  val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
  // dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH
  val smemTagWidth = dmemTagWidth

  // Instruction-memory client node.  Kept as a one-element Seq; the code
  // below asserts that the length is exactly 1.
  val imemNodes = Seq.tabulate(1) { i =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          clients = Seq(
            TLMasterParameters.v1(
              sourceId = IdRange(0, 1 << imemSourceWidth),
              name = s"Vortex Core ${vortexParams.tileId} I-Mem $i",
              requestFifo = true,
              supportsProbe =
                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
              supportsGet = TransferSizes(1, lazyCoreParamsView.coreDataBytes)
            )
          )
        )
      )
    )
  }

  // One data-memory client node per LSU lane.
  val dmemNodes = Seq.tabulate(numLsuLanes) { i =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          clients = Seq(
            TLMasterParameters.v1(
              sourceId = IdRange(0, 1 << dmemSourceWidth),
              name = s"Vortex Core ${vortexParams.tileId} D-Mem Lane $i",
              requestFifo = true,
              supportsProbe =
                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
              supportsGet = TransferSizes(1, lazyCoreParamsView.coreDataBytes),
              supportsPutFull =
                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
              supportsPutPartial =
                TransferSizes(1, lazyCoreParamsView.coreDataBytes)
            )
          )
        )
      )
    )
  }

  // One shared-memory client node per LSU lane.
  val smemNodes = Seq.tabulate(numLsuLanes) { i =>
    TLClientNode(
      Seq(
        TLMasterPortParameters.v1(
          clients = Seq(
            TLMasterParameters.v1(
              sourceId = IdRange(0, 1 << smemSourceWidth),
              name = s"Vortex Core ${vortexParams.tileId} SharedMem Lane $i",
              requestFifo = true,
              supportsProbe =
                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
              supportsGet = TransferSizes(1, lazyCoreParamsView.coreDataBytes),
              supportsPutFull =
                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
              supportsPutPartial =
                TransferSizes(1, lazyCoreParamsView.coreDataBytes)
            )
          )
        )
      )
    )
  }

  // combine outgoing per-lane dmemNode into 1 identity node
  //
  // NOTE: We need TLWidthWidget here because there might be a data width
  // mismatch between Vortex's per-lane response and the system bus when we
  // don't instantiate either L1 or the coalescer.  This _should_ be optimized
  // out when we instantiate either which should handle data width conversion
  // internally (which it does by... using TLWidthWidget).
  val dmemAggregateNode = TLIdentityNode()
  dmemNodes.foreach { dmemAggregateNode := TLWidthWidget(4) := _ }

  // Single client node that is connected to the master crossbar only when
  // `useVxCache` is set (see the tlMasterXbar hookup further below).
  val memNode = TLClientNode(
    Seq(
      TLMasterPortParameters.v1(
        clients = Seq(
          TLMasterParameters.v1(
            // FIXME: need to also respect imemSourceWidth
            sourceId = IdRange(0, 1 << dmemSourceWidth),
            name = s"Vortex Core ${vortexParams.tileId} Mem Interface",
            requestFifo = true,
            supportsProbe = TransferSizes(16, 16), // FIXME: hardcoded
            supportsGet = TransferSizes(16, 16),
            supportsPutFull = TransferSizes(16, 16),
            supportsPutPartial = TransferSizes(16, 16)
          )
        )
      )
    )
  )

  // Conditionally instantiate memory coalescer
  //
  // When CoalescerKey is defined, the per-lane dmem traffic is routed through
  // a CoalescingUnit and its aggregate node is used downstream; otherwise the
  // plain dmemAggregateNode is used.
  val coalescerNode = p(CoalescerKey) match {
    case Some(coalParam) => {
      val coal = LazyModule(
        new CoalescingUnit(coalParam)
      )
      coal.cpuNode :=* dmemAggregateNode
      coal.aggregateNode // N+1 lanes
    }
    case None => dmemAggregateNode
  }

  // Conditionally instantiate L1 cache
  //
  // Yields the pair of nodes that is attached to the master crossbar for
  // instruction and data traffic: the L1 caches' master nodes when
  // VortexL1Key is defined, otherwise the width-adapted imem node and the
  // coalescer node.
  val (icacheNode, dcacheNode): (TLNode, TLNode) = p(VortexL1Key) match {
    case Some(vortexL1Config) => {
      println(
        s"============ Using Vortex L1 cache ================="
      )
      // require(
      //   p(CoalescerKey).isDefined,
      //   "Vortex L1 configuration currently only works when coalescer is also enabled."
      // )

      val icache = LazyModule(new VortexL1Cache(vortexL1Config))
      val dcache = LazyModule(new VortexL1Cache(vortexL1Config))
      // imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ }
      assert(imemNodes.length == 1) // FIXME
      icache.coresideNode := TLWidthWidget(4) := imemNodes(0)
      // dmemNodes go through coalescerNode
      dcache.coresideNode :=* coalescerNode
      (icache.masterNode, dcache.masterNode)
    }
    case None => {
      val imemWideNode = TLIdentityNode()
      assert(imemNodes.length == 1) // FIXME
      imemWideNode := TLWidthWidget(4) := imemNodes(0)
      (imemWideNode, coalescerNode)
    }
  }

  // Instantiate sharedmem banks
  //
  // Instantiate the same number of banks as there are lanes.
  // TODO: parametrize
  val smemBanks = Seq.tabulate(numLsuLanes) { bankId =>
    // Banked-by-word (4 bytes)
    // base for bank 1: ff...000000|01|00
    // mask for bank 1; 00...111111|00|11
    //
    // With numLsuLanes = 4 the mask evaluates to 0x1fff ^ 0xc = 0x1ff3, i.e.
    // address bits [3:2] are fixed per bank by `base` and select the bank.
    val base = 0xff000000L | (bankId * 4 /*wordSize*/ )
    val mask = 0x00001fffL ^ ((numLsuLanes - 1) * 4 /*wordSize*/ )
    LazyModule(new TLRAM(AddressSet(base, mask), beatBytes = 4 /*wordSize*/ ))
  }
  // smem lanes-to-banks crossbar
  val smemXbar = LazyModule(new TLXbar)
  smemNodes.foreach(smemXbar.node := _)
  smemBanks.foreach(_.node := smemXbar.node)

  // Attach the core's memory traffic to the tile's master crossbar: either
  // the single memNode (useVxCache) or the icache/dcache node pair selected
  // above.
  if (vortexParams.useVxCache) {
    tlMasterXbar.node := TLWidthWidget(16) := memNode
  } else {
    // imemNodes.foreach { tlMasterXbar.node := TLWidthWidget(4) := _ }
    tlMasterXbar.node :=* icacheNode
    tlMasterXbar.node :=* dcacheNode
  }

  // ROCC
  // TODO: parametrize
  //
  // A single Gemmini instance is attached: its atlNode goes to the master
  // crossbar, its tlNode to tlOtherMastersNode, and its stlNode is fed from
  // the shared-memory crossbar.
  val gemmini = LazyModule(new Gemmini(GemminiCustomConfigs.unifiedMemConfig))
  val roccs: Seq[LazyRoCC] = Seq(gemmini)
  tlMasterXbar.node :=* gemmini.atlNode
  tlOtherMastersNode :=* gemmini.tlNode

  gemmini.stlNode :=* TLWidthWidget(4) :=* smemXbar.node

  /* below are copied from rocket */

  // val bus_error_unit = vortexParams.beuAddr map { a =>
  //   val beu =
  //     LazyModule(new BusErrorUnit(new L1BusErrors, BusErrorUnitParams(a)))
  //   intOutwardNode := beu.intNode
  //   connectTLSlave(beu.node, xBytes)
  //   beu
  // }

  // Optional bus blocker on the master path, instantiated only when
  // blockerCtrlAddr is defined.
  val tile_master_blocker =
    tileParams.blockerCtrlAddr
      .map(
        BasicBusBlockerParams(_, xBytes, masterPortBeatBytes, deadlock = true)
      )
      .map(bp => LazyModule(new BasicBusBlocker(bp)))

  tile_master_blocker.foreach(lm => connectTLSlave(lm.controlNode, xBytes))

  // TODO: this doesn't block other masters, e.g. RoCCs
  //
  // The master crossbar reaches tlOtherMastersNode through the blocker when
  // one exists, and directly otherwise.
  tlOtherMastersNode := tile_master_blocker.map {
    _.node := tlMasterXbar.node
  } getOrElse { tlMasterXbar.node }
  masterNode :=* tlOtherMastersNode
  DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }

  // Both properties are empty here; the Rocket originals are kept as
  // comments for reference.
  val dtimProperty =
    Nil // Seq(dmemDevice.asProperty).flatMap(p => Map("sifive,dtim" -> p))

  val itimProperty =
    Nil // frontend.icache.itimProperty.toSeq.flatMap(p => Map("sifive,itim" -> p))

  // missing bus_error_unit

  // Device-tree "cpu" device for this tile; its description extends the base
  // description with the cpu, next-level-cache, tile, dtim and itim
  // properties.
  val cpuDevice: SimpleDevice = new SimpleDevice(
    "cpu",
    Seq(s"sifive,vortex${tileParams.tileId}", "riscv")
  ) {
    override def parent = Some(ResourceAnchors.cpus)
    override def describe(resources: ResourceBindings): Description = {
      val Description(name, mapping) = super.describe(resources)
      Description(
        name,
        mapping ++ cpuProperties ++ nextLevelCacheProperty
          ++ tileProperties ++ dtimProperty ++ itimProperty /*++ beuProperty*/
      )
    }
  }

  ResourceBinding {
    Resource(cpuDevice, "reg").bind(ResourceAddress(tileId))
  }

  override lazy val module = new VortexTileModuleImp(this)

  // Buffering on the tile's master boundary, selected by `boundaryBuffers`
  // and the crossing type:
  //   - Some(RocketTileBoundaryBufferParams(true)): a default TLBuffer,
  //   - Some(RocketTileBoundaryBufferParams(false)) with a RationalCrossing:
  //     a TLBuffer with per-channel settings (positional arguments;
  //     presumably channels a, b, c, d, e in that order -- confirm against
  //     TLBuffer),
  //   - anything else: TLBuffer(BufferParams.none).
  override def makeMasterBoundaryBuffers(
      crossing: ClockCrossingType
  )(implicit p: Parameters) = (vortexParams.boundaryBuffers, crossing) match {
    case (Some(RocketTileBoundaryBufferParams(true)), _) => TLBuffer()
    case (Some(RocketTileBoundaryBufferParams(false)), _: RationalCrossing) =>
      TLBuffer(
        BufferParams.none,
        BufferParams.flow,
        BufferParams.none,
        BufferParams.flow,
        BufferParams(1)
      )
    case _ => TLBuffer(BufferParams.none)
  }

  // Buffering on the tile's slave boundary; same selection scheme as
  // makeMasterBoundaryBuffers, but only the first positional buffer is a flow
  // buffer in the RationalCrossing case.
  override def makeSlaveBoundaryBuffers(
      crossing: ClockCrossingType
  )(implicit p: Parameters) = (vortexParams.boundaryBuffers, crossing) match {
    case (Some(RocketTileBoundaryBufferParams(true)), _) => TLBuffer()
    case (Some(RocketTileBoundaryBufferParams(false)), _: RationalCrossing) =>
      TLBuffer(
        BufferParams.flow,
        BufferParams.none,
        BufferParams.none,
        BufferParams.none,
        BufferParams.none
      )
    case _ => TLBuffer(BufferParams.none)
  }
}

/** Hardware implementation of [[VortexTile]].
  *
  * Instantiates the Vortex core, exposes `core.io.finished` through the
  * tile's register node, forwards cease/WFI/interrupt signals, and translates
  * the core's memory interfaces to the TileLink nodes declared in `outer`
  * using [[VortexTLAdapter]]s.  The RoCC accelerators in `outer.roccs` are
  * wired to a command router and response arbiter whose core-facing sides are
  * currently tied to DontCare.
  */
class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
  Annotated.params(this, outer.vortexParams)

  val core = Module(new Vortex(outer)(outer.p))

  core.io.clock := clock
  core.io.reset := reset

  // begin @copypaste from RocketTile ------------------------------------------

  // reset vector is connected in the Frontend to s2_pc
  core.io.reset_vector := DontCare

  // Offset 0x00 of the tile's register window is a 32-bit read-only field
  // that reads `core.io.finished`.
  outer.regNode.regmap(
    0x00 -> Seq(RegField.r(32, core.io.finished))
  )

  // Report when the tile has ceased to retire instructions
  outer.reportCease(Some(core.io.finished))

  outer.reportWFI(Some(core.io.wfi))

  outer.decodeCoreInterrupts(core.io.interrupts) // Decode the interrupt vector

  // outer.bus_error_unit.foreach { beu =>
  //   core.io.interrupts.buserror.get := beu.module.io.interrupt
  // }

  core.io.interrupts.nmi.foreach { nmi => nmi := outer.nmiSinkNode.get.bundle }

  // Pass through various external constants and reports that were bundle-bridged into the tile
  // outer.traceSourceNode.bundle <> core.io.trace
  core.io.traceStall := outer.traceAuxSinkNode.bundle.stall
  // outer.bpwatchSourceNode.bundle <> core.io.bpwatch

  // not necessary for Vortex as hartId is set via Verilog parameter
  // core.io.hartid := outer.hartIdSinkNode.bundle
  // require(core.io.hartid.getWidth >= outer.hartIdSinkNode.bundle.getWidth,
  //   s"core hartid wire (${core.io.hartid.getWidth}b) truncates external hartid wire (${outer.hartIdSinkNode.bundle.getWidth}b)")

  // end @copypaste from RocketTile --------------------------------------------

  // ---------------------------------------------
  // Translate Vortex memory interface to TileLink
  // ---------------------------------------------

  if (outer.vortexParams.useVxCache) {
    // Single-port mode: the core exposes one `mem` interface, which is
    // adapted onto outer.memNode.
    println(s"width of a channel data ${core.io.mem.get.a.bits.data.getWidth}")
    println(s"width of d channel data ${core.io.mem.get.d.bits.data.getWidth}")

    val memTLAdapter = Module(
      new VortexTLAdapter(
        outer.dmemSourceWidth,
        chiselTypeOf(core.io.mem.get.a.bits),
        chiselTypeOf(core.io.mem.get.d.bits),
        outer.memNode.out.head
      )
    )

    // connection: VortexBundle <--> VortexTLAdapter <--> TL memNode
    memTLAdapter.io.inReq <> core.io.mem.get.a
    core.io.mem.get.d <> memTLAdapter.io.inResp
    outer.memNode.out(0)._1.a <> memTLAdapter.io.outReq
    memTLAdapter.io.outResp <> outer.memNode.out(0)._1.d
  } else {
    // Split mode: separate imem, per-lane dmem and per-lane smem interfaces.
    // The three helpers below are defined first and invoked at the end of
    // this branch.

    // Adapts the core's single imem port onto outer.imemNodes(0).
    def connectImem = {
      val imemTLAdapter = Module(
        new VortexTLAdapter(
          outer.imemSourceWidth,
          chiselTypeOf(core.io.imem.get(0).a.bits),
          chiselTypeOf(core.io.imem.get(0).d.bits),
          outer.imemNodes.head.out.head
        )
      )
      // TODO: make imemNodes not a vector
      imemTLAdapter.io.inReq <> core.io.imem.get(0).a
      core.io.imem.get(0).d <> imemTLAdapter.io.inResp
      outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq
      imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d
    }

    // Adapts the core's flattened per-lane dmem interface onto
    // outer.dmemNodes, one VortexTLAdapter per lane, and gates responses so
    // that only lanes whose response tag matches the arbitrated tag are
    // exchanged with the core in a given cycle.
    def connectDmem = {
      // @perf: this would duplicate SourceGenerator table for every lane and eat
      // up some area
      val dmemTLBundles = outer.dmemNodes.map(_.out.head._1)
      // NOTE(review): every lane's adapter is parameterized with lane 0's
      // (bundle, edge) pair; presumably all lanes have identical TileLink
      // parameters -- confirm.
      val dmemTLAdapters = Seq.tabulate(outer.numLsuLanes) { _ =>
        Module(
          new VortexTLAdapter(
            outer.dmemSourceWidth,
            new VortexBundleA(tagWidth = outer.dmemTagWidth, dataWidth = 32),
            new VortexBundleD(tagWidth = outer.dmemTagWidth, dataWidth = 32),
            outer.dmemNodes(0).out.head
          )
        )
      }

      // Since the individual per-lane TL requests might come back out-of-sync between
      // the lanes, but Vortex core expects the per-lane responses to be synced,
      // we need to selectively fire responses that have the same source, and
      // delay others.
      //
      // In order to do that, we pick a source from one of the valid lanes using e.g.
      // an arbiter.  Then using the chosen source id, we
      // - lie to core that response is not valid if source doesn't match picked, and
      // - lie to downstream that core is not ready if source doesn't match picked.
      //
      // Note that we cannot do this filtering logic using TileLink source ID, because
      // we're allocating source for each lane independently.  In that case, it's
      // possible that lane 0's source matches lane 1/2/3's source by chance,
      // even when they originated from different warps.  Using Vortex's dcache req tag
      // solves this issue because they use a UUID that is unique across all requests
      // in the program.
      //
      // TODO: A cleaner solution would be to simply do a synchronized allocation
      // of a same source id for all lanes.
      val arb = Module(
        new RRArbiter(
          // FIXME: should really be source on D channel
          new VortexBundleA(tagWidth = outer.dmemTagWidth, dataWidth = 32).source.cloneType,
          outer.numLsuLanes
        )
      )
      // The arbiter output is only observed, never back-pressured.
      arb.io.out.ready := true.B
      val dmemBundles = dmemTLAdapters.map(_.io.inResp)
      (arb.io.in zip dmemBundles).foreach { case (arbIn, vxDmem) =>
        arbIn.valid := vxDmem.valid
        arbIn.bits := vxDmem.bits.source
      }
      // One bit per lane: set when that lane's response tag equals the
      // arbitrated tag (or when no lane has a valid response at all).
      val matchingSources = Wire(UInt(outer.numLsuLanes.W))
      matchingSources := dmemBundles
        .map(b =>
          // If there is no valid response pending across all lanes,
          // matchingSources should not filter out upstream ready signals, so
          // set it to all-1
          !arb.io.out.valid || (b.bits.source === arb.io.out.bits)
        )
        .asUInt

      // make connection:
      // VortexBundle <--> sourceId filter <--> VortexTLAdapter <--> dmemNodes
      //
      // Chisel doesn't support 2-D array in BlackBox interface to Verilog, so
      // need to flatten everything.
      //
      // Lane i occupies a fixed-width slice of each flattened core port:
      // 3 bits of opcode, 4 bits of size, dmemTagWidth bits of source,
      // 32 bits of address, 4 bits of mask and 32 bits of data.
      dmemTLAdapters.zipWithIndex.foreach {
        case (tlAdapter, i) =>
          // tlAdapter.io.inReq <> coreMem.a
          tlAdapter.io.inReq.valid := core.io.dmem_a_valid(i)
          tlAdapter.io.inReq.bits.opcode := core.io.dmem_a_bits_opcode(3 * (i + 1) - 1, 3 * i)
          tlAdapter.io.inReq.bits.size := core.io.dmem_a_bits_size(4 * (i + 1) - 1, 4 * i)
          tlAdapter.io.inReq.bits.source := core.io.dmem_a_bits_source(outer.dmemTagWidth * (i + 1) - 1, outer.dmemTagWidth * i)
          tlAdapter.io.inReq.bits.address := core.io.dmem_a_bits_address(32 * (i + 1) - 1, 32 * i)
          tlAdapter.io.inReq.bits.mask := core.io.dmem_a_bits_mask(4 * (i + 1) - 1, 4 * i)
          tlAdapter.io.inReq.bits.data := core.io.dmem_a_bits_data(32 * (i + 1) - 1, 32 * i)
      }
      core.io.dmem_a_ready := dmemTLAdapters.map(_.io.inReq.ready).asUInt

      // NOTE: this dmem_d_valid connection is superseded by the filtered
      // connection further below (the last connection wins).
      core.io.dmem_d_valid := dmemTLAdapters.map(_.io.inResp.valid).asUInt
      core.io.dmem_d_bits_opcode := dmemTLAdapters.map(_.io.inResp.bits.opcode).asUInt
      core.io.dmem_d_bits_size := dmemTLAdapters.map(_.io.inResp.bits.size).asUInt
      core.io.dmem_d_bits_source := dmemTLAdapters.map(_.io.inResp.bits.source).asUInt
      core.io.dmem_d_bits_data := dmemTLAdapters.map(_.io.inResp.bits.data).asUInt

      // override response channel with matchingSources
      val dmem_d_valid_vec = Wire(Vec(outer.numLsuLanes, Bool()))
      dmemTLAdapters.zipWithIndex.foreach {
        case (tlAdapter, i) =>
          dmem_d_valid_vec(i) := tlAdapter.io.inResp.valid && matchingSources(i)
          tlAdapter.io.inResp.ready := core.io.dmem_d_ready(i) && matchingSources(i)
      }
      core.io.dmem_d_valid := dmem_d_valid_vec.asUInt

      (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlOut) =>
        tlOut.a <> tlAdapter.io.outReq
        tlAdapter.io.outResp <> tlOut.d
      }

      // Mark the aggregate node's A/D channels with dontTouch.
      outer.dmemAggregateNode.out.foreach { bo =>
        dontTouch(bo._1.a)
        dontTouch(bo._1.d)
      }
    }

    // Adapts the core's flattened per-lane smem interface onto
    // outer.smemNodes, one VortexTLAdapter per lane.  Unlike connectDmem,
    // responses are passed through without any cross-lane filtering.
    def connectSmem = {
      // @perf: this would duplicate SourceGenerator table for every lane and eat
      // up some area
      val smemTLBundles = outer.smemNodes.map(_.out.head._1)
      // NOTE(review): as in connectDmem, every lane's adapter is built from
      // lane 0's (bundle, edge) pair -- confirm that all lanes share the same
      // parameters.
      val smemTLAdapters = Seq.tabulate(outer.numLsuLanes) { _ =>
        Module(
          new VortexTLAdapter(
            outer.smemSourceWidth,
            new VortexBundleA(tagWidth = outer.smemTagWidth, dataWidth = 32),
            new VortexBundleD(tagWidth = outer.smemTagWidth, dataWidth = 32),
            outer.smemNodes(0).out.head
          )
        )
      }

      // Same flattened per-lane slicing as for dmem, using smemTagWidth for
      // the source field.
      smemTLAdapters.zipWithIndex.foreach {
        case (tlAdapter, i) =>
          // tlAdapter.io.inReq <> coreMem.a
          tlAdapter.io.inReq.valid := core.io.smem_a_valid(i)
          tlAdapter.io.inReq.bits.opcode := core.io.smem_a_bits_opcode(3 * (i + 1) - 1, 3 * i)
          tlAdapter.io.inReq.bits.size := core.io.smem_a_bits_size(4 * (i + 1) - 1, 4 * i)
          tlAdapter.io.inReq.bits.source := core.io.smem_a_bits_source(outer.smemTagWidth * (i + 1) - 1, outer.smemTagWidth * i)
          tlAdapter.io.inReq.bits.address := core.io.smem_a_bits_address(32 * (i + 1) - 1, 32 * i)
          tlAdapter.io.inReq.bits.mask := core.io.smem_a_bits_mask(4 * (i + 1) - 1, 4 * i)
          tlAdapter.io.inReq.bits.data := core.io.smem_a_bits_data(32 * (i + 1) - 1, 32 * i)
      }
      core.io.smem_a_ready := smemTLAdapters.map(_.io.inReq.ready).asUInt

      core.io.smem_d_valid := smemTLAdapters.map(_.io.inResp.valid).asUInt
      core.io.smem_d_bits_opcode := smemTLAdapters.map(_.io.inResp.bits.opcode).asUInt
      core.io.smem_d_bits_size := smemTLAdapters.map(_.io.inResp.bits.size).asUInt
      core.io.smem_d_bits_source := smemTLAdapters.map(_.io.inResp.bits.source).asUInt
      core.io.smem_d_bits_data := smemTLAdapters.map(_.io.inResp.bits.data).asUInt
      smemTLAdapters.zipWithIndex.foreach {
        case (tlAdapter, i) =>
          tlAdapter.io.inResp.ready := core.io.smem_d_ready(i)
      }

      (smemTLAdapters zip smemTLBundles) foreach { case (tlAdapter, tlOut) =>
        tlOut.a <> tlAdapter.io.outReq
        tlAdapter.io.outResp <> tlOut.d
      }
    }

    connectImem
    connectDmem
    connectSmem
  }

  // TODO: generalize for useVxCache
  // NOTE: the block below is intentionally empty (placeholder).
  if (!outer.vortexParams.useVxCache) {}

  // RoCC
  //
  // Each accelerator gets its command from the router and sends its response
  // (through a Queue) to the round-robin arbiter.  Its ptw and mem ports are
  // tied to DontCare and its FPU interface is tied off.  The router input,
  // the arbiter output and the exception inputs are also DontCare, i.e. they
  // are not driven from the core here.
  if (outer.roccs.size > 0) {
    val (respArb, cmdRouter) = {
      val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size))
      val cmdRouter = Module(new RoccCommandRouter(outer.roccs.map(_.opcodes))(outer.p))
      outer.roccs.zipWithIndex.foreach { case (rocc, i) =>
        // ptwPorts ++= rocc.module.io.ptw
        rocc.module.io.ptw <> DontCare
        rocc.module.io.mem <> DontCare
        rocc.module.io.cmd <> cmdRouter.io.out(i)
        respArb.io.in(i) <> Queue(rocc.module.io.resp)
      }
      // Create this FPU just for RoCC
      // val nFPUPorts = outer.roccs.filter(_.usesFPU).size
      val fp_rocc_ios = outer.roccs.map(_.module.io)
      fp_rocc_ios.map{ io =>
        io.fpu_req.ready := false.B
        io.fpu_resp.valid := false.B
        io.fpu_resp.bits := DontCare
      }
      (respArb, cmdRouter)
    }

    cmdRouter.io.in <> DontCare
    outer.roccs.foreach(_.module.io.exception := DontCare)
    respArb.io.out <> DontCare
  }
}

// Some @copypaste from CoalescerSourceGen.
/** Adapts one Vortex request/response channel pair to the A/D channels of a
  * TileLink port.
  *
  * The upstream request carries a Vortex tag in `source`.  That tag is given
  * to a [[SourceGenerator]] as metadata, and the generator's id is used as
  * the TileLink source instead.  Requests are held back while the generator
  * has no id available.  On the response path the TileLink source is
  * reclaimed and the generator's `peek` output is returned upstream as the
  * response source.
  *
  * @param newSourceWidth width handed to the [[SourceGenerator]]
  * @param inReqT         type of the upstream request payload
  * @param inRespT        type of the upstream response payload
  * @param outTL          (bundle, edge) of the downstream TileLink port; the
  *                       bundle provides the port types and the edge is used
  *                       to derive the request mask
  */
class VortexTLAdapter(
    newSourceWidth: Int,
    inReqT: VortexBundleA,
    inRespT: VortexBundleD,
    outTL: (TLBundle, TLEdge)
) extends Module {
  val io = IO(new Bundle {
    // in/out means upstream/downstream
    val inReq = Flipped(Decoupled(inReqT))
    val outReq = chiselTypeOf(outTL._1.a)
    val inResp = Decoupled(inRespT)
    val outResp = chiselTypeOf(outTL._1.d)
  })
  val (bundle, edge) = outTL
  val sourceGen = Module(
    new SourceGenerator(newSourceWidth, Some(inReqT.source), ignoreInUse = false)
  )

  // ---- source id bookkeeping ----------------------------------------------
  sourceGen.io.meta := io.inReq.bits.source
  // use up a source ID only when request is created
  sourceGen.io.gen := io.outReq.fire
  sourceGen.io.reclaim.bits := io.outResp.bits.source
  sourceGen.io.reclaim.valid := io.outResp.fire

  // ---- request path: VortexBundleA -> TLBundleA -----------------------------
  // "man-in-the-middle": the handshake only proceeds while a source id is
  // available, and the generated id replaces the upstream source.
  io.outReq.valid := io.inReq.valid && sourceGen.io.id.valid
  io.inReq.ready := io.outReq.ready && sourceGen.io.id.valid
  io.outReq.bits.source := sourceGen.io.id.bits

  io.outReq.bits.opcode := io.inReq.bits.opcode
  io.outReq.bits.param := 0.U
  io.outReq.bits.size := io.inReq.bits.size
  io.outReq.bits.address := io.inReq.bits.address
  io.outReq.bits.data := io.inReq.bits.data
  io.outReq.bits.corrupt := 0.U
  // Get requires contiguous mask; only copy core's potentially-partial mask
  // when writing, and generate a TL-correct mask otherwise.
  io.outReq.bits.mask := Mux(
    edge.hasData(io.outReq.bits),
    io.inReq.bits.mask,
    edge.mask(io.inReq.bits.address, io.inReq.bits.size)
  )

  // ---- response path: TLBundleD -> VortexBundleD ----------------------------
  // Filtering out write requests is handled inside the wrapper Verilog
  io.inResp.valid := io.outResp.valid
  io.outResp.ready := io.inResp.ready
  io.inResp.bits.opcode := io.outResp.bits.opcode
  io.inResp.bits.size := io.outResp.bits.size
  io.inResp.bits.data := io.outResp.bits.data
  // translate upstream response back to its old sourceId
  io.inResp.bits.source := sourceGen.io.peek
}