diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index a654c9b..59efba2 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit a654c9b8b23b72855a4077f356cc9f22c983a934
+Subproject commit 59efba2b7024910a5b35195bf38bf86120bf1403
diff --git a/src/main/scala/rocket/VortexCore.scala b/src/main/scala/rocket/VortexCore.scala
index 53bd707..366d8fd 100644
--- a/src/main/scala/rocket/VortexCore.scala
+++ b/src/main/scala/rocket/VortexCore.scala
@@ -8,10 +8,32 @@ import chisel3.util._
 import chisel3.experimental._
 import org.chipsalliance.cde.config.Parameters
 import freechips.rocketchip.tile._
-import freechips.rocketchip.util._
-import freechips.rocketchip.scie._
 import tile.VortexTile
 
+class VortexBundleA(
+  sourceWidth: Int, // width of the `source` field in bits
+  dataWidth: Int // width of the `data` field in bits; must be a multiple of 8
+) extends Bundle {
+  require(dataWidth % 8 == 0, s"dataWidth ($dataWidth) must be a multiple of 8")
+  val opcode = UInt(3.W) // FIXME: hardcoded
+  val size = UInt(4.W) // FIXME: hardcoded
+  val source = UInt(sourceWidth.W)
+  val address = UInt(32.W) // FIXME: hardcoded
+  val mask = UInt((dataWidth / 8).W) // one mask bit per byte of `data`
+  val data = UInt(dataWidth.W)
+}
+
+class VortexBundleD(
+  sourceWidth: Int, // width of the `source` field in bits
+  dataWidth: Int // width of the `data` field in bits; must be a multiple of 8
+) extends Bundle {
+  require(dataWidth % 8 == 0, s"dataWidth ($dataWidth) must be a multiple of 8")
+  val opcode = UInt(3.W) // FIXME: hardcoded
+  val size = UInt(4.W) // FIXME: hardcoded
+  val source = UInt(sourceWidth.W)
+  val data = UInt(dataWidth.W)
+}
+
 class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle {
   val clock = Input(Clock())
   val reset = Input(Reset())
@@ -20,17 +42,19 @@ class VortexBundle(tile: VortexTile)(implicit p: Parameters) extends CoreBundle
   val interrupts = Input(new CoreInterrupts())
   
   // conditionally instantiate ports depending on whether we want to use VX_cache or not
-  val imem = if (!tile.vortexParams.useVxCache) Some(Vec(1, new Bundle { // TODO: magic number
-    val a = tile.imemNodes.head.out.head._1.a.cloneType
-    val d = Flipped(tile.imemNodes.head.out.head._1.d.cloneType)
+  val imem = if (!tile.vortexParams.useVxCache) Some(Vec(1, new Bundle {
+    val a = Decoupled(new VortexBundleA(sourceWidth = 10, dataWidth = 32))
+    val d = Flipped(Decoupled(new VortexBundleD(sourceWidth = 10, dataWidth = 32)))
   })) else None
-  val dmem = if (!tile.vortexParams.useVxCache) Some(Vec(4, new Bundle {
-    val a = tile.dmemNodes.head.out.head._1.a.cloneType
-    val d = Flipped(tile.dmemNodes.head.out.head._1.d.cloneType)
+  val dmem = if (!tile.vortexParams.useVxCache) Some(Vec(tile.numLanes, new Bundle {
+    val a = Decoupled(new VortexBundleA(sourceWidth = 10, dataWidth = 32))
+    val d = Flipped(Decoupled(new VortexBundleD(sourceWidth = 10, dataWidth = 32)))
   })) else None
   val mem = if (tile.vortexParams.useVxCache) Some(new Bundle { 
-    val a = tile.memNode.out.head._1.a.cloneType
-    val d = Flipped(tile.memNode.out.head._1.d.cloneType)
+    val a = Decoupled(new VortexBundleA(sourceWidth = 15, dataWidth = 128))
+    val d = Flipped(Decoupled(new VortexBundleD(sourceWidth = 15, dataWidth = 128)))
+    // val a = tile.memNode.out.head._1.a.cloneType
+    // val d = Flipped(tile.memNode.out.head._1.d.cloneType)
   }) else None
 
   // val fpu = Flipped(new FPUCoreIO())
diff --git a/src/main/scala/tile/VortexTile.scala b/src/main/scala/tile/VortexTile.scala
index 7e0d570..4e33f3e 100644
--- a/src/main/scala/tile/VortexTile.scala
+++ b/src/main/scala/tile/VortexTile.scala
@@ -4,7 +4,7 @@
 package tile
 
 import chisel3._
-import chisel3.util.RRArbiter
+import chisel3.util._
 import org.chipsalliance.cde.config._
 import freechips.rocketchip.devices.tilelink._
 import freechips.rocketchip.diplomacy._
@@ -16,9 +16,7 @@ import freechips.rocketchip.util._
 import freechips.rocketchip.prci.ClockSinkParameters
 import freechips.rocketchip.regmapper.RegField
 import freechips.rocketchip.tile._
-import rocket.Vortex
-
-import scala.collection.mutable.ListBuffer
+import rocket.{Vortex, VortexBundleA, VortexBundleD}
 
 case class RocketTileBoundaryBufferParams(force: Boolean = false)
 
@@ -35,26 +33,31 @@ case class VortexTileParams(
     blockerCtrlAddr: Option[BigInt] = None,
     clockSinkParams: ClockSinkParameters = ClockSinkParameters(),
     boundaryBuffers: Option[RocketTileBoundaryBufferParams] = None
-    ) extends InstantiableTileParams[VortexTile] {
+) extends InstantiableTileParams[VortexTile] {
   // require(icache.isDefined)
-  // require(dcache.isDefined) 
+  // require(dcache.isDefined)
 
-  def instantiate(crossing: TileCrossingParamsLike, lookup: LookupByHartIdImpl)(implicit p: Parameters): VortexTile = {
+  def instantiate(crossing: TileCrossingParamsLike, lookup: LookupByHartIdImpl)(
+      implicit p: Parameters
+  ): VortexTile = {
     new VortexTile(this, crossing, lookup)
   }
 }
 
-class VortexTile private(
-        val vortexParams: VortexTileParams,
-        crossing: ClockCrossingType,
-        lookup: LookupByHartIdImpl,
-        q: Parameters)
-    extends BaseTile(vortexParams, crossing, lookup, q)
+class VortexTile private (
+    val vortexParams: VortexTileParams,
+    crossing: ClockCrossingType,
+    lookup: LookupByHartIdImpl,
+    q: Parameters
+) extends BaseTile(vortexParams, crossing, lookup, q)
     with SinksExternalInterrupts
-    with SourcesExternalNotifications
-{
+    with SourcesExternalNotifications {
   // Private constructor ensures altered LazyModule.p is used implicitly
-  def this(params: VortexTileParams, crossing: TileCrossingParamsLike, lookup: LookupByHartIdImpl)(implicit p: Parameters) =
+  def this(
+      params: VortexTileParams,
+      crossing: TileCrossingParamsLike,
+      lookup: LookupByHartIdImpl
+  )(implicit p: Parameters) =
     this(params, crossing.crossingType, lookup, p)
 
   val intOutwardNode = IntIdentityNode()
@@ -87,40 +90,70 @@ class VortexTile private(
     beatBytes = lazyCoreParamsView.coreDataBytes,
     minLatency = 1)))*/
 
-  val imemNodes = Seq.tabulate(1) { i => TLClientNode(Seq(TLMasterPortParameters.v1(
-    clients = Seq(TLMasterParameters.v1(
-      sourceId = IdRange(0, 1 << 10), // TODO magic number
-      name = s"Vortex Core ${vortexParams.hartId} I-Mem $i",
-      requestFifo = true,
-      supportsProbe = TransferSizes(1, lazyCoreParamsView.coreDataBytes),
-      supportsGet = TransferSizes(1, lazyCoreParamsView.coreDataBytes)
-    ))
-  )))}
+  val numLanes = 4 // TODO: use Parameters for this
+  val sourceWidth = 4 // TODO: use Parameters for this
 
-  val dmemNodes = Seq.tabulate(4) { i => TLClientNode(Seq(TLMasterPortParameters.v1(
-    clients = Seq(TLMasterParameters.v1(
-      sourceId = IdRange(0, 1 << 10), // TODO magic number
-      name = s"Vortex Core ${vortexParams.hartId} D-Mem Lane $i",
-      requestFifo = true,
-      supportsProbe = TransferSizes(1, lazyCoreParamsView.coreDataBytes),
-      supportsGet = TransferSizes(1, lazyCoreParamsView.coreDataBytes),
-      supportsPutFull = TransferSizes(1, lazyCoreParamsView.coreDataBytes),
-      supportsPutPartial = TransferSizes(1, lazyCoreParamsView.coreDataBytes)
-    ))
-  )))}
+  val imemNodes = Seq.tabulate(1) { i =>
+    TLClientNode(
+      Seq(
+        TLMasterPortParameters.v1(
+          clients = Seq(
+            TLMasterParameters.v1(
+              sourceId = IdRange(0, 1 << sourceWidth),
+              name = s"Vortex Core ${vortexParams.hartId} I-Mem $i",
+              requestFifo = true,
+              supportsProbe =
+                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
+              supportsGet = TransferSizes(1, lazyCoreParamsView.coreDataBytes)
+            )
+          )
+        )
+      )
+    )
+  }
+
+  val dmemNodes = Seq.tabulate(numLanes) { i =>
+    TLClientNode(
+      Seq(
+        TLMasterPortParameters.v1(
+          clients = Seq(
+            TLMasterParameters.v1(
+              sourceId = IdRange(0, 1 << sourceWidth),
+              name = s"Vortex Core ${vortexParams.hartId} D-Mem Lane $i",
+              requestFifo = true,
+              supportsProbe =
+                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
+              supportsGet = TransferSizes(1, lazyCoreParamsView.coreDataBytes),
+              supportsPutFull =
+                TransferSizes(1, lazyCoreParamsView.coreDataBytes),
+              supportsPutPartial =
+                TransferSizes(1, lazyCoreParamsView.coreDataBytes)
+            )
+          )
+        )
+      )
+    )
+  }
+
+  println(s"============= lazyCoreParamsView.coreDataBytes=${lazyCoreParamsView.coreDataBytes}")
+  val memNode = TLClientNode(
+    Seq(
+      TLMasterPortParameters.v1(
+        clients = Seq(
+          TLMasterParameters.v1(
+            sourceId = IdRange(0, 1 << sourceWidth),
+            name = s"Vortex Core ${vortexParams.hartId} Mem Interface",
+            requestFifo = true,
+            supportsProbe = TransferSizes(16, 16), // FIXME: hardcoded
+            supportsGet = TransferSizes(16, 16),
+            supportsPutFull = TransferSizes(16, 16),
+            supportsPutPartial = TransferSizes(16, 16)
+          )
+        )
+      )
+    )
+  )
 
-  val memNode = TLClientNode(Seq(TLMasterPortParameters.v1(
-    clients = Seq(TLMasterParameters.v1(
-      sourceId = IdRange(0, 1 << 15), // TODO magic numbers
-      name = s"Vortex Core ${vortexParams.hartId} Mem Interface",
-      requestFifo = true,
-      supportsProbe = TransferSizes(16, 16),
-      supportsGet = TransferSizes(16, 16),
-      supportsPutFull = TransferSizes(16, 16),
-      supportsPutPartial = TransferSizes(16, 16)
-    )),
-  )))
-  
   if (vortexParams.useVxCache) {
     tlMasterXbar.node := TLWidthWidget(16) := memNode
   } else {
@@ -131,7 +164,8 @@ class VortexTile private(
   /* below are copied from rocket */
 
   val bus_error_unit = vortexParams.beuAddr map { a =>
-    val beu = LazyModule(new BusErrorUnit(new L1BusErrors, BusErrorUnitParams(a)))
+    val beu =
+      LazyModule(new BusErrorUnit(new L1BusErrors, BusErrorUnitParams(a)))
     intOutwardNode := beu.intNode
     connectTLSlave(beu.node, xBytes)
     beu
@@ -139,13 +173,17 @@ class VortexTile private(
 
   val tile_master_blocker =
     tileParams.blockerCtrlAddr
-      .map(BasicBusBlockerParams(_, xBytes, masterPortBeatBytes, deadlock = true))
+      .map(
+        BasicBusBlockerParams(_, xBytes, masterPortBeatBytes, deadlock = true)
+      )
       .map(bp => LazyModule(new BasicBusBlocker(bp)))
 
   tile_master_blocker.foreach(lm => connectTLSlave(lm.controlNode, xBytes))
 
   // TODO: this doesn't block other masters, e.g. RoCCs
-  tlOtherMastersNode := tile_master_blocker.map { _.node := tlMasterXbar.node } getOrElse { tlMasterXbar.node }
+  tlOtherMastersNode := tile_master_blocker.map {
+    _.node := tlMasterXbar.node
+  } getOrElse { tlMasterXbar.node }
   masterNode :=* tlOtherMastersNode
   DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }
 
@@ -163,7 +201,6 @@ class VortexTile private(
       Description(name, mapping ++ cpuProperties ++ nextLevelCacheProperty
                   ++ tileProperties ++ dtimProperty ++ itimProperty ++ beuProperty)
     }
-  }
 
   ResourceBinding {
     Resource(cpuDevice, "reg").bind(ResourceAddress(staticIdForMetadataUseOnly))
@@ -171,15 +208,33 @@ class VortexTile private(
 
   override lazy val module = new VortexTileModuleImp(this)
 
-  override def makeMasterBoundaryBuffers(crossing: ClockCrossingType)(implicit p: Parameters) = (vortexParams.boundaryBuffers, crossing) match {
-    case (Some(RocketTileBoundaryBufferParams(true )), _)                   => TLBuffer()
-    case (Some(RocketTileBoundaryBufferParams(false)), _: RationalCrossing) => TLBuffer(BufferParams.none, BufferParams.flow, BufferParams.none, BufferParams.flow, BufferParams(1))
+  override def makeMasterBoundaryBuffers(
+      crossing: ClockCrossingType
+  )(implicit p: Parameters) = (vortexParams.boundaryBuffers, crossing) match {
+    case (Some(RocketTileBoundaryBufferParams(true)), _) => TLBuffer()
+    case (Some(RocketTileBoundaryBufferParams(false)), _: RationalCrossing) =>
+      TLBuffer(
+        BufferParams.none,
+        BufferParams.flow,
+        BufferParams.none,
+        BufferParams.flow,
+        BufferParams(1)
+      )
     case _ => TLBuffer(BufferParams.none)
   }
 
-  override def makeSlaveBoundaryBuffers(crossing: ClockCrossingType)(implicit p: Parameters) = (vortexParams.boundaryBuffers, crossing) match {
-    case (Some(RocketTileBoundaryBufferParams(true )), _)                   => TLBuffer()
-    case (Some(RocketTileBoundaryBufferParams(false)), _: RationalCrossing) => TLBuffer(BufferParams.flow, BufferParams.none, BufferParams.none, BufferParams.none, BufferParams.none)
+  override def makeSlaveBoundaryBuffers(
+      crossing: ClockCrossingType
+  )(implicit p: Parameters) = (vortexParams.boundaryBuffers, crossing) match {
+    case (Some(RocketTileBoundaryBufferParams(true)), _) => TLBuffer()
+    case (Some(RocketTileBoundaryBufferParams(false)), _: RationalCrossing) =>
+      TLBuffer(
+        BufferParams.flow,
+        BufferParams.none,
+        BufferParams.none,
+        BufferParams.none,
+        BufferParams.none
+      )
     case _ => TLBuffer(BufferParams.none)
   }
 }
@@ -188,7 +243,7 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
   Annotated.params(this, outer.vortexParams)
 
   val core = Module(new Vortex(outer)(outer.p))
-  
+
   core.io.clock := clock
   core.io.reset := reset
 
@@ -200,8 +255,7 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
   )
 
   // Report when the tile has ceased to retire instructions; for now the only cause is clock gating
-  outer.reportCease(outer.vortexParams.core.clockGate.option(
-    core.io.cease))
+  outer.reportCease(outer.vortexParams.core.clockGate.option(core.io.cease))
 
   outer.reportWFI(Some(core.io.wfi))
 
@@ -223,49 +277,103 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
   // require(core.io.hartid.getWidth >= outer.hartIdSinkNode.bundle.getWidth,
   //   s"core hartid wire (${core.io.hartid.getWidth}b) truncates external hartid wire (${outer.hartIdSinkNode.bundle.getWidth}b)")
 
+  // ---------------------------------------------
+  // Translate Vortex memory interface to TileLink
+  // ---------------------------------------------
+
   if (outer.vortexParams.useVxCache) {
     println(s"width of a channel data ${core.io.mem.get.a.bits.data.getWidth}")
     println(s"width of d channel data ${core.io.mem.get.d.bits.data.getWidth}")
-    core.io.mem.get.a <> outer.memNode.out.head._1.a
-    core.io.mem.get.d <> outer.memNode.out.head._1.d
-  }
-  else {
-    (core.io.imem.get zip outer.imemNodes).foreach { case (coreMem, tileNode) =>
-      coreMem.d <> tileNode.out.head._1.d
-      coreMem.a <> tileNode.out.head._1.a
-    }
 
-    // pick source id and:
+    val memTLAdapter =  Module(new VortexTLAdapter(
+      outer.sourceWidth,
+      chiselTypeOf(core.io.mem.get.a.bits),
+      chiselTypeOf(core.io.mem.get.d.bits),
+      chiselTypeOf(outer.memNode.out.head._1.a.bits),
+      chiselTypeOf(outer.memNode.out.head._1.d.bits),
+    ))
+
+    // connection: VortexBundle <--> VortexTLAdapter <--> TL memNode
+    memTLAdapter.io.inReq <> core.io.mem.get.a
+    core.io.mem.get.d <> memTLAdapter.io.inResp
+    outer.memNode.out(0)._1.a <> memTLAdapter.io.outReq
+    memTLAdapter.io.outResp <> outer.memNode.out(0)._1.d
+
+    // core.io.mem.get.a <> outer.memNode.out.head._1.a
+    // core.io.mem.get.d <> outer.memNode.out.head._1.d
+  } else {
+    val imemTLAdapter =  Module(new VortexTLAdapter(
+        outer.sourceWidth,
+        chiselTypeOf(core.io.imem.get(0).a.bits),
+        chiselTypeOf(core.io.imem.get(0).d.bits),
+        chiselTypeOf(outer.imemNodes.head.out.head._1.a.bits),
+        chiselTypeOf(outer.imemNodes.head.out.head._1.d.bits),
+    ))
+    // TODO: make imemNodes not a vector
+    imemTLAdapter.io.inReq <> core.io.imem.get(0).a
+    core.io.imem.get(0).d <> imemTLAdapter.io.inResp
+    outer.imemNodes(0).out(0)._1.a <> imemTLAdapter.io.outReq
+    imemTLAdapter.io.outResp <> outer.imemNodes(0).out(0)._1.d
+
+    // The individual per-lane TL responses might come back out of sync between
+    // the lanes, but the Vortex core expects the lanes to be in sync, so
+    // we need to selectively fire responses that have the same source, and
+    // delay the others.  Below is the logic that implements this.
+
+    // choose one source out of the arriving per-lane TL D channels
+    val arb = Module(
+      new RRArbiter(core.io.dmem.get.head.d.bits.source.cloneType, outer.numLanes)
+    )
+    val dmemTLBundles = outer.dmemNodes.map(_.out.head._1)
+    arb.io.out.ready := true.B
+    (arb.io.in zip dmemTLBundles).foreach { case (arbIn, tlBundle) =>
+      arbIn.valid := tlBundle.d.valid
+      arbIn.bits := tlBundle.d.bits.source
+    }
+    val matchingSources = Wire(UInt(outer.numLanes.W))
+    matchingSources := dmemTLBundles
+      .map(b => (b.d.bits.source === arb.io.out.bits) && arb.io.out.valid)
+      .asUInt
+
+    // connection: VortexBundle <--> VortexTLAdapter <--> dmemNodes
+    // @perf: this would duplicate SourceGenerator table for every lane and eat
+    // up some area
+    val dmemTLAdapters = Seq.tabulate(outer.numLanes) { _ =>
+      Module(new VortexTLAdapter(
+        outer.sourceWidth,
+        chiselTypeOf(core.io.dmem.get(0).a.bits),
+        chiselTypeOf(core.io.dmem.get(0).d.bits),
+        chiselTypeOf(dmemTLBundles.head.a.bits),
+        chiselTypeOf(dmemTLBundles.head.d.bits),
+      ))
+    }
+    (core.io.dmem.get zip dmemTLAdapters) foreach { case (coreMem, tlAdapter) =>
+      tlAdapter.io.inReq <> coreMem.a
+      coreMem.d <> tlAdapter.io.inResp
+    }
+    (dmemTLAdapters zip dmemTLBundles) foreach { case (tlAdapter, tlBundle) =>
+      tlBundle.a <> tlAdapter.io.outReq
+    }
+    // using the chosen source id,
     // - lie to core that response is not valid if source doesn't match picked
     // - lie to downstream that core is not ready if source doesn't match picked
-
-    val arb = Module(new RRArbiter(core.io.dmem.get.head.d.bits.source.cloneType, 4))
-    val matchingSources = Wire(UInt(4.W))
-    val dmemDs = outer.dmemNodes.map(_.out.head._1.d)
-
-    (arb.io.in zip dmemDs).zipWithIndex.foreach { case ((arbIn, tileNode), i) =>
-      arbIn.valid := tileNode.valid
-      arbIn.bits := tileNode.bits.source
-    }
-    matchingSources := dmemDs.map(d => (d.bits.source === arb.io.out.bits) && arb.io.out.valid).asUInt
-    arb.io.out.ready := true.B
-
-    (core.io.dmem.get zip dmemDs).zipWithIndex.foreach { case ((coreMem, tileNode), i) =>
-      coreMem.d.bits := tileNode.bits
-      coreMem.d.valid := tileNode.valid && matchingSources(i)
-      tileNode.ready := coreMem.d.ready && matchingSources(i)
+    (dmemTLAdapters zip dmemTLBundles).zipWithIndex.foreach {
+      case ((tlAdapter, tlBundle), i) =>
+        tlAdapter.io.outResp.bits := tlBundle.d.bits
+        tlAdapter.io.outResp.valid := tlBundle.d.valid && matchingSources(i)
+        tlBundle.d.ready := tlAdapter.io.outResp.ready && matchingSources(i)
     }
 
-    (core.io.dmem.get zip outer.dmemNodes).foreach { case (coreMem, tileNode) =>
-      coreMem.a <> tileNode.out.head._1.a
-    }
+    // (core.io.dmem.get zip outer.dmemNodes).foreach { case (coreMem, tileNode) =>
+    //   tileNode.out.head._1.a <> coreMem.a
+    // }
   }
 
   // core.io.fpu := DontCare
 
   // TODO eliminate this redundancy
   // val h = dcachePorts.size
-  //val c = core.dcacheArbPorts
+  // val c = core.dcacheArbPorts
   // val o = outer.nDCachePorts
   // require(h == c, s"port list size was $h, core expected $c")
   // require(h == o, s"port list size was $h, outer counted $o")
@@ -273,7 +381,61 @@ class VortexTileModuleImp(outer: VortexTile) extends BaseTileModuleImp(outer) {
   // dcacheArb.io.requestor <> dcachePorts.toSeq
 }
 
+// Adapter between a Vortex core memory port (VortexBundleA/VortexBundleD) and a
+// TileLink A/D channel pair.  Source IDs are re-allocated through a
+// SourceGenerator on the way out and restored on the way back.
+// Some @copypaste from CoalescerSourceGen.
+class VortexTLAdapter(
+  newSourceWidth: Int, // bit width of the source IDs generated for downstream
+  inReqT: VortexBundleA, // upstream request payload type
+  inRespT: VortexBundleD, // upstream response payload type
+  outReqT: TLBundleA, // downstream (TileLink A) payload type
+  outRespT: TLBundleD // downstream (TileLink D) payload type
+) extends Module {
+  val io = IO(new Bundle {
+    // in/out means upstream/downstream
+    val inReq = Flipped(Decoupled(inReqT))
+    val outReq = Decoupled(outReqT)
+    val inResp = Decoupled(inRespT)
+    val outResp = Flipped(Decoupled(outRespT))
+  })
+  val sourceGen = Module(new SourceGenerator(
+    newSourceWidth,
+    Some(inReqT.source),
+    ignoreInUse = false
+  ))
+  sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created
+  // give the source ID back once its response has been accepted
+  sourceGen.io.reclaim.valid := io.outResp.fire
+  sourceGen.io.reclaim.bits := io.outResp.bits.source
+  // remember the upstream source ID so the response can be mapped back to it
+  sourceGen.io.meta := io.inReq.bits.source
+
+  // Request path: VortexBundleA -> TLBundleA.  The handshake is gated on
+  // sourceGen.io.id.valid, so nothing fires while no source ID is available.
+  io.outReq.valid := io.inReq.valid && sourceGen.io.id.valid
+  io.inReq.ready := io.outReq.ready && sourceGen.io.id.valid
+  io.outReq.bits.opcode := io.inReq.bits.opcode
+  io.outReq.bits.param := 0.U
+  io.outReq.bits.size := io.inReq.bits.size
+  io.outReq.bits.source := sourceGen.io.id.bits // newly generated ID replaces the upstream one
+  io.outReq.bits.address := io.inReq.bits.address
+  io.outReq.bits.mask := io.inReq.bits.mask
+  io.outReq.bits.data := io.inReq.bits.data
+  io.outReq.bits.corrupt := false.B
+
+  // Response path: TLBundleD -> VortexBundleD.
+  io.inResp.valid := io.outResp.valid
+  io.outResp.ready := io.inResp.ready
+  io.inResp.bits.opcode := io.outResp.bits.opcode
+  io.inResp.bits.size := io.outResp.bits.size
+  // translate the downstream response back to its old (upstream) sourceId
+  io.inResp.bits.source := sourceGen.io.peek
+  io.inResp.bits.data := io.outResp.bits.data
+}
+
 // FIXME: unsure this is necessary
 trait HasFpuOpt { this: RocketTileModuleImp =>
-  val fpuOpt = outer.tileParams.core.fpu.map(params => Module(new FPU(params)(outer.p)))
+  val fpuOpt =
+    outer.tileParams.core.fpu.map(params => Module(new FPU(params)(outer.p)))
 }
diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 1b24c20..59c3882 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -254,41 +254,76 @@ case class CoalescedResponse(config: CoalescerConfig)
       dataWidth = (8 * (1 << config.maxCoalLogSize))
     )
 
-// If `ignoreInUse`, just keep giving out new IDs without checking if it is in
-// use.
-class SourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
-    extends Module {
+// `metadata` is an extra field in the sourceId table that can be used for
+// storing e.g. the UUID originally attached to a request.  This is useful for
+// using this module as a source ID converter / compressor.  If `None`, this
+// field is not instantiated.
+// TODO: implement lookup logic.
+//
+// If `ignoreInUse`, just keep giving out new IDs without any collision checking.
+// This might result in TL violation.
+class SourceGenerator[T <: Data](
+  sourceWidth: Int,
+  metadata: Option[T] = None,
+  ignoreInUse: Boolean = false
+) extends Module {
+  def getMetadataType = metadata match {
+    case Some(gen) => gen.cloneType
+    case None => UInt(0.W)
+  }
   val io = IO(new Bundle {
     val gen = Input(Bool())
     val reclaim = Input(Valid(UInt(sourceWidth.W)))
     val id = Output(Valid(UInt(sourceWidth.W)))
+    // below are used only when metadata is not None
+    // `meta` is used as input when a request succeeds id generation to store
+    // its value to the table.
+    // `peek` is the metadata that was saved for a request, retrieved when the
+    // corresponding response has come back and is driving `reclaim`.
+    // Although these do not use ValidIO, it is safe because any in-flight
+    // response coming back should have allocated a valid entry in the table
+    // when it went out.
+    val meta = Input(getMetadataType)
+    val peek = Output(getMetadataType)
+    // for debugging; indicates whether there is at least one inflight request
+    // that hasn't been reclaimed yet
     val inflight = Output(Bool())
   })
   val head = RegInit(UInt(sourceWidth.W), 0.U)
   head := Mux(io.gen, head + 1.U, head)
 
-  // for debugging
-  // also for indicating if there is at least one inflight request that hasn't been reclaimed
   val outstanding = RegInit(UInt((sourceWidth + 1).W), 0.U)
   io.inflight := (outstanding > 0.U) || io.gen
 
   val numSourceId = 1 << sourceWidth
-  // true: in use, false: available
-  val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
-  when(reset.asBool) {
-    (0 until numSourceId).foreach { occupancyTable(_).valid := false.B }
+  val row = new Bundle {
+    val meta = getMetadataType
+    val id = Valid(UInt(sourceWidth.W))
   }
-  val frees = (0 until numSourceId).map(!occupancyTable(_).valid)
+  // valid: in use, invalid: available
+  // val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
+  val occupancyTable = Mem(numSourceId, row)
+  when(reset.asBool) {
+    (0 until numSourceId).foreach { occupancyTable(_).id.valid := false.B }
+  }
+  val frees = (0 until numSourceId).map(!occupancyTable(_).id.valid)
   val lowestFree = PriorityEncoder(frees)
   val lowestFreeRow = occupancyTable(lowestFree)
 
-  io.id.valid := (if (ignoreInUse) true.B else !lowestFreeRow.valid)
+  io.id.valid := (if (ignoreInUse) true.B else !lowestFreeRow.id.valid)
   io.id.bits := lowestFree
   when(io.gen && io.id.valid /* fire */ ) {
-    occupancyTable(io.id.bits).valid := true.B // mark in use
+    occupancyTable(io.id.bits).id.valid := true.B // mark in use
+    if (metadata.isDefined) {
+      occupancyTable(io.id.bits).meta := io.meta
+    }
   }
   when(io.reclaim.valid) {
-    occupancyTable(io.reclaim.bits).valid := false.B // mark freed
+    // @perf: would this require multiple write ports?
+    occupancyTable(io.reclaim.bits).id.valid := false.B // mark freed
+  }
+  io.peek := {
+    if (metadata.isDefined) occupancyTable(io.reclaim.bits).meta else 0.U
   }
 
   when(io.gen && io.id.valid) {
@@ -300,7 +335,6 @@ class SourceGenerator(sourceWidth: Int, ignoreInUse: Boolean = true)
     assert(outstanding > 0.U)
     outstanding := outstanding - 1.U
   }
-
   dontTouch(outstanding)
 }
 
@@ -738,30 +772,41 @@ class MultiCoalescer(
   if (!config.enable) disable
 }
 
+// This module mostly handles the correct ready/valid handshake depending on
+// sourceId availability.  Actual generation logic is done by the
+// SourceGenerator module.
 class CoalescerSourceGen(
     config: CoalescerConfig,
     coalReqT: CoalescedRequest,
     respT: TLBundleD
 ) extends Module {
   val io = IO(new Bundle {
+    // in/out means upstream/downstream
     val inReq = Flipped(Decoupled(coalReqT.cloneType))
     val outReq = Decoupled(coalReqT.cloneType)
-    val inResp = Flipped(Decoupled(respT.cloneType))
+    // outResp is used to reclaim source IDs and to tell the downstream TL node
+    // that this sourcegen module is always ready to take in responses.
+    // There is no inResp, since coalescerNode is replied to directly by the
+    // TileLink bundle connected to outResp.
+    val outResp = Flipped(Decoupled(respT.cloneType))
   })
   val sourceGen = Module(
     new SourceGenerator(log2Ceil(config.numNewSrcIds), ignoreInUse = false)
   )
   sourceGen.io.gen := io.outReq.fire // use up a source ID only when request is created
-  sourceGen.io.reclaim.valid := io.inResp.fire
-  sourceGen.io.reclaim.bits := io.inResp.bits.source
-  io.inResp.ready := true.B // should be always ready to reclaim old ID
+  sourceGen.io.reclaim.valid := io.outResp.fire
+  sourceGen.io.reclaim.bits := io.outResp.bits.source
+  sourceGen.io.meta := DontCare
   // TODO: make sourceGen.io.reclaim Decoupled?
 
+  // passthrough logic
   io.outReq <> io.inReq
+  // "man-in-the-middle"
   io.inReq.ready := io.outReq.ready && sourceGen.io.id.valid
   // overwrite bits affected by sourcegen backpressure
   io.outReq.valid := io.inReq.valid && sourceGen.io.id.valid
   io.outReq.bits.source := sourceGen.io.id.bits
+  io.outResp.ready := true.B // should be always ready to reclaim old ID
 }
 
 class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
@@ -880,7 +925,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
   //
   val coalSourceGen = Module(new CoalescerSourceGen(config, coalReqT, tlCoal.d.bits))
   coalSourceGen.io.inReq <> coalescer.io.coalReq
-  coalSourceGen.io.inResp <> tlCoal.d
+  coalSourceGen.io.outResp <> tlCoal.d
 
   // InflightTable IO
   //
@@ -1468,6 +1513,7 @@ class MemTraceDriverImp(
     // assert(sourceGen.io.id.valid)
     sourceGen.io.reclaim.valid := tlOut.d.fire
     sourceGen.io.reclaim.bits := tlOut.d.bits.source
+    sourceGen.io.meta := DontCare
 
     val (plegal, pbits) = edge.Put(
       fromSource = sourceGen.io.id.bits,
@@ -2203,7 +2249,7 @@ class CoalescerXbarImpl(outer: CoalescerXbar,
     // For the uncoalesced data response
     (outer.nonCoalNarrowNodes zip io.nonCoalResps).foreach{
       case(node,resp) => 
-        val (tlOut, edgeOut)  = node.out(0)
+        val (tlOut, _)  = node.out(0)
         val nonCoalResp = Wire(respNonCoalEntryT)
         nonCoalResp.fromTLD(tlOut.d.bits)
         tlOut.d.ready  := resp.ready
@@ -2219,7 +2265,7 @@ class CoalescerXbarImpl(outer: CoalescerXbar,
                                   )
     outer.coalReqNodes.zipWithIndex.foreach{
       case(node, idx) =>
-        val (tlOut, edgeOut)  = node.out(0)
+        val (tlOut, _)  = node.out(0)
         coalRespRRArbiter.io.in(idx) <> tlOut.d
     }
     //Connect output of arbiter to coalesced reponse output