diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 65246f6..43dc1ca 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -32,8 +32,8 @@ class TensorCoreDecoupled(
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
-  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
+  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
 
   val io = IO(new Bundle {
     val initiate = Flipped(Decoupled(new Bundle {
@@ -51,6 +51,27 @@ class TensorCoreDecoupled(
   })
   dontTouch(io)
 
+  class TensorMemReq(
+    sourceWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val address = UInt(32.W)
+  }
+  class TensorMemResp(
+    sourceWidth: Int,
+    dataWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val data = UInt(dataWidth.W)
+  }
+  // mem response after translation from TL source to set/step tag
+  class TensorMemRespWithTag(
+    dataWidth: Int
+  ) extends Bundle {
+    val tag = new TensorMemTag
+    val data = UInt(dataWidth.W)
+  }
+
   // FSM
   // ---
   // This drives the overall pipeline of memory requests, dot-product unit
@@ -101,18 +122,39 @@ class TensorCoreDecoupled(
   //
   val genReq = (state === TensorState.run)
 
-  Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
-    case (req, resp) => {
-      val sourceGen = Module(new SourceGenerator(log2Ceil(numSourceIds)))
+  class TensorMemTag extends Bundle {
+    val set = UInt(setBits.W)
+    val step = UInt(stepBits.W)
+  }
+  // use concatenation of set/step as the memory request source.  This will get
+  // translated to the actual TL sourcewidth in sourceGen.
+  val tag = Wire(new TensorMemTag)
+  tag.set := set
+  tag.step := step
+
+  val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  Seq((io.reqA, (io.respA, respATagged)), (io.reqB, (io.respB, respBTagged))).foreach {
+    case (req, (resp, respTagged)) => {
+      val sourceGen = Module(new SourceGenerator(
+        log2Ceil(numSourceIds),
+        metadata = Some(tag)
+      ))
 
       sourceGen.io.gen := req.fire
-      sourceGen.io.meta := DontCare
+      sourceGen.io.meta := tag
       req.valid := genReq
       req.bits.address := 0.U // FIXME
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
       sourceGen.io.reclaim.bits := resp.bits.source
+
+      // translate source
+      respTagged.valid := resp.valid
+      respTagged.bits.tag := sourceGen.io.peek
+      respTagged.bits.data := resp.bits.data
+      resp.ready := respTagged.ready
     }
   }
 
@@ -130,16 +172,13 @@ class TensorCoreDecoupled(
     firedABReg := Seq(false.B, false.B)
   }
 
-  io.respA.ready := true.B // FIXME
-  io.respB.ready := true.B // FIXME
-
   // Execute stage
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
   val respQueueDepth = 4 // FIXME: parameterize
-  val respQueueA = Queue(io.respA, respQueueDepth)
-  val respQueueB = Queue(io.respB, respQueueDepth)
+  val respQueueA = Queue(respATagged, respQueueDepth)
+  val respQueueB = Queue(respBTagged, respQueueDepth)
   respQueueA.ready := io.writeback.ready // FIXME
   respQueueB.ready := io.writeback.ready // FIXME
 
@@ -149,9 +188,11 @@ class TensorCoreDecoupled(
 
   // FIXME: debug dummy: pipe A directly to writeback
   io.writeback.valid := respQueueA.valid
-  val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/)
+  val groupedRespA = respQueueA.bits.data
+                     .asBools.grouped(wordSize * 8/*bits*/)
+                     .map(VecInit(_).asUInt)
   (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) =>
-    wb := VecInit(data).asUInt
+    wb := data
   }
 
   // State transition
@@ -204,20 +245,6 @@ class TensorCoreDecoupled(
   // val rdQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
 }
 
-class TensorMemReq(
-  sourceWidth: Int
-) extends Bundle {
-  val source = UInt(sourceWidth.W)
-  val address = UInt(32.W)
-}
-class TensorMemResp(
-  sourceWidth: Int,
-  dataWidth: Int
-) extends Bundle {
-  val source = UInt(sourceWidth.W)
-  val data = UInt(dataWidth.W)
-}
-
 // synthesizable unit tests
 
 // wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy