diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 0654df3..154a3cf 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -155,8 +155,8 @@ class TensorCoreDecoupled(
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   Seq((io.reqA, (io.respA, respATagged)),
-      (io.reqB, (io.respB, respBTagged))).foreach {
-    case (req, (resp, respTagged)) => {
+      (io.reqB, (io.respB, respBTagged))).zipWithIndex.foreach {
+    case ((req, (resp, respTagged)), i) => {
       val sourceGen = Module(new SourceGenerator(
         log2Ceil(numSourceIds),
         metadata = Some(tag)
@@ -165,7 +165,9 @@ class TensorCoreDecoupled(
       sourceGen.io.gen := req.fire
       sourceGen.io.meta := tag
       req.valid := genReq
-      req.bits.address := 0.U // FIXME
+      // FIXME: placeholder address; A and B requests both target address 0.
+      // req.bits.address := (if (i == 0) 0.U else 0x100.U) // disabled: separate A/B addresses to avoid bank conflicts
+      req.bits.address := 0.U
       req.bits.source := sourceGen.io.id.bits
 
       sourceGen.io.reclaim.valid := resp.fire
@@ -270,7 +272,8 @@ class TensorCoreDecoupled(
   fullAQueue.io.deq.ready := dpuFire && (substepCompute === 1.U)
   val nextStepExecute = dpuFire && (substepCompute === 1.U)
 
-  // make sure to dequeue from response queues only when both A and B valid
+  // respQueueA drains into halfAQueue when substepExecute is 0 and into
+  // fullAQueue when it is 1, so its ready follows the selected queue's enq.ready
   respQueueA.ready := MuxCase(false.B,
                               Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
                                   (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
@@ -446,10 +449,35 @@ class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
   }
 }
 
+// Test wrapper: TensorCoreDecoupledTL behind a TLXbar with two TLRAMs (intended as one each for A and B, for full throughput), split on address bit 8
+class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
+  val tensor = LazyModule(new TensorCoreDecoupledTL)
+  val xbar = LazyModule(new TLXbar) // routes the tensor core's TileLink requests to ramA or ramB by address
+  val ramA = LazyModule(new TLRAM(
+    address = AddressSet(0x000, 0xfffeff), // mask omits bit 8: matches 24-bit addresses whose bit 8 is 0
+    beatBytes = 32 // FIXME: hardcoded
+  ))
+  val ramB = LazyModule(new TLRAM(
+    address = AddressSet(0x100, 0xfffeff), // same mask, base sets bit 8: matches 24-bit addresses whose bit 8 is 1
+    beatBytes = 32 // FIXME: hardcoded
+  ))
+
+  xbar.node :=* tensor.node // query-star binding: the number of edges is determined by tensor.node (the client side)
+  ramA.node := xbar.node // one edge from the xbar to each RAM
+  ramB.node := xbar.node
+
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with UnitTestModule {
+    tensor.module.io.start := io.start // forward the unit-test start signal to the tensor core
+    io.finished := tensor.module.io.finished // report completion back through the unit-test interface
+  }
+}
+
 // unit test harness
 class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters)
     extends UnitTest(timeout) {
-  val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  // previous single-TLRAM DUT, kept for reference: val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  val dut = Module(LazyModule(new TensorCoreDecoupledTwoTLRAM).module) // DUT: tensor core wired to two TLRAMs through a TLXbar
   dut.io.start := io.start
   io.finished := dut.io.finished
 }