diff --git a/radiance.mk b/radiance.mk
index d5a5552..5eddf00 100644
--- a/radiance.mk
+++ b/radiance.mk
@@ -14,7 +14,6 @@ EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradp
 EXTRA_SIM_PREPROC_DEFINES += \
 	+define+SIMULATION \
 	+define+GPR_RESET \
-	+define+GPR_DUPLICATED \
 	+define+LSU_DUP_DISABLE \
 	+define+DBG_TRACE_CORE_PIPELINE_VCS \
 	+define+PERF_ENABLE \
diff --git a/src/main/scala/radiance/memory/VortexCache.scala b/src/main/scala/radiance/memory/VortexCache.scala
index 61a841c..0c95eb4 100644
--- a/src/main/scala/radiance/memory/VortexCache.scala
+++ b/src/main/scala/radiance/memory/VortexCache.scala
@@ -10,18 +10,18 @@ import org.chipsalliance.cde.config.{Parameters, Field}
 case object VortexL1Key extends Field[Option[VortexL1Config]](None /*default*/ )
 
 case class VortexL1Config(
-    cacheSize: Int, // total cache size in bytes
-    numBanks: Int,
-    wordSize: Int, // This is the read/write granularity of the L1 cache
-    cacheLineSize: Int,
-    coreTagWidth: Int,
-    writeInfoReqQSize: Int,
-    mshrSize: Int,
-    memSideSourceIds: Int,
-    uncachedAddrSets: Seq[AddressSet]
+                           cacheSize: Int, // total cache size in bytes
+                           numBanks: Int,
+                           inputSize: Int, // This is the read/write granularity of the L1 cache
+                           cacheLineSize: Int,
+                           coreTagWidth: Int,
+                           writeInfoReqQSize: Int,
+                           mshrSize: Int,
+                           memSideSourceIds: Int,
+                           uncachedAddrSets: Seq[AddressSet]
 ) {
   def coreTagPlusSizeWidth: Int = {
-    log2Ceil(wordSize) + coreTagWidth
+    log2Ceil(inputSize) + coreTagWidth
   }
   // NOTE: This assertion depends on the fact that the Vortex cache is
   // configured to have 1 bank, and that it uses MSHR id as the tag of
@@ -37,7 +37,7 @@ object defaultVortexL1Config
     extends VortexL1Config(
       cacheSize = 16384,
       numBanks = 4,
-      wordSize = 16,
+      inputSize = 16,
       cacheLineSize = 16,
       coreTagWidth = 8,
       writeInfoReqQSize = 16,
@@ -80,15 +80,15 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
   // Slave node to upstream
   val managerParam = Seq(
     TLSlavePortParameters.v1(
-      beatBytes = config.wordSize,
+      beatBytes = config.inputSize,
       managers = Seq(
         TLSlaveParameters.v1(
           address = config.uncachedAddrSets,
           regionType = RegionType.IDEMPOTENT,
           executable = false,
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
+          supportsGet = TransferSizes(1, config.inputSize),
+          supportsPutPartial = TransferSizes(1, config.inputSize),
+          supportsPutFull = TransferSizes(1, config.inputSize),
           fifoId = Some(0)
         )
       )
@@ -107,10 +107,10 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
               config.memSideSourceIds
             ) + 5 /*FIXME: give more sourceId so that passthrough doesn't block; hacky*/ )
           ),
-          supportsProbe = TransferSizes(1, config.wordSize),
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize)
+          supportsProbe = TransferSizes(1, config.cacheLineSize),
+          supportsGet = TransferSizes(1, config.cacheLineSize),
+          supportsPutFull = TransferSizes(1, config.cacheLineSize),
+          supportsPutPartial = TransferSizes(1, config.cacheLineSize)
         )
       )
     )
@@ -141,8 +141,8 @@ class VortexBank(
     // suppose have 4 bank
     // base for bank 1: ...000000|01|0000
     // mask for bank 1;    111111|00|1111
-    val base = 0x00000000L | (bankId * config.wordSize)
-    val mask = 0xffffffffL ^ ((config.numBanks - 1) * config.wordSize)
+    val base = 0x00000000L | (bankId * config.inputSize)
+    val mask = 0xffffffffL ^ ((config.numBanks - 1) * config.inputSize)
 
     val excludeSets = config.uncachedAddrSets
     var remainingSets: Seq[AddressSet] = Seq(AddressSet(base, mask))
@@ -155,15 +155,15 @@ class VortexBank(
   // Slave node to upstream
   val managerParam = Seq(
     TLSlavePortParameters.v1(
-      beatBytes = config.wordSize,
+      beatBytes = config.inputSize,
       managers = Seq(
         TLSlaveParameters.v1(
           address = generateAddressSets(),
           regionType = RegionType.IDEMPOTENT, // idk what this does
           executable = false,
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
+          supportsGet = TransferSizes(1, config.inputSize),
+          supportsPutPartial = TransferSizes(1, config.inputSize),
+          supportsPutFull = TransferSizes(1, config.inputSize),
           fifoId = Some(0)
         )
       )
@@ -177,10 +177,10 @@ class VortexBank(
         TLMasterParameters.v1(
           name = s"VortexBank${bankId}",
           sourceId = IdRange(0, config.memSideSourceIds),
-          supportsProbe = TransferSizes(1, config.wordSize),
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize)
+          supportsProbe = TransferSizes(1, config.inputSize),
+          supportsGet = TransferSizes(1, config.inputSize),
+          supportsPutFull = TransferSizes(1, config.inputSize),
+          supportsPutPartial = TransferSizes(1, config.inputSize)
         )
       )
     )
@@ -204,7 +204,7 @@ class VortexBankImp(
 ) extends LazyModuleImp(outer) {
   val vxCache = Module(
     new VX_cache_top(
-      WORD_SIZE = config.wordSize,
+      WORD_SIZE = config.inputSize,
       // distribute total size across numBanks
       CACHE_SIZE = config.cacheSize / config.numBanks,
       CACHE_LINE_SIZE = config.cacheLineSize,
@@ -236,7 +236,7 @@ class VortexBankImp(
   }
 
   class ReadReqInfo(config: VortexL1Config) extends Bundle {
-    val size = UInt(log2Ceil(config.wordSize).W)
+    val size = UInt(log2Ceil(config.inputSize + 1).W)
     val id = UInt(config.coreTagWidth.W)
   }
 
@@ -264,7 +264,7 @@ class VortexBankImp(
     // 4 is also hardcoded, it should be log2WordSize
     vxCache.io.core_req_addr := tlInFromCoal.a.bits.address(
       31,
-      log2Ceil(config.wordSize)
+      log2Ceil(config.inputSize)
     )
     vxCache.io.core_req_byteen := tlInFromCoal.a.bits.mask
     vxCache.io.core_req_data := tlInFromCoal.a.bits.data
@@ -362,17 +362,17 @@ class VortexBankImp(
       TLMessages.Get
     )
 
-    tlOutToL2.a.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W))
+    tlOutToL2.a.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(log2Ceil(config.cacheLineSize).W))
     tlOutToL2.a.bits.mask := Mux(
       vxCache.io.mem_req_rw,
       vxCache.io.mem_req_byteen,
-      0xffff.U
+      ~(0.U(config.cacheLineSize.W))
     )
     tlOutToL2.a.bits.data := vxCache.io.mem_req_data
     tlOutToL2.a.bits.source := sourceGen.io.id.bits
     // ignore param, size, corrupt fields
     tlOutToL2.a.bits.param := 0.U
-    tlOutToL2.a.bits.size := 4.U // FIXME: hardcoded
+    tlOutToL2.a.bits.size := log2Ceil(config.cacheLineSize).U
     tlOutToL2.a.bits.corrupt := false.B
     // downstream L2 -> vxCache response
     tlOutToL2.d.ready := vxCache.io.mem_rsp_ready
diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala
index a8acd25..1a576cb 100644
--- a/src/main/scala/radiance/subsystem/Configs.scala
+++ b/src/main/scala/radiance/subsystem/Configs.scala
@@ -126,7 +126,7 @@ class WithFuzzerCores(
 class WithRadianceCluster(
   clusterId: Int,
   location: HierarchicalLocation = InSubsystem,
-  crossing: RocketCrossingParams = RocketCrossingParams() // TODO make this not rocket
+  crossing: RocketCrossingParams = RocketCrossingParams()
 ) extends Config((site, here, up) => {
   case ClustersLocated(`location`) => up(ClustersLocated(location)) :+ RadianceClusterAttachParams(
     RadianceClusterParams(clusterId = clusterId),
@@ -174,7 +174,17 @@ class WithPriorityCoalXbar extends Config((site, _, up) => {
 
 class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
   case VortexL1Key => {
-    Some(defaultVortexL1Config.copy(numBanks = nBanks))
+    // Look the SIMT core parameters up once and fail with a clear message when they are missing.
+    val simt = up(SIMTCoreKey).getOrElse(
+      throw new IllegalArgumentException("WithVortexL1Banks requires SIMTCoreKey to be defined"))
+    val lineBytes = simt.nMemLanes * 4 // NOTE(review): presumably 4 bytes per memory lane; confirm
+    // Without a coalescer there are no extra source IDs, so only the core's nSrcIds counts.
+    val coalSrcIds = up(CoalescerKey).map(_.numNewSrcIds).getOrElse(0)
+    Some(defaultVortexL1Config.copy(
+      numBanks = nBanks, inputSize = lineBytes, cacheLineSize = lineBytes,
+      memSideSourceIds = 64, mshrSize = 64,
+      coreTagWidth = log2Ceil(simt.nSrcIds.max(coalSrcIds)) + log2Ceil(simt.nMemLanes) + 1
+    ))
   }
 })
 
@@ -197,8 +207,7 @@ class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config
     // If instantiating L1 cache, the maximum coalescing size should match the
     // cache line size
     val maxCoalSizeInBytes = up(VortexL1Key, site) match {
-      case Some(param) =>
-        (param.wordSize) 
+      case Some(param) => param.inputSize
       case None => sbusWidthInBytes
     }
       
diff --git a/src/main/scala/radiance/tile/GemminiTile.scala b/src/main/scala/radiance/tile/GemminiTile.scala
index ccb7515..897a314 100644
--- a/src/main/scala/radiance/tile/GemminiTile.scala
+++ b/src/main/scala/radiance/tile/GemminiTile.scala
@@ -140,6 +140,9 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
 
   tieOffGemminiRocc
 
+  outer.traceSourceNode.bundle := DontCare
+  outer.traceSourceNode.bundle.insns foreach (_.valid := false.B)
+
   // hacky, but cluster will AND the cease signals from all tiles, and we want
   // the core tiles to determine cluster cease not Gemmini
   outer.reportCease(Some(true.B))
diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala
index e5d1e10..cb50a32 100644
--- a/src/main/scala/radiance/tile/RadianceCluster.scala
+++ b/src/main/scala/radiance/tile/RadianceCluster.scala
@@ -8,6 +8,7 @@ import chisel3.util._
 import freechips.rocketchip.diplomacy._
 import freechips.rocketchip.prci.ClockSinkParameters
 import freechips.rocketchip.subsystem._
+import freechips.rocketchip.tile.TraceBundle
 import freechips.rocketchip.tilelink._
 import gemmini._
 import org.chipsalliance.cde.config.Parameters
@@ -91,7 +92,7 @@ class RadianceCluster (
       callback(p)
     }
   }
-  def connect_one[T <: BaseNode with TLNode](from: TLNode, to: () => T): T = {
+  def connect_one[T <: TLNode](from: TLNode, to: () => T): T = {
     val t = to()
     guard_monitors { implicit p => t := from }
     t
@@ -183,13 +184,18 @@ class RadianceCluster (
 
     val spad_read_nodes = Seq.fill(smem_banks) {
       val r_dist = DistributorNode(from = smem_width, to = wordSize)
-      guard_monitors { implicit p => r_dist := gemmini.spad_read_nodes }
+      guard_monitors { implicit p => r_dist := TLBuffer(BufferParams(depth = 1, flow = false, pipe = true), BufferParams(0)) := gemmini.spad_read_nodes } // buffer sits between Gemmini's read nodes and the distributor
       Seq.fill(smem_subbanks) { connect_one(r_dist, TLIdentityNode.apply) }
     }
     val spad_write_nodes = Seq.fill(smem_banks) {
       val w_dist = DistributorNode(from = smem_width, to = wordSize)
-      guard_monitors { implicit p => w_dist := gemmini.spad_write_nodes }
+      guard_monitors { implicit p => w_dist := TLBuffer(BufferParams(depth = 1, flow = false, pipe = true), BufferParams(0)) := gemmini.spad_write_nodes } // buffer sits between Gemmini's write nodes and the distributor
       Seq.fill(smem_subbanks) { connect_one(w_dist, TLIdentityNode.apply) }
+      /* Seq.fill(smem_subbanks) {
+        val buf = TLBuffer(BufferParams(1, false, true), BufferParams(0))
+        buf := w_dist
+        buf
+      } */
     }
     val ws_dist = DistributorNode(from = smem_width, to = wordSize)
     guard_monitors { implicit p => ws_dist := gemmini.spad.spad_writer.node } // this is the dma write node
diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala
index ea377cb..2cfcc5c 100644
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -165,10 +165,14 @@ class RadianceTile private (
   // to a stall in the backend pipeline and resulting in a deadlock.
   val imemSourceWidth = 4 // 1 << imemSourceWidth == IBUF_SIZE
 
-  val dmemSourceWidth = p(SIMTCoreKey) match {
-    // TODO: respect coalescer newSrcIds
+  val smemSourceWidth = p(SIMTCoreKey) match {
     case Some(simtParam) => log2Ceil(simtParam.nSrcIds)
-    case None            => 4
+    case None => 4
+  }
+
+  val dmemSourceWidth = p(CoalescerKey) match {
+    case Some(coalParam) => log2Ceil(coalParam.numOldSrcIds)
+    case None => smemSourceWidth
   }
   // require(
   //   dmemSourceWidth >= 4,
@@ -177,8 +181,6 @@ class RadianceTile private (
   //     "We recommend setting nSrcIds to at least 16."
   // )
 
-  val smemSourceWidth = 4 // FIXME: hardcoded
-
   // Replicates some of the logic of how Vortex determines the tag width of
   // memory requests so that Chisel and Verilog are in agreement on bitwidths.
   // See VX_gpu_pkg.sv
@@ -190,7 +192,8 @@ class RadianceTile private (
   }
   val imemTagWidth = UUID_WIDTH + NW_WIDTH
 
-  val LSUQ_SIZE = 2 * numWarps * (numCoreLanes / numLsuLanes)
+  val LSUQ_SIZE = 8 * numWarps * (numCoreLanes / numLsuLanes) // must equal the SIMT core's nSrcIds; enforced on the next line
+  p(SIMTCoreKey).foreach(simt => require(LSUQ_SIZE == simt.nSrcIds, s"LSUQ_SIZE ($LSUQ_SIZE) must equal SIMTCoreKey.nSrcIds (${simt.nSrcIds})")) // skipped when SIMTCoreKey is None, as smemSourceWidth also tolerates None
   val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
   val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
   // dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH