diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc
index 08540bc..6c08858 100644
--- a/src/main/resources/csrc/SimMemTrace.cc
+++ b/src/main/resources/csrc/SimMemTrace.cc
@@ -22,7 +22,9 @@ MemTraceReader::MemTraceReader(const std::string &filename)
 
   infile.open(filename);
   if (infile.fail()) {
-    fprintf(stderr, "failed to open file %s\n", filename.c_str());
+    fprintf(stderr, "MemTraceReader: error: failed to open file %s\n",
+            filename.c_str());
+    exit(EXIT_FAILURE);
   }
 }
 
@@ -60,8 +62,6 @@ void MemTraceReader::parse(const bool has_source) {
     }
 
     if (!(infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id)) {
-      printf("char=[%c]\n", infile.peek());
-      // assert(!infile.eof());
       error(fileline, "failed parsing cycle..lane_id");
     }
     if (has_source && !(infile >> source)) {
@@ -101,8 +101,6 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id,
   MemTraceLine line;
   line.valid = false;
 
-  // printf("tick(): cycle=%ld\n", cycle);
-
   if (finished()) {
     return line;
   }
@@ -112,7 +110,11 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id,
   // the next line is in the future.
   if (line.cycle < cycle) {
     long fileline = read_pos - std::cbegin(trace_buf) + 1;
-    error(fileline, "some trace lines are left unread in the past");
+    error(fileline, "some trace lines are left unread in the past. "
+                    "Tried cycle=" +
+                        std::to_string(cycle) +
+                        ", found line.cycle=" + std::to_string(line.cycle) +
+                        ". Is NUM_LANES set correctly?");
     return MemTraceLine{};
   }
 
@@ -134,14 +136,17 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id,
       // monotonically increment read_pos.  lane_id need not be contiguous, e.g.
       // 0->1->3 is fine.
       ++read_pos;
-      return line;
     } else {
       // For debugging purposes, instead of early-returning on
       // !trace_read_ready, print something to notify we are blocking a valid
       // trace line.
       printf("All Lanes Blocked on this cycle! cycle=%ld \n", cycle);
-      return MemTraceLine{};
     }
+    // We want to return valid line regardless of `trace_read_ready` or not,
+    // because we want to let the driver know that it missed a valid line at the
+    // given cycle, so that it holds its cycle counter and safely reads back the
+    // line in the future.
+    return line;
   }
 
   assert(!"unreachable");
diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v
index 4d630fd..74594cb 100644
--- a/src/main/resources/vsrc/SimMemTrace.v
+++ b/src/main/resources/vsrc/SimMemTrace.v
@@ -39,49 +39,32 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) (
   output [`DATA_WIDTH*NUM_LANES-1:0]    trace_read_data,
   output                                trace_read_finished
 );
-  bit     __in_valid   [NUM_LANES-1:0];
-  longint __in_address [NUM_LANES-1:0];
-
-  bit     __in_is_store [NUM_LANES-1:0];
+  bit                      __in_valid   [NUM_LANES-1:0];
+  longint                  __in_address [NUM_LANES-1:0];
+  bit                      __in_is_store [NUM_LANES-1:0];
   reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0];
-  longint __in_data [NUM_LANES-1:0];
-
-  bit __in_finished;
-  string __uartlog;
-
-  // Cycle counter that is used to query C parser whether we have a request
-  // coming in at the current cycle.
-
-
-  // registers that stage outputs of the C parser
-  reg [NUM_LANES-1:0]   __in_valid_wire;
-  reg [`DATA_WIDTH-1:0] __in_address_wire [NUM_LANES-1:0];
-
-  reg [NUM_LANES-1:0]      __in_is_store_wire;
-  reg [`LOGSIZE_WIDTH-1:0] __in_size_wire [NUM_LANES-1:0];
-  reg [`DATA_WIDTH-1:0]    __in_data_wire [NUM_LANES-1:0];
-  reg                      __in_finished_wire;
+  longint                  __in_data [NUM_LANES-1:0];
+  bit                      __in_finished;
 
   genvar g;
-
   generate
     for (g = 0; g < NUM_LANES; g = g + 1) begin
-      assign trace_read_valid[g] = __in_valid_wire[g];
-      assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g]  = __in_address_wire[g];
+      assign trace_read_valid[g] = __in_valid[g];
+      assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g]  = __in_address[g];
 
-      assign trace_read_is_store[g] = __in_is_store_wire[g];
-      assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size_wire[g];
-      assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_wire[g];
+      assign trace_read_is_store[g] = __in_is_store[g];
+      assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size[g];
+      assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data[g];
     end
   endgenerate
-  assign trace_read_finished = __in_finished_wire;
+  assign trace_read_finished = __in_finished;
 
   initial begin
       /* $value$plusargs("uartlog=%s", __uartlog); */
       memtrace_init(FILENAME);
   end
 
-  always @(*) begin
+  always @(posedge clock) begin
     if (reset) begin
       for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
         __in_valid[tid] = 1'b0;
@@ -91,55 +74,29 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) (
         __in_size[tid] = `LOGSIZE_WIDTH'b0;
         __in_data[tid] = `DATA_WIDTH'b0;
       end
-
       __in_finished = 1'b0;
-
-      //cycle_counter <= `DATA_WIDTH'b0;
-
-      // setting default value for register to avoid latches
-      for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
-        __in_valid_wire[tid] = 1'b0;
-        __in_address_wire[tid] = `DATA_WIDTH'b0;
-
-        __in_is_store_wire[tid] = 1'b0;
-        __in_size_wire[tid] = `LOGSIZE_WIDTH'b0;
-        __in_data_wire[tid] = `DATA_WIDTH'b0;
-      end
-
-      __in_finished_wire = 1'b0;
     end else begin
+      // We have to write to __in_ regs only when trace_read_ready, or
+      // otherwise we might overwrite lines that were previously valid
+      // but the downstream missed by being not ready.
+      if (trace_read_ready) begin
+        for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
+          memtrace_query(
+            trace_read_ready,
+            trace_read_cycle,
+            tid,
 
-      // Getting values from C function into pseudeo register
-      for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
-        memtrace_query(
-          trace_read_ready,
-          // Since parsed results are latched to the output on the next
-          // cycle due to staging registers, we need to pass in the next cycle
-          // to sync up.
-          trace_read_cycle,  // the left replace next_cycle_counter,
-          tid,
+            __in_valid[tid],
+            __in_address[tid],
 
-          __in_valid[tid],
-          __in_address[tid],
- 
-          __in_is_store[tid],
-          __in_size[tid],
-          __in_data[tid],
+            __in_is_store[tid],
+            __in_size[tid],
+            __in_data[tid],
 
-          __in_finished
-        );
+            __in_finished
+          );
+        end
       end
-
-      // Connect values from pseudo register into verilog register 
-      for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
-        __in_valid_wire[tid]   = __in_valid[tid];
-        __in_address_wire[tid] = __in_address[tid];
-
-        __in_is_store_wire[tid] = __in_is_store[tid];
-        __in_size_wire[tid] = __in_size[tid];
-        __in_data_wire[tid] = __in_data[tid];
-      end
-      __in_finished_wire = __in_finished;
     end
   end
 endmodule
diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 90237b7..8ded755 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -233,11 +233,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
   }))
 
   // shift hint is when the heads have no more coalescable left this or next cycle
-  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, i) =>
-    c && !(io.invalidate.valid && i)
+  val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
+    c && !(io.invalidate.valid && inv)
   }.reduce(_ || _)
   val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
-  val syncedDeqValid = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire
+  // syncedDeqValidNextCycle being true means the arbiter has completed
+  // processing all of the ready-to-go requests.
+  val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire
 
   for (i <- 0 until config.numLanes) {
     val enq = io.queue.enq(i)
@@ -247,7 +249,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
     ctrl.full := writePtr(i) === entries.U
     ctrl.empty := writePtr(i) === 0.U
     // shift when no outstanding dequeue, no more coalescable chunks, and not empty
-    ctrl.shift := !syncedDeqValid && shiftHint && !ctrl.empty
+    ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty
 
     // dequeue is valid when:
     // head entry is valid, has not been processed by downstream, and is not coalescable
@@ -293,6 +295,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
     }
   }
 
+  // When doing spatial-only coalescing, queues should never drift from each
+  // other, i.e. the queue heads should always contain mem requests from the
+  // same instruction.
   val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
     writePtr.map(_ === writePtr.head).reduce(_ && _)
   assert(queueInSync, "shift queue lanes are not in sync")
@@ -326,23 +331,15 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
   val leaders = io.window.elts.map(_.head)
   val leadersValid = io.window.mask.map(_.asBools.head)
 
-  // When doing spatial-only coalescing, queues should never drift from each
-  // other, i.e. the queue heads should always contain mem requests from the
-  // same instruction.
-  // FIXME: This relies on the MemTraceDriver's behavior of generating TL
-  // requests with full source info even when the corresponding lane is not
-  // active.
-  def testNoQueueDrift: Bool = leaders.map(_.source === leaders.head.source).reduce(_ || _)
   def printQueueHeads = {
     leaders.zipWithIndex.foreach{ case (head, i) =>
       printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
         leadersValid(i), head.source, head.address)
     }
   }
-  when (leadersValid.reduce(_ || _)) {
-    assert(testNoQueueDrift, "unexpected drift between lane request queues")
-    // printQueueHeads
-  }
+  // when (leadersValid.reduce(_ || _)) {
+  //   printQueueHeads
+  // }
 
   val size = coalLogSize
   val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
@@ -375,14 +372,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
            .reduce(_ +& _))
   val canCoalesce = matchCounts.map(_ > 1.U)
 
-  // Elect the leader out of all potential leaders that have matchCounts > 1.
+  // Elect the leader that has the most match counts.
   // TODO: potentially expensive: magnitude comparator
-  // Maybe choose leftmost leader (priority encoder) instead of argmax
-  val chosenLeaderIdx = matchCounts.zipWithIndex.map {
-    case (c, i) => (c, i.U)
-  }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
-    (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
-  }._2
+  def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
+    matchCounts.zipWithIndex.map {
+      case (c, i) => (c, i.U)
+    }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
+        (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
+    }._2
+  }
+  // Elect leader by choosing the smallest-index lane that has a valid
+  // match, i.e. using priority encoder.
+  def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = {
+    PriorityEncoder(matchCounts.map(_ > 1.U))
+  }
+  val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)
 
   val chosenLeader = VecInit(leaders)(chosenLeaderIdx)
   // matchTable for the chosen lane, but converted to a Vec[UInt]
@@ -578,11 +582,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   reqQueues.io.coalescable := coalescer.io.coalescable
   reqQueues.io.invalidate := coalescer.io.invalidate
 
-  // Per-lane request and response queues
+  // ===========================================================================
+  // Request flow
+  // ===========================================================================
   //
   // Override IdentityNode implementation so that we can instantiate
   // queues between input and output edges to buffer requests and responses.
   // See IdentityNode definition in `diplomacy/Nodes.scala`.
+  //
   (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
     case (((tlIn, _), (tlOut, edgeOut)), lane) =>
       // Request queue
@@ -604,7 +611,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
       val deq = reqQueues.io.queue.deq(lane)
       enq.valid := tlIn.a.valid
       enq.bits := req
-      deq.ready := true.B // TODO: deq.ready should respect downstream arbiter
+      // TODO: deq.ready should respect downstream arbiter
+      deq.ready := true.B
+      // Stall upstream core or memtrace driver when shiftqueue is not ready
+      tlIn.a.ready := enq.ready
       tlOut.a.valid := deq.valid
       tlOut.a.bits := deq.bits.toTLA(edgeOut)
 
@@ -641,11 +651,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   tlCoal.e.valid := false.B
 
 
-  // ==================================================================
-  // ******************************************************************
-  // ************************* REORG BOUNDARY *************************
-  // ******************************************************************
-  // ==================================================================
+  // ===========================================================================
+  // Response flow
+  // ===========================================================================
+  //
+  // Connect uncoalescer output and noncoalesced response ports to the response
+  // queues.
 
   // The maximum number of requests from a single lane that can go into a
   // coalesced request.  Upper bound is min(DEPTH, 2**sourceWidth).
@@ -1083,24 +1094,18 @@ class TraceLine extends Bundle with HasTraceLine {
 class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFile: String)
     extends LazyModuleImp(outer)
     with UnitTestModule {
+  // Current cycle mark to read from trace
+  val traceReadCycle = RegInit(1.U(64.W))
 
-  val globalClkCounter    = RegInit(1.U(64.W))
-  val traceReadCycle      = RegInit(1.U(64.W))
-  val downstreamSQready   = WireInit(true.B)
+  // If any of the downstream lane is not ready, hold on from advancing
+  val downstreamReady = outer.laneNodes.map(_.out(0)._1.a.ready).reduce(_ && _)
 
-  //make the downstream only ready 1/4 of the time
-  //This is to test Tracer System's ability to hold on requests
-  //FIXME
-  downstreamSQready       := (globalClkCounter(1,0) =/= 0.U)
-  //Connect Signals to Verilog BlackBox
   val sim = Module(new SimMemTrace(traceFile, config.numLanes))
   sim.io.clock := clock
   sim.io.reset := reset.asBool
-  sim.io.trace_read.ready := downstreamSQready
-  //FIXME - 1.U hardcoded, currently there is a delay between chisel and verilog
+  sim.io.trace_read.ready := downstreamReady
   sim.io.trace_read.cycle := traceReadCycle
 
-
   // Read output from Verilog BlackBox
   // Split output of SimMemTrace, which is flattened across all lanes,back to each lane's.
   val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
@@ -1109,26 +1114,28 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil
   val dataW = laneReqs(0).data.getWidth
   laneReqs.zipWithIndex.foreach { case (req, i) =>
     req.valid := sim.io.trace_read.valid(i)
-    // TODO: driver trace doesn't contain source id
-    req.source := 0.U
+    req.source := 0.U // driver trace doesn't contain source id
     req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
     req.is_store := sim.io.trace_read.is_store(i)
     req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
     req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
   }
 
-  globalClkCounter       := globalClkCounter + 1.U
-  val existValidReq       = WireInit(false.B)
-  existValidReq          := laneReqs.map(_.valid).reduce(_||_)
-  val validReqBlocked     = WireInit(false.B)
-  validReqBlocked        := !downstreamSQready && existValidReq
-  //Debug
-  dontTouch(downstreamSQready)
-  dontTouch(existValidReq)
-  dontTouch(validReqBlocked)
-  // Do Not Update TraceReadCycle if downstream is blocking
-  when(!validReqBlocked){
-    traceReadCycle       := traceReadCycle + 1.U
+  // def missedLine = {
+  //   val existsValidLine = WireInit(false.B)
+  //   existsValidLine := laneReqs.map(_.valid).reduce(_||_)
+  //   val missedLine = WireInit(false.B)
+  //   missedLine := !downstreamReady && existsValidLine
+
+  //   // Debug
+  //   dontTouch(downstreamReady)
+  //   dontTouch(existsValidLine)
+  //   dontTouch(missedLine)
+
+  //   missedLine
+  // }
+  when (downstreamReady){
+    traceReadCycle := traceReadCycle + 1.U
   }
 
   // To prevent collision of sourceId with a current in-flight message,
@@ -1163,19 +1170,6 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil
     val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
     val wordAlignedSize = Mux(subword, 2.U, req.size)
 
-    // when(req.valid && subword) {
-    //   printf(
-    //     "address=%x, size=%d, data=%x, addressMask=%x, wordAlignedAddress=%x, mask=%x, wordData=%x\n",
-    //     req.address,
-    //     req.size,
-    //     req.data,
-    //     ~((1 << log2Ceil(config.WORD_SIZE)) - 1).U(addrW.W),
-    //     wordAlignedAddress,
-    //     mask,
-    //     wordData
-    //   )
-    // }
-
     val (tlOut, edge) = node.out(0)
     val (plegal, pbits) = edge.Put(
       fromSource = sourceIdCounter,
@@ -1356,7 +1350,9 @@ class MemTraceLogger(
 
         // requests on TL A channel
         //
-        req.valid := tlIn.a.valid
+        // Only log trace when fired, e.g. both upstream and downstream is ready
+        // and transaction happened.
+        req.valid := tlIn.a.fire
         req.size := tlIn.a.bits.size
         req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
         req.source := tlIn.a.bits.source
@@ -1364,27 +1360,6 @@ class MemTraceLogger(
         // originally requested, so no postprocessing required
         req.address := tlIn.a.bits.address
 
-        // TL data
-        //
-        // When tlIn.a.bits.size is smaller than the data bus width, need to
-        // figure out which byte lanes we actually accessed so that
-        // we can write that to the memory trace.
-        // See Section 4.5 Byte Lanes in spec 1.8.1
-
-        // This assert only holds true for PutFullData and not PutPartialData,
-        // where HIGH bits in the mask may not be contiguous.
-        assert(
-          PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
-          "mask HIGH bits do not match the TL size.  This should have been handled by the TL generator logic"
-        )
-        val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
-        val dataW = tlIn.params.dataBits
-        val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
-        req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
-        // when (req.valid) {
-        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
-        // }
-
         when(req.valid) {
           TLPrintf(
             s"MemTraceLogger (${loggerName}:downstream)",
@@ -1397,9 +1372,33 @@ class MemTraceLogger(
           )
         }
 
+        // TL data
+        //
+        // When tlIn.a.bits.size is smaller than the data bus width, need to
+        // figure out which byte lanes we actually accessed so that
+        // we can write that to the memory trace.
+        // See Section 4.5 Byte Lanes in spec 1.8.1
+
+        // This assert only holds true for PutFullData and not PutPartialData,
+        // where HIGH bits in the mask may not be contiguous.
+        assert(
+          PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
+          "mask HIGH popcount do not match the TL size. " +
+          "Partial masks are not allowed for PutFull"
+        )
+        val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
+        val dataW = tlIn.params.dataBits
+        val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
+        req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
+        // when (req.valid) {
+        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
+        // }
+
         // responses on TL D channel
         //
-        resp.valid := tlOut.d.valid
+        // Only log trace when fired, e.g. both upstream and downstream is ready
+        // and transaction happened.
+        resp.valid := tlOut.d.fire
         resp.size := tlOut.d.bits.size
         resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode)
         resp.source := tlOut.d.bits.source
@@ -1433,7 +1432,7 @@ class MemTraceLogger(
     //
     // This is a clunky workaround of the fact that Chisel doesn't allow partial
     // assignment to a bitfield range of a wide signal.
-    def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
+    def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
       // these will get optimized out
       val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
       val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
@@ -1449,12 +1448,12 @@ class MemTraceLogger(
         vecSize(i) := l.size
         vecData(i) := l.data
       }
-      traceLogIO.valid := vecValid.asUInt
-      traceLogIO.source := vecSource.asUInt
-      traceLogIO.address := vecAddress.asUInt
-      traceLogIO.is_store := vecIsStore.asUInt
-      traceLogIO.size := vecSize.asUInt
-      traceLogIO.data := vecData.asUInt
+      simIO.valid := vecValid.asUInt
+      simIO.source := vecSource.asUInt
+      simIO.address := vecAddress.asUInt
+      simIO.is_store := vecIsStore.asUInt
+      simIO.size := vecSize.asUInt
+      simIO.data := vecData.asUInt
     }
 
     if (simReq.isDefined) {
@@ -1544,7 +1543,7 @@ class DummyDriver(config: CoalescerConfig)(implicit p: Parameters)
     val clientParam = Seq(
       TLMasterParameters.v1(
         name = "dummy-core-node-" + i.toString,
-        sourceId = IdRange(0, defaultConfig.numOldSrcIds)
+        sourceId = IdRange(0, config.numOldSrcIds)
         // visibility = Seq(AddressSet(0x0000, 0xffffff))
       )
     )
@@ -1635,10 +1634,7 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
 }
 
 // tracedriver --> coalescer --> tracelogger --> tlram
-class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule {
-  // val filename = "test.trace"
-  val filename = "vecadd.core1.thread4.trace"
-  // val filename = "nvbit.vecadd.n100000.filter_sm0.trace"
+class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
   // TODO: use parameters for numLanes
   val numLanes = defaultConfig.numLanes
 
@@ -1680,13 +1676,14 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule {
           (coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes),
         "FAIL: requests and responses traffic to the coalescer do not match"
       )
+      printf("SUCCESS: coalescer response traffic matched requests!\n")
     }
   }
 }
 
-class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters)
+class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
     extends UnitTest(timeout) {
-  val dut = Module(LazyModule(new TLRAMCoalescerLogger).module)
+  val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
   dut.io.start := io.start
   io.finished := dut.io.finished
 }
@@ -2137,3 +2134,5 @@ class CoalArbiterImpl(outer: CoalArbiter,
 
 
 
+
+
diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala
index 2d31711..5a48733 100644
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala
@@ -2,10 +2,12 @@ package freechips.rocketchip.tilelink.coalescing
 
 import chisel3._
 import chiseltest._
+import chiseltest.simulator.VerilatorFlags
 import org.scalatest.flatspec.AnyFlatSpec
 import freechips.rocketchip.tilelink._
 import freechips.rocketchip.util.MultiPortQueue
 import freechips.rocketchip.diplomacy._
+import freechips.rocketchip.subsystem.WithoutTLMonitors
 import org.chipsalliance.cde.config.Parameters
 import chisel3.util.{DecoupledIO, Valid}
 import chisel3.util.experimental.BoringUtils
@@ -190,8 +192,8 @@ object testConfig extends CoalescerConfig(
   respQueueDepth = 4,
   coalLogSizes = Seq(4, 5),
   sizeEnum = DefaultInFlightTableSizeEnum,
-  numArbiterOutputPorts = 4,
   numCoalReqs = 1,
+  numArbiterOutputPorts = 4,
   bankStrideInBytes = 64
 )
 
@@ -229,8 +231,8 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   }
 
   it should "coalesce fully consecutive accesses at size 4, only once" in {
-    test(LazyModule(new DummyCoalescingUnitTB()).module)
-    .withAnnotations(Seq(VerilatorBackendAnnotation, WriteFstAnnotation))
+    test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module)
+    .withAnnotations(Seq(VerilatorBackendAnnotation, VerilatorFlags(Seq("--coverage-line")), WriteFstAnnotation))
 //    .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation))
     { c =>
       val nodes = c.coalIOs.map(_.head)
@@ -291,8 +293,8 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
   }
 
   it should "coalesce identical addresses (stride of 0)" in {
-    test(LazyModule(new DummyCoalescingUnitTB()).module)
-    .withAnnotations(Seq(VcsBackendAnnotation))
+    test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module)
+    .withAnnotations(Seq(VerilatorBackendAnnotation))
     { c =>
       println(s"coalIO length = ${c.coalIOs(0).length}")
       val nodes = c.coalIOs.map(_.head)