From 6cd27faed266342c3c21a0ae5dfe68037797843a Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sat, 22 Apr 2023 23:04:47 -0700
Subject: [PATCH 1/2] Respect old req size in uncoalescer

---
 src/main/scala/tilelink/Coalescing.scala | 58 ++++++++++++------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 1a06db0..2484bac 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -269,22 +269,24 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
   // FIXME: don't instantiate inflight table entry type here.  It leaks the table's impl
   // detail to the coalescer
   val offsetBits = 4 // FIXME hardcoded
-  val sizeBits = 2 // FIXME hardcoded
+  val sizeBits = 4 // FIXME hardcoded.  This should not be the TL size bits,
+  // but the width of the size enum
   val newEntry = Wire(
     new InflightCoalReqTableEntry(numLanes, numPerLaneReqs, sourceWidth, offsetBits, sizeBits)
   )
-
   println(s"=========== table sourceWidth: ${sourceWidth}")
   println(s"=========== table sizeBits: ${sizeBits}")
-
   newEntry.source := coalSourceId
+  val coalDataWidth = tlCoal.params.dataBits
+  println(s"=========== coalesced data width: ${coalDataWidth}")
+  val origReqs = reqQueues.map(q => q.io.deq.bits)
   newEntry.lanes.foreach { l =>
     l.reqs.zipWithIndex.foreach { case (r, i) =>
       // TODO: this part needs the actual coalescing logic to work
       r.valid := false.B
-      r.source := i.U // FIXME bogus
-      r.offset := 1.U
-      r.size := 2.U // FIXME hardcoded
+      r.source := origReqs(i).source
+      r.offset := (origReqs(i).address % (coalDataWidth / 8).U) >> log2Ceil(WordSizeInBytes())
+      r.size := origReqs(i).size
     }
   }
   newEntry.lanes(0).reqs(0).valid := true.B
@@ -293,8 +295,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
   newEntry.lanes(3).reqs(0).valid := true.B
   dontTouch(newEntry)
 
-  // Uncoalescer module sncoalesces responses back to each lane
-  val coalDataWidth = tlCoal.params.dataBits
+  // Uncoalescer module uncoalesces responses back to each lane
   val uncoalescer = Module(
     new UncoalescingUnit(
       numLanes,
@@ -312,8 +313,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
   uncoalescer.io.coalRespSrcId := tlCoal.d.bits.source
   uncoalescer.io.coalRespData := tlCoal.d.bits.data
 
-  println(s"=========== coalRespData width: ${tlCoal.d.bits.data.widthOption.get}")
-
   // Queue up synthesized uncoalesced responses into each lane's response queue
   (respQueues zip uncoalescer.io.uncoalResps).foreach { case (q, lanes) =>
     lanes.zipWithIndex.foreach { case (resp, i) =>
@@ -359,7 +358,10 @@ class UncoalescingUnit(
     val uncoalResps = Output(
       Vec(
         numLanes,
-        Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, WordSizeInBytes() * 8, sizeWidth)))
+        Vec(
+          numPerLaneReqs,
+          ValidIO(new RespQueueEntry(sourceWidth, WordSizeInBytes() * 8, sizeWidth))
+        )
       )
     )
   })
@@ -380,20 +382,21 @@ class UncoalescingUnit(
 
   // Un-coalescing logic
   //
-  // FIXME: `size` should be UInt, not Int
-  def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, byteSize: Int): UInt = {
-    val bitSize = byteSize * 8
-    val sizeMask = (1.U << bitSize) - 1.U
+  def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = {
+    val sizeInBits = (1.U << logSize) * 8.U
     assert(
-      dataWidth > 0 && dataWidth % bitSize == 0,
-      s"coalesced data width ($dataWidth) not evenly divisible by core req size ($bitSize)"
+      (dataWidth > 0).B && (dataWidth.U % sizeInBits === 0.U),
+      s"coalesced data width ($dataWidth) not evenly divisible by core req size ($sizeInBits)"
     )
-    val numChunks = dataWidth / bitSize
-    val chunks = Wire(Vec(numChunks, UInt(bitSize.W)))
+    assert(logSize === 2.U || logSize === 0.U, "TODO: currently only supporting 4-byte accesses")
+    val numChunks = dataWidth / 32
+    val chunks = Wire(Vec(numChunks, UInt(32.W)))
     val offsets = (0 until numChunks)
     (chunks zip offsets).foreach { case (c, o) =>
-      // Take [(off-1)*size:off*size] starting from MSB
-      c := (data >> (dataWidth - (o + 1) * bitSize)) & sizeMask
+      // Take [(off+1)*size-1:off*size] starting from LSB
+      // FIXME: whether to take the offset from MSB or LSB depends on endianness
+      c := data(32 * (o + 1) - 1, 32 * o)
+      // Old MSB-first version: c := (data >> (dataWidth - (o + 1) * 32)) & sizeMask
     }
     chunks(offset) // MUX
   }
@@ -404,18 +407,16 @@ class UncoalescingUnit(
     perLane.reqs.zipWithIndex.foreach { case (oldReq, i) =>
       val ioOldReq = ioPerLane(i)
 
-      // FIXME: only looking at 0th srcId entry
-
+      // TODO: spatial-only coalescing: only looking at 0th srcId entry
       ioOldReq.valid := false.B
       ioOldReq.bits := DontCare
 
       when(inflightTable.io.lookup.valid) {
         ioOldReq.valid := oldReq.valid
         ioOldReq.bits.source := oldReq.source
-        // FIXME: disregard size enum for now
-        val byteSize = 4
+        ioOldReq.bits.size := oldReq.size
         ioOldReq.bits.data :=
-          getCoalescedDataChunk(io.coalRespData, coalDataWidth, oldReq.offset, byteSize)
+          getCoalescedDataChunk(io.coalRespData, coalDataWidth, oldReq.offset, oldReq.size)
       }
     }
   }
@@ -1145,8 +1146,9 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule {
         coreSideLogger.module.io.respBytes
       )
       assert(
-        coreSideLogger.module.io.numReqs === coreSideLogger.module.io.numResps,
-        "FAIL: number of requests and responses to the coalescer do not match"
+        (coreSideLogger.module.io.numReqs === coreSideLogger.module.io.numResps) &&
+          (coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes),
+        "FAIL: requests and responses traffic to the coalescer do not match"
       )
     }
   }

From 2a82e8d1199849bb9442768d4de7cf7dccd0f386 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sun, 23 Apr 2023 00:39:59 -0700
Subject: [PATCH 2/2] Fix TL data mask stencil logic in MemTraceLogger

---
 src/main/scala/tilelink/Coalescing.scala | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 2484bac..62a7398 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -181,6 +181,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule
       resp.isStore := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode)
       resp.size := tlOut.d.bits.size
       resp.data := tlOut.d.bits.data
+      // NOTE: D channel doesn't have mask
 
       // Queue up responses that didn't get coalesced originally ("noncoalesced" responses).
       // Coalesced (but uncoalesced back) responses will also be enqueued into the same queue.
@@ -954,8 +955,12 @@ class MemTraceLogger(
           "mask HIGH bits do not match the TL size.  This should have been handled by the TL generator logic"
         )
         val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
-        val mask = ~((~0.U) << (trailingZerosInMask * 8.U))
+        val dataW = tlIn.params.dataBits
+        val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
         req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
+        // when (req.valid) {
+        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
+        // }
 
         when(req.valid) {
           TracePrintf(