Use Valid/DecoupledIO wrappers instead of an explicit valid field in the TraceLine bundle

2024-01-21 17:50:00 -08:00
parent d56981a0b1
commit ec02a12220
1 changed files with 105 additions and 96 deletions
--- a/src/main/scala/radiance/memory/Coalescing.scala
+++ b/src/main/scala/radiance/memory/Coalescing.scala
@@ -1502,7 +1502,6 @@ class MemTraceDriver(
 }

 trait HasTraceLine {
-  val valid: UInt
  val source: UInt
  val address: UInt
  val is_store: UInt
@@ -1513,7 +1512,6 @@ trait HasTraceLine {
 // Used for both request and response.  Response had address set to 0
 // NOTE: these widths have to agree with what's hardcoded in Verilog.
 class TraceLine extends Bundle with HasTraceLine {
-  val valid = Bool()
  val source = UInt(32.W)
  val address = UInt(64.W)
  val is_store = Bool()
@@ -1538,7 +1536,7 @@ class MemTraceDriverImp(
  // downstream take requests from the queue individually for each lane,
  // but do synchronized enqueue whenever all lane queue is ready to prevent
  // drifts between the lane.
-  val reqQueues = Seq.fill(config.numLanes)(Module(new Queue(new TraceLine, 2)))
+  val reqQueues = Seq.fill(config.numLanes)(Module(new Queue(Valid(new TraceLine), 2)))
  // Are we safe to read the next warp?
  val reqQueueAllReady = reqQueues.map(_.io.enq.ready).reduce(_ && _)

@@ -1552,17 +1550,17 @@ class MemTraceDriverImp(

  // Read output from Verilog BlackBox
  // Split output of SimMemTrace, which is flattened across all lanes,back to each lane's.
-  val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
-  val addrW = laneReqs(0).address.getWidth
-  val sizeW = laneReqs(0).size.getWidth
-  val dataW = laneReqs(0).data.getWidth
+  val laneReqs = Wire(Vec(config.numLanes, Valid(new TraceLine)))
+  val addrW = laneReqs(0).bits.address.getWidth
+  val sizeW = laneReqs(0).bits.size.getWidth
+  val dataW = laneReqs(0).bits.data.getWidth
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.trace_read.valid(i)
-    req.source := 0.U // driver trace doesn't contain source id
-    req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
-    req.is_store := sim.io.trace_read.is_store(i)
-    req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
-    req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
+    req.bits.source := 0.U // driver trace doesn't contain source id
+    req.bits.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
+    req.bits.is_store := sim.io.trace_read.is_store(i)
+    req.bits.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
+    req.bits.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
  }

  // Not all fire because trace cycle has to advance even when there is no valid
@@ -1610,19 +1608,19 @@ class MemTraceDriverImp(
      // the trace driver to act so as well.
      // That means if req.size is smaller than word size, we need to pad data
      // with zeros to generate a word-size request, and set mask accordingly.
-      val offsetInWord = req.address % config.wordSizeInBytes.U
-      val subword = req.size < log2Ceil(config.wordSizeInBytes).U
+      val offsetInWord = req.bits.address % config.wordSizeInBytes.U
+      val subword = req.bits.size < log2Ceil(config.wordSizeInBytes).U

      // `mask` is currently unused
      // val mask = Wire(UInt(config.wordSizeInBytes.W))
      val wordData = Wire(UInt((config.wordSizeInBytes * 8 * 2).W))
      val sizeInBytes = Wire(UInt((sizeW + 1).W))
-      sizeInBytes := (1.U) << req.size
+      sizeInBytes := (1.U) << req.bits.size
      // mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
-      wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
+      wordData := Mux(subword, req.bits.data << (offsetInWord * 8.U), req.bits.data)
      val wordAlignedAddress =
-        req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
-      val wordAlignedSize = Mux(subword, 2.U, req.size)
+        req.bits.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
+      val wordAlignedSize = Mux(subword, 2.U, req.bits.size)

      val sourceGen = sourceGens(lane)
      sourceGen.io.gen := tlOut.a.fire
@@ -1644,8 +1642,8 @@ class MemTraceDriverImp(
        toAddress = hashToValidPhyAddr(wordAlignedAddress),
        lgSize = wordAlignedSize
      )
-      val legal = Mux(req.is_store, plegal, glegal)
-      val bits = Mux(req.is_store, pbits, gbits)
+      val legal = Mux(req.bits.is_store, plegal, glegal)
+      val bits = Mux(req.bits.is_store, pbits, gbits)

      tlOut.a.valid := reqQ.io.deq.valid && syncedSourceGenValid
      when(tlOut.a.fire) {
@@ -1667,9 +1665,9 @@ class MemTraceDriverImp(
          tlOut.a.bits.address,
          tlOut.a.bits.size,
          tlOut.a.bits.mask,
-          req.is_store,
+          req.bits.is_store,
          tlOut.a.bits.data,
-          req.data
+          req.bits.data
        )
      }
      dontTouch(tlOut.a)
@@ -1809,8 +1807,8 @@ class MemTraceLogger(
      simResp.get.io.reset := reset.asBool
    }

-    val laneReqs = Wire(Vec(numLanes, new TraceLine))
-    val laneResps = Wire(Vec(numLanes, new TraceLine))
+    val laneReqs = Wire(Vec(numLanes, Valid(new TraceLine)))
+    val laneResps = Wire(Vec(numLanes, Valid(new TraceLine)))

    assert(
      numLanes == node.in.length,
@@ -1828,12 +1826,12 @@ class MemTraceLogger(
        // Only log trace when fired, e.g. both upstream and downstream is ready
        // and transaction happened.
        req.valid := tlIn.a.fire
-        req.size := tlIn.a.bits.size
-        req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode, tlIn.a.fire)
-        req.source := tlIn.a.bits.source
+        req.bits.size := tlIn.a.bits.size
+        req.bits.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode, tlIn.a.fire)
+        req.bits.source := tlIn.a.bits.source
        // TL always carries the exact unaligned address that the client
        // originally requested, so no postprocessing required
-        req.address := tlIn.a.bits.address
+        req.bits.address := tlIn.a.bits.address

        when(req.valid) {
          TLPrintf(
@@ -1842,9 +1840,9 @@ class MemTraceLogger(
            tlIn.a.bits.address,
            tlIn.a.bits.size,
            tlIn.a.bits.mask,
-            req.is_store,
+            req.bits.is_store,
            tlIn.a.bits.data,
-            req.data
+            req.bits.data
          )
        }

@@ -1868,9 +1866,9 @@ class MemTraceLogger(
        val dataW = tlIn.params.dataBits
        val sizeInBits = (1.U(1.W) << tlIn.a.bits.size) << 3.U
        val mask = ~(~(0.U(dataW.W)) << sizeInBits)
-        req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
-        // when (req.valid) {
-        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
+        req.bits.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
+        // when (req.valid) {
+        //   printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.bits.data)
        // }

        // responses on TL D channel
@@ -1878,18 +1876,18 @@ class MemTraceLogger(
        // Only log trace when fired, e.g. both upstream and downstream is ready
        // and transaction happened.
        resp.valid := tlOut.d.fire
-        resp.size := tlOut.d.bits.size
-        resp.is_store := TLUtils.DOpcodeIsStore(
+        resp.bits.size := tlOut.d.bits.size
+        resp.bits.is_store := TLUtils.DOpcodeIsStore(
          tlOut.d.bits.opcode,
          tlOut.d.fire
        )
-        resp.source := tlOut.d.bits.source
+        resp.bits.source := tlOut.d.bits.source
        // NOTE: TL D channel doesn't carry address nor mask, so there's no easy
        // way to figure out which bytes the master actually use.  Since we
        // don't care too much about addresses in the trace anyway, just store
        // the entire bits.
-        resp.address := 0.U
-        resp.data := tlOut.d.bits.data
+        resp.bits.address := 0.U
+        resp.bits.data := tlOut.d.bits.data
    }

    // stats
@@ -1903,13 +1901,13 @@ class MemTraceLogger(
      }
    val reqBytesThisCycle =
      laneReqs
-        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .map { l => Mux(l.valid, 1.U(64.W) << l.bits.size, 0.U(64.W)) }
        .reduce { (b0, b1) =>
          b0 + b1
        }
    val respBytesThisCycle =
      laneResps
-        .map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }
+        .map { l => Mux(l.valid, 1.U(64.W) << l.bits.size, 0.U(64.W)) }
        .reduce { (b0, b1) =>
          b0 + b1
        }
@@ -1922,42 +1920,25 @@ class MemTraceLogger(
    //
    // This is a clunky workaround of the fact that Chisel doesn't allow partial
    // assignment to a bitfield range of a wide signal.
-    def flattenTrace(
-        simIO: Bundle with HasTraceLine,
-        perLane: Vec[TraceLine]
-    ) = {
-      // these will get optimized out
-      val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
-      val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
-      val vecAddress = Wire(Vec(numLanes, chiselTypeOf(perLane(0).address)))
-      val vecIsStore = Wire(Vec(numLanes, chiselTypeOf(perLane(0).is_store)))
-      val vecSize = Wire(Vec(numLanes, chiselTypeOf(perLane(0).size)))
-      val vecData = Wire(Vec(numLanes, chiselTypeOf(perLane(0).data)))
-      perLane.zipWithIndex.foreach { case (l, i) =>
-        vecValid(i) := l.valid
-        vecSource(i) := l.source
-        vecAddress(i) := l.address
-        vecIsStore(i) := l.is_store
-        vecSize(i) := l.size
-        vecData(i) := l.data
-      }
-      simIO.valid := vecValid.asUInt
-      simIO.source := vecSource.asUInt
-      simIO.address := vecAddress.asUInt
-      simIO.is_store := vecIsStore.asUInt
-      simIO.size := vecSize.asUInt
-      simIO.data := vecData.asUInt
-    }
-
    if (simReq.isDefined) {
-      flattenTrace(simReq.get.io.trace_log, laneReqs)
+      simReq.get.io.trace_log.valid := VecInit(laneReqs.map(_.valid)).asUInt
+      simReq.get.io.trace_log.source := VecInit(laneReqs.map(_.bits.source)).asUInt
+      simReq.get.io.trace_log.address := VecInit(laneReqs.map(_.bits.address)).asUInt
+      simReq.get.io.trace_log.is_store := VecInit(laneReqs.map(_.bits.is_store)).asUInt
+      simReq.get.io.trace_log.size := VecInit(laneReqs.map(_.bits.size)).asUInt
+      simReq.get.io.trace_log.data := VecInit(laneReqs.map(_.bits.data)).asUInt
      assert(
        simReq.get.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
      )
    }
    if (simResp.isDefined) {
-      flattenTrace(simResp.get.io.trace_log, laneResps)
+      simResp.get.io.trace_log.valid := VecInit(laneResps.map(_.valid)).asUInt
+      simResp.get.io.trace_log.source := VecInit(laneResps.map(_.bits.source)).asUInt
+      simResp.get.io.trace_log.address := VecInit(laneResps.map(_.bits.address)).asUInt
+      simResp.get.io.trace_log.is_store := VecInit(laneResps.map(_.bits.is_store)).asUInt
+      simResp.get.io.trace_log.size := VecInit(laneResps.map(_.bits.size)).asUInt
+      simResp.get.io.trace_log.data := VecInit(laneResps.map(_.bits.data)).asUInt
      assert(
        simResp.get.io.trace_log.ready === true.B,
        "MemTraceLogger is expected to be always ready"
@@ -1994,7 +1975,7 @@ class SimMemTraceLogger(
    val clock = Input(Clock())
    val reset = Input(Bool())

-    val trace_log = new Bundle with HasTraceLine {
+    val trace_log = new Bundle {
      val valid = Input(UInt(numLanes.W))
      val source = Input(UInt((sourceW * numLanes).W))
      // Chisel can't interface with Verilog 2D port, so flatten all lanes into
@@ -2074,24 +2055,40 @@ class MemFuzzerImp(
  sim.io.clock := clock
  sim.io.reset := reset.asBool

-  sim.io.a.ready := true.B // FIXME
+  sim.io.a.ready := VecInit(outer.laneNodes.map { node =>
+    val (tlOut, _) = node.out(0)
+    tlOut.a.ready
+  }).asUInt

  io.finished := sim.io.finished

-  // Read output from Verilog BlackBox
-  // Split output of SimMemTrace, which is flattened across all lanes,back to each lane's.
-  val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
-  val addrW = laneReqs(0).address.getWidth
-  val sizeW = laneReqs(0).size.getWidth
-  val dataW = laneReqs(0).data.getWidth
+  // connect Verilog <-> Chisel IO
+  // Verilog IO flattened across all lanes
+  val laneReqs = Wire(Vec(config.numLanes, Decoupled(new TraceLine)))
+  val addrW = laneReqs(0).bits.address.getWidth
+  val sizeW = laneReqs(0).bits.size.getWidth
+  val dataW = laneReqs(0).bits.data.getWidth
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.a.valid(i)
-    req.source := 0.U // DPI fuzzer doesn't generate contain source id
-    req.address := sim.io.a.address(addrW * (i + 1) - 1, addrW * i)
-    req.is_store := sim.io.a.is_store(i)
-    req.size := sim.io.a.size(sizeW * (i + 1) - 1, sizeW * i)
-    req.data := sim.io.a.data(dataW * (i + 1) - 1, dataW * i)
+    req.bits.source := 0.U // DPI fuzzer doesn't generate source ids
+    req.bits.address := sim.io.a.address(addrW * (i + 1) - 1, addrW * i)
+    req.bits.is_store := sim.io.a.is_store(i)
+    req.bits.size := sim.io.a.size(sizeW * (i + 1) - 1, sizeW * i)
+    req.bits.data := sim.io.a.data(dataW * (i + 1) - 1, dataW * i)
  }
+  sim.io.a.ready := VecInit(laneReqs.map(_.ready)).asUInt
+
+  val laneResps = Wire(Vec(config.numLanes, Flipped(Decoupled(new TraceLine))))
+  laneResps.zipWithIndex.foreach { case (resp, i) =>
+    resp.ready := sim.io.d.ready(i)
+    // TODO: not handled in DPI
+    resp.bits.source := DontCare
+    resp.bits.address := DontCare
+    resp.bits.data := DontCare
+  }
+  sim.io.d.valid := VecInit(laneResps.map(_.valid)).asUInt
+  sim.io.d.is_store := VecInit(laneResps.map(_.bits.is_store)).asUInt
+  sim.io.d.size := VecInit(laneResps.map(_.bits.size)).asUInt

  val sourceGens = Seq.fill(config.numLanes)(
    Module(
@@ -2103,27 +2100,29 @@ class MemFuzzerImp(
  )

  // Take requests off of the queue and generate TL requests
-  (outer.laneNodes zip laneReqs).zipWithIndex.foreach {
-    case ((node, req), lane) =>
+  (outer.laneNodes zip (laneReqs zip laneResps)).zipWithIndex.foreach {
+    case ((node, (req, resp)), lane) =>
      val (tlOut, edge) = node.out(0)

+      // Requests --------------------------------------------------------------
+      //
      // Core only makes accesses of granularity larger than a word, so we want
      // the trace driver to act so as well.
      // That means if req.size is smaller than word size, we need to pad data
      // with zeros to generate a word-size request, and set mask accordingly.
-      val offsetInWord = req.address % config.wordSizeInBytes.U
-      val subword = req.size < log2Ceil(config.wordSizeInBytes).U
+      val offsetInWord = req.bits.address % config.wordSizeInBytes.U
+      val subword = req.bits.size < log2Ceil(config.wordSizeInBytes).U

      // `mask` is currently unused
      // val mask = Wire(UInt(config.wordSizeInBytes.W))
      val wordData = Wire(UInt((config.wordSizeInBytes * 8 * 2).W))
      val sizeInBytes = Wire(UInt((sizeW + 1).W))
-      sizeInBytes := (1.U) << req.size
+      sizeInBytes := (1.U) << req.bits.size
      // mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
-      wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data)
+      wordData := Mux(subword, req.bits.data << (offsetInWord * 8.U), req.bits.data)
      val wordAlignedAddress =
-        req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
-      val wordAlignedSize = Mux(subword, 2.U, req.size)
+        req.bits.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
+      val wordAlignedSize = Mux(subword, 2.U, req.bits.size)

      val sourceGen = sourceGens(lane)
      sourceGen.io.gen := tlOut.a.fire
@@ -2144,19 +2143,26 @@ class MemFuzzerImp(
        toAddress = wordAlignedAddress,
        lgSize = wordAlignedSize
      )
-      val legal = Mux(req.is_store, plegal, glegal)
-      val bits = Mux(req.is_store, pbits, gbits)
+      val legal = Mux(req.bits.is_store, plegal, glegal)
+      val bits = Mux(req.bits.is_store, pbits, gbits)

      tlOut.a.valid := req.valid && sourceGen.io.id.valid
-      // req.ready := tlOut.a.ready && sourceGen.io.id.valid
+      req.ready := tlOut.a.ready && sourceGen.io.id.valid

      when(tlOut.a.fire) {
        assert(legal, "illegal TL req gen")
      }
      tlOut.a.bits := bits
+
+      // Responses -------------------------------------------------------------
+      //
+      tlOut.d.ready := resp.ready
+      resp.valid := tlOut.d.valid
+      resp.bits.is_store := !edge.hasData(tlOut.d.bits)
+      resp.bits.size := tlOut.d.bits.size
+
      tlOut.b.ready := true.B
      tlOut.c.valid := false.B
-      tlOut.d.ready := sim.io.d.ready(lane) // FIXME
      tlOut.e.valid := false.B

      // debug
@@ -2168,9 +2174,9 @@ class MemFuzzerImp(
          tlOut.a.bits.address,
          tlOut.a.bits.size,
          tlOut.a.bits.mask,
-          req.is_store,
+          req.bits.is_store,
          tlOut.a.bits.data,
-          req.data
+          req.bits.data
        )
      }
      dontTouch(tlOut.a)
@@ -2210,6 +2216,9 @@ class SimMemFuzzer(numLanes: Int) extends BlackBox
    val d =
      new Bundle {
        val ready = Output(UInt(numLanes.W))
+        val valid = Input(UInt(numLanes.W))
+        val is_store = Input(UInt(numLanes.W))
+        val size = Input(UInt((sizeW * numLanes).W))
      }
  })