FatBank Integration Improvements:

1. ensure FatBank prioritze Ack read over Ack write to downstream coalescer 2. Between FatBank and L2, use the new sourceGenerator to allow both Read and Write Reqs sharing the same pool of available src_ids
2023-10-22 17:03:37 -07:00
parent e50903ed42
commit 60a63d4e11
1 changed files with 124 additions and 56 deletions
--- a/src/main/scala/rocket/VortexFatBank.scala
+++ b/src/main/scala/rocket/VortexFatBank.scala
@@ -10,6 +10,88 @@ import freechips.rocketchip.tilelink._
 import org.chipsalliance.cde.config.{Parameters, Field}


+
+
+//<FIXME> Delete the following NewSourceGenerator when merging with origin/graphics
+//we should just use the one in coalescing.scala written by hansung
+
+class NewSourceGenerator[T <: Data](
+  sourceWidth: Int,
+  metadata: Option[T] = None,
+  ignoreInUse: Boolean = false
+) extends Module {
+  def getMetadataType = metadata match {
+    case Some(gen) => gen.cloneType
+    case None => UInt(0.W)
+  }
+  val io = IO(new Bundle {
+    val gen = Input(Bool())
+    val reclaim = Input(Valid(UInt(sourceWidth.W)))
+    val id = Output(Valid(UInt(sourceWidth.W)))
+    // below are used only when metadata is not None
+    // `meta` is used as input when a request succeeds id generation to store
+    // its value to the table.
+    // `peek` is the retrieved metadata saved for the request when corresponding
+    // request has come back, setting `reclaim`.
+    // Although these do not use ValidIO, it is safe because any in-flight
+    // response coming back should have allocated a valid entry in the table
+    // when it went out.
+    val meta = Input(getMetadataType)
+    val peek = Output(getMetadataType)
+    // for debugging; indicates whether there is at least one inflight request
+    // that hasn't been reclaimed yet
+    val inflight = Output(Bool())
+  })
+  val head = RegInit(UInt(sourceWidth.W), 0.U)
+  head := Mux(io.gen, head + 1.U, head)
+
+  val outstanding = RegInit(UInt((sourceWidth + 1).W), 0.U)
+  io.inflight := (outstanding > 0.U) || io.gen
+
+  val numSourceId = 1 << sourceWidth
+  val row = new Bundle {
+    val meta = getMetadataType
+    val id = Valid(UInt(sourceWidth.W))
+  }
+  // valid: in use, invalid: available
+  // val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
+  val occupancyTable = Mem(numSourceId, row)
+  when(reset.asBool) {
+    (0 until numSourceId).foreach { occupancyTable(_).id.valid := false.B }
+  }
+  val frees = (0 until numSourceId).map(!occupancyTable(_).id.valid)
+  val lowestFree = PriorityEncoder(frees)
+  val lowestFreeRow = occupancyTable(lowestFree)
+
+  io.id.valid := (if (ignoreInUse) true.B else !lowestFreeRow.id.valid)
+  io.id.bits := lowestFree
+  when(io.gen && io.id.valid /* fire */ ) {
+    occupancyTable(io.id.bits).id.valid := true.B // mark in use
+    if (metadata.isDefined) {
+      occupancyTable(io.id.bits).meta := io.meta
+    }
+  }
+  when(io.reclaim.valid) {
+    // @perf: would this require multiple write ports?
+    occupancyTable(io.reclaim.bits).id.valid := false.B // mark freed
+  }
+  io.peek := {
+    if (metadata.isDefined) occupancyTable(io.reclaim.bits).meta else 0.U
+  }
+
+  when(io.gen && io.id.valid) {
+    when (!io.reclaim.valid) {
+      assert(outstanding < (1 << sourceWidth).U)
+      outstanding := outstanding + 1.U
+    }
+  }.elsewhen(io.reclaim.valid) {
+    assert(outstanding > 0.U)
+    outstanding := outstanding - 1.U
+  }
+  dontTouch(outstanding)
+}
+
+
 // VortexTile has dmemNodes, imemNodes

 //Param and Key are used during SoC Generation
@@ -21,6 +103,8 @@ case class VortexFatBankConfig(
    wordSize: Int,      //This is the read/write granularity of the L1 cache
    cacheLineSize: Int,
    coreTagWidth: Int,
+    writeInfoReqQSize: Int,
+    mshrSize: Int
 ) {
    def coreTagPlusSizeWidth: Int = {
        log2Ceil(wordSize) + coreTagWidth
@@ -31,6 +115,8 @@ object defaultFatBankConfig extends VortexFatBankConfig(
    wordSize = 16,
    cacheLineSize = 16,
    coreTagWidth = 8,
+    writeInfoReqQSize = 16,
+    mshrSize = 8
 )


@@ -83,7 +169,8 @@ class VortexFatBankImp (
    val vxCache = Module(new VX_cache(
        WORD_SIZE=config.wordSize, 
        CACHE_LINE_SIZE=config.cacheLineSize,
-        CORE_TAG_WIDTH= config.coreTagPlusSizeWidth
+        CORE_TAG_WIDTH= config.coreTagPlusSizeWidth,
+        MSHR_SIZE= config.mshrSize
        )
    );

@@ -94,16 +181,13 @@ class VortexFatBankImp (
        val id = UInt(32.W)
        val size = UInt(32.W)
    }
-    

-    //<FIXME> assuming this is never full
-    val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, 64, true, false))
-    
    class ReadReqInfo(config: VortexFatBankConfig) extends Bundle {
        val size = UInt(log2Ceil(config.wordSize).W)
        val id   = UInt(config.coreTagWidth.W)
    }

+    val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, config.writeInfoReqQSize, true, false))
    val readReqInfo = Wire(new ReadReqInfo(config))

    // Translate TL request from Coalescer to requests for VX_cache
@@ -112,16 +196,17 @@ class VortexFatBankImp (
        // coal -> vxCache request on channel A
        val coalToBankA = coalToBankBundle.a;
        
-        coalToBankA.ready := vxCache.io.core_req_ready
+        coalToBankA.ready := vxCache.io.core_req_ready && rcvWriteReqInfo.io.enq.ready //not optimal
        vxCache.io.core_req_valid := coalToBankA.valid

        // read = 0, write = 1
        vxCache.io.core_req_rw     := !(coalToBankA.bits.opcode === TLMessages.Get)
        //4 is also hardcoded, it should be log2WordSize
-        vxCache.io.core_req_addr   := coalToBankA.bits.address(31, 4)
+        vxCache.io.core_req_addr   := coalToBankA.bits.address(31, log2Ceil(config.wordSize))
        vxCache.io.core_req_byteen := coalToBankA.bits.mask
        vxCache.io.core_req_data   := coalToBankA.bits.data
        
+        //combine size and tag field into one big wire, to put into vxCache.io.core_req_tag
        readReqInfo.id   := coalToBankA.bits.source
        readReqInfo.size := coalToBankA.bits.size
        vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)
@@ -135,9 +220,7 @@ class VortexFatBankImp (
        // if we don't send ack, the coalescer will run out of IDs, and can't generate new request

        // for read request, we send AckData when the FatBank has a valid output
-        // for write request, we can immediate Ack on the next clock cycle (not the same clock cycle, otherwise critical path too long)
-        // It's possible that on the same cycle, we need to do both "AckData" and "Ack"
-        //    in this case, we always priorize "Ack", this makes the design easier
+        // for write request, we can ack whenever we have a valid entry in rcvWriteReqInfo Queue

        //I think this just shows the flaws of Tilelink. CPU never waits for an Ack upon regular write request
        //the Core should unconditionally move forward after every regular write request
@@ -150,39 +233,29 @@ class VortexFatBankImp (
        rcvWriteReqInfo.io.enq.bits.id   := coalToBankA.bits.source
        rcvWriteReqInfo.io.enq.bits.size := coalToBankA.bits.size

-
-        rcvWriteReqInfo.io.deq.ready := coalToBankD.ready
+        //prioritize Ack for Read, so we only deque from writeReqInfo, if we don't have a readReq we need to ack
+        //vxCache.io.core_rsp_valid means readDataAck
+        rcvWriteReqInfo.io.deq.ready := coalToBankD.ready && ~vxCache.io.core_rsp_valid 
        
-        //if we "need" to do Ack
-        //we unconditionally set the vxCache.ready to be false, so it gets delayed
-        vxCache.io.core_rsp_ready := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            false.B,
-            coalToBankD.ready
-        )
-
-        coalToBankD.valid := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            true.B,
-            vxCache.io.core_rsp_valid
-        )
+        vxCache.io.core_rsp_ready := coalToBankD.ready
+        coalToBankD.valid := vxCache.io.core_rsp_valid || rcvWriteReqInfo.io.deq.valid

        coalToBankD.bits.source := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            rcvWriteReqInfo.io.deq.bits.id,
-            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id
+            vxCache.io.core_rsp_valid,
+            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id,
+            rcvWriteReqInfo.io.deq.bits.id
        )

        coalToBankD.bits.opcode  := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            TLMessages.AccessAck,
-            TLMessages.AccessAckData
+            vxCache.io.core_rsp_valid,
+            TLMessages.AccessAckData,
+            TLMessages.AccessAck
        )

        coalToBankD.bits.size := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            rcvWriteReqInfo.io.deq.bits.size,
-            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size
+            vxCache.io.core_rsp_valid,
+            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size,
+            rcvWriteReqInfo.io.deq.bits.size
        )

        coalToBankD.bits.param   := 0.U
@@ -190,7 +263,7 @@ class VortexFatBankImp (
        coalToBankD.bits.denied  := false.B
        coalToBankD.bits.corrupt := false.B

-        coalToBankD.bits.data   := vxCache.io.core_rsp_data
+        coalToBankD.bits.data    := vxCache.io.core_rsp_data
    }


@@ -200,10 +273,10 @@ class VortexFatBankImp (
    //vx_cache can indeed guarantee that all active read operation has unique ID
    //However, since the cache is write_through, so it can't ensure unique ID for write operation
    //Therefore, we need our own internal source_ID generator for all write operation
-    //
-    //Now, we allocate id range: 0-15 for all write operation
-    //                    and    16-> above for read operation
-    val sourceGen = Module( new SourceGenerator(log2Ceil(16), ignoreInUse = false))
+    
+
+    val sourceGen = Module( new NewSourceGenerator(log2Ceil(config.mshrSize), metadata = Some(UInt(32.W)), ignoreInUse = false))
+    


    // Translate VX_cache mem request to a TL request to be sent to L2
@@ -215,13 +288,9 @@ class VortexFatBankImp (

        //Read Operation is ready as long as downstream L2 is ready

-        vxCache.io.mem_req_ready := vxCacheToL2A.ready
+        vxCache.io.mem_req_ready := vxCacheToL2A.ready 

-        vxCacheToL2A.valid := Mux(
-            vxCache.io.mem_req_rw,
-            vxCache.io.mem_req_valid && sourceGen.io.id.valid,
-            vxCache.io.mem_req_valid
-        )
+        vxCacheToL2A.valid := vxCache.io.mem_req_valid && sourceGen.io.id.valid

        vxCacheToL2A.bits.opcode := Mux(
            vxCache.io.mem_req_rw, 
@@ -230,21 +299,19 @@ class VortexFatBankImp (
        )

        vxCacheToL2A.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W))
+
        vxCacheToL2A.bits.mask    := Mux(
            vxCache.io.mem_req_rw, 
            vxCache.io.mem_req_byteen,
            0xFFFF.U
        )
+
        vxCacheToL2A.bits.data    := vxCache.io.mem_req_data

-        
-        vxCacheToL2A.bits.source  := Mux(
-            vxCache.io.mem_req_rw,
-            sourceGen.io.id.bits,
-            vxCache.io.mem_req_tag + 16.U
-        )
-        //mark current source_id as in-use
-        sourceGen.io.gen := vxCache.io.mem_req_rw && vxCacheToL2A.ready && vxCacheToL2A.valid
+        vxCacheToL2A.bits.source  := sourceGen.io.id.bits
+
+        sourceGen.io.gen  := vxCacheToL2A.ready && vxCacheToL2A.valid
+        sourceGen.io.meta := vxCache.io.mem_req_tag //save the old read id

        vxCacheToL2A.bits.param   := 0.U
        vxCacheToL2A.bits.size    := 4.U
@@ -257,10 +324,11 @@ class VortexFatBankImp (
        vxCacheToL2D.ready := vxCache.io.mem_rsp_ready

        vxCache.io.mem_rsp_valid := vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAckData
-        vxCache.io.mem_rsp_tag   := vxCacheToL2D.bits.source - 16.U // -16 for read resp, we can safely do this, since write-ack wouldn't pass through
-        vxCache.io.mem_rsp_data := vxCacheToL2D.bits.data
+        vxCache.io.mem_rsp_tag   := sourceGen.io.peek
+        vxCache.io.mem_rsp_data  := vxCacheToL2D.bits.data

-        sourceGen.io.reclaim.valid := vxCacheToL2D.ready && vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAck
+        // all ids needs to be reclaimed
+        sourceGen.io.reclaim.valid := vxCacheToL2D.ready && vxCacheToL2D.valid
        sourceGen.io.reclaim.bits := vxCacheToL2D.bits.source

    }