From 127d7613e1414e374b525188226a1b6bb5de7c32 Mon Sep 17 00:00:00 2001
From: joshua <jyou12@berkeley.edu>
Date: Mon, 9 Oct 2023 14:49:57 -0700
Subject: [PATCH 1/3] add vortex fat bank + test (not compiling atm)

---
 src/main/resources/vsrc/vortex            |   2 +-
 src/main/scala/rocket/VortexFatBank.scala | 273 ++++++++++++++++++++++
 2 files changed, 274 insertions(+), 1 deletion(-)
 create mode 100644 src/main/scala/rocket/VortexFatBank.scala

diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex
index 1081f8a..0ddf415 160000
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
@@ -1 +1 @@
-Subproject commit 1081f8a485565d3a6db0cfa416a5bd9869185f1f
+Subproject commit 0ddf4152ee0d4d69e5bd42213f5e51bda4e74ac1
diff --git a/src/main/scala/rocket/VortexFatBank.scala b/src/main/scala/rocket/VortexFatBank.scala
new file mode 100644
index 0000000..c1b6709
--- /dev/null
+++ b/src/main/scala/rocket/VortexFatBank.scala
@@ -0,0 +1,273 @@
+package rocket
+
+import chisel3._
+import chisel3.util._
+import chisel3.experimental._
+import org.chipsalliance.cde.config.Parameters
+import freechips.rocketchip.diplomacy._
+import freechips.rocketchip.tilelink._
+
+// VortexTile has dmemNodes, imemNodes
+
+class VortexFatBank (
+
+) (implicit p: Parameters) extends LazyModule {
+
+    val clientParam = Seq(TLMasterPortParameters.v1(
+        clients = Seq(
+            TLMasterParameters.v1(
+                name = "VortexFatBank",
+                sourceId = IdRange(0, 1 << 14), // FIXME: magic number
+                supportsProbe = TransferSizes(1, 16),
+                supportsGet = TransferSizes(1, 16),
+                supportsPutFull = TransferSizes(1, 16),
+                supportsPutPartial = TransferSizes(1, 16)
+            )
+        )
+    ))
+
+    val managerParam = Seq(TLSlavePortParameters.v1(
+        beatBytes = 16,
+        managers = Seq(
+            TLSlaveParameters.v1(
+                address            = Seq(AddressSet(0x0, 0xffffff)), // intercept all requests (it does not like all F for some reason)
+                regionType         = RegionType.IDEMPOTENT, // idk what this does
+                executable         = false,
+                supportsGet        = TransferSizes(1, 16),
+                supportsPutPartial = TransferSizes(1, 16),
+                supportsPutFull    = TransferSizes(1, 16),
+                fifoId             = Some(0)
+            )
+        )
+    ))
+
+    val coalToBankNode = TLManagerNode(managerParam)
+    val bankToL2Node = TLClientNode(clientParam)
+    lazy val module = new VortexFatBankImp(this);
+}
+
+class VortexFatBankImp (
+    outer: VortexFatBank
+) extends LazyModuleImp(outer) {
+    val bank = Module(new VX_Cache());
+
+    bank.io.clk := clock
+    bank.io.reset := reset
+
+    // Translate TL request from Coalescer to requests for VX_Cache
+    def TLReq2VXReq = {
+        val (coalToBankBundle, _) = outer.coalToBankNode.in.head
+        // coal -> bank request on channel A
+        val coalToBankA = coalToBankBundle.a;
+        
+        coalToBankA.ready := bank.io.core_req_ready
+        bank.io.core_req_valid := coalToBankA.valid
+
+        // read = 0, write = 1
+        bank.io.core_req_rw     := !(coalToBankA.bits.opcode === TLMessages.Get)
+        bank.io.core_req_addr   := coalToBankA.bits.address(31, 4)
+        bank.io.core_req_byteen := coalToBankA.bits.mask
+        bank.io.core_req_data   := coalToBankA.bits.data
+        bank.io.core_req_tag    := coalToBankA.bits.source
+
+        // we ignore param, size, corrupt fields
+
+        // bank -> coal response on channel D
+        val coalToBankD = coalToBankBundle.d;
+
+        bank.io.core_rsp_ready := coalToBankD.ready
+        coalToBankD.valid := bank.io.core_rsp_valid
+
+        // Cache is not TL compliant since we don't generate AccessAcks on successful writes
+        // but VortexCore is not expecting them anyways
+        coalToBankD.bits.opcode  := TLMessages.AccessAckData
+        coalToBankD.bits.param   := 0.U
+        coalToBankD.bits.size    := 4.U
+        coalToBankD.bits.sink    := 0.U
+        coalToBankD.bits.denied  := false.B
+        coalToBankD.bits.corrupt := false.B
+
+        coalToBankD.bits.data   := bank.io.core_rsp_data
+        coalToBankD.bits.source := bank.io.core_rsp_tag
+    }
+
+    // Translate VX_Cache mem request to a TL request to be sent to L2
+    def VXReq2TLReq = {
+        val (bankToL2Bundle, _) = outer.bankToL2Node.out.head
+        // bank -> L2 request on channel A
+        val bankToL2A = bankToL2Bundle.a;
+        
+
+        bank.io.mem_req_ready := bankToL2A.ready
+        bankToL2A.valid := bank.io.mem_req_valid 
+
+        bankToL2A.bits.opcode := Mux(
+            bank.io.mem_req_rw, 
+            Mux(bank.io.mem_req_byteen.andR, TLMessages.PutFullData, TLMessages.PutPartialData), 
+            TLMessages.Get
+        )
+        bankToL2A.bits.address := Cat(bank.io.mem_req_addr, 0.U(4.W))
+        bankToL2A.bits.mask    := bank.io.mem_req_byteen
+        bankToL2A.bits.data    := bank.io.mem_req_data
+        bankToL2A.bits.source  := bank.io.mem_req_tag
+
+        bankToL2A.bits.param   := 0.U
+        bankToL2A.bits.size    := 4.U
+        bankToL2A.bits.corrupt := false.B
+
+        // we ignore param, size, corrupt fields
+
+        // L2 -> bank response on channel D
+        val bankToL2D = bankToL2Bundle.d;
+
+        bankToL2D.ready := bank.io.mem_rsp_ready
+        bank.io.mem_rsp_valid := (bankToL2D.valid && (bankToL2D.bits.opcode === TLMessages.AccessAckData)) // need to ignore AccessAcks
+
+        bank.io.mem_rsp_tag  := bankToL2D.bits.source
+        bank.io.mem_rsp_data := bankToL2D.bits.data
+
+    }
+
+    TLReq2VXReq
+    VXReq2TLReq
+
+}
+
+class VX_Cache (
+    CACHE_ID: Int = 0,
+    CACHE_SIZE: Int = 16384,
+    CACHE_LINE_SIZE: Int = 16,
+    NUM_PORTS: Int = 1, 
+    WORD_SIZE: Int = 16, // hack - one "word" is enough to satisfy all 4 warps after decoalescing.
+    CREQ_SIZE: Int = 0,
+    CRSQ_SIZE: Int = 2,
+    MSHR_SIZE: Int = 8,
+    MRSQ_SIZE: Int = 0,
+    MREQ_SIZE: Int = 4,
+    WRITE_ENABLE: Int = 1,
+    CORE_TAG_WIDTH: Int = 10, // source ID ranges from 0 to 1 << 10
+    CORE_TAG_ID_BITS: Int = 5, // no idea what this is, just match it with default L1 dcache
+    BANK_ADDR_OFFSET: Int = 0,
+    NC_ENABLE: Int = 1, // Unsure what this does, but it's elaborated as 1 in default L1 setup so hopefully this is ok
+    WORD_ADDR_WIDTH: Int = 28, // 16 byte "word" = 4 bits
+    MEM_TAG_WIDTH: Int = 14, // Elaborated value is also completely different from (32 - log2Ceil(CACHE_LINE_SIZE)). This should match with sourceIds on client node associated with this cache
+    MEM_ADDR_WIDTH: Int = 28 // 16 byte cache line = 4 bits
+) extends BlackBox (
+    Map(
+        "CACHE_ID" -> CACHE_ID,
+        "NUM_REQS" -> 1, //Force NUM_REQS to be 1, we use their Cache as our individual Bank
+        "CACHE_SIZE" -> CACHE_SIZE,
+        "CACHE_LINE_SIZE" -> CACHE_LINE_SIZE,
+        "NUM_PORTS" -> NUM_PORTS,
+        "WORD_SIZE" -> WORD_SIZE,
+        "CREQ_SIZE" -> CREQ_SIZE,
+        "CRSQ_SIZE" -> CRSQ_SIZE,
+        "MSHR_SIZE" -> MSHR_SIZE,
+        "MRSQ_SIZE" -> MRSQ_SIZE,
+        "MREQ_SIZE" -> MREQ_SIZE,
+        "WRITE_ENABLE" -> WRITE_ENABLE,
+        "CORE_TAG_WIDTH" -> CORE_TAG_WIDTH,
+        "CORE_TAG_ID_BITS" -> CORE_TAG_ID_BITS,
+        "MEM_TAG_WIDTH" -> MEM_TAG_WIDTH,
+        "BANK_ADDR_OFFSET" -> BANK_ADDR_OFFSET,
+        "NC_ENABLE" -> NC_ENABLE,
+    )
+) with HasBlackBoxResource {
+
+    val io = IO(new Bundle {
+        val clk = Input(Clock())
+        val reset = Input(Reset())
+
+        // We should be able to turn the following into TileLink easily
+
+        // CACHE <> CORE
+        val core_req_valid = Input(Bool())
+        val core_req_rw = Input(Bool())
+        val core_req_addr = Input(UInt(WORD_ADDR_WIDTH.W))
+        val core_req_byteen = Input(UInt(WORD_SIZE.W))
+        val core_req_data = Input(UInt((WORD_SIZE * 8).W))
+        val core_req_tag = Input(UInt(CORE_TAG_WIDTH.W))
+        val core_req_ready = Output(Bool())
+
+        val core_rsp_valid = Output(Bool())  // 1 bit wide
+        val core_rsp_tmask = Output(Bool())  // 1 bit wide, probably can ignore (check waveform)
+        val core_rsp_data = Output(UInt((WORD_SIZE * 8).W))
+        val core_rsp_tag = Output(UInt(CORE_TAG_WIDTH.W))
+        val core_rsp_ready = Input(Bool())
+
+        // CACHE <> L2
+        val mem_req_valid = Output(Bool())
+        val mem_req_rw = Output(Bool())
+        val mem_req_byteen = Output(UInt(CACHE_LINE_SIZE.W))
+        val mem_req_addr = Output(UInt(MEM_ADDR_WIDTH.W))
+        val mem_req_data = Output(UInt((CACHE_LINE_SIZE * 8).W))
+        val mem_req_tag = Output(UInt(MEM_TAG_WIDTH.W))
+        val mem_req_ready = Input(Bool())
+
+        val mem_rsp_valid = Input(Bool())
+        val mem_rsp_data = Input(UInt((CACHE_LINE_SIZE * 8).W))
+        val mem_rsp_tag = Input(UInt(MEM_TAG_WIDTH.W))
+        val mem_rsp_ready = Output(Bool())
+    })
+
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_define.vh")
+    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_mux.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_lzc.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_fifo_queue.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_scan.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_find_first.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_multiplier.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_bits_remove.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_register.sv")
+    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_mux.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_encoder.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_reset_relay.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_popcount.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_bits_insert.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_skid_buffer.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_fixed_arbiter.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_shift_register.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_index_buffer.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_encoder.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_matrix_arbiter.sv")
+    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_divider.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_dp_ram.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_elastic_buffer.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_rr_arbiter.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_arbiter.sv")
+    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_bypass_buffer.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_sp_ram.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_demux.sv")
+    
+    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_index_queue.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_serial_div.sv")
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_fair_arbiter.sv")
+
+    addResource("/vsrc/vortex/hw/rtl/VX_define.vh")
+    addResource("/vsrc/vortex/hw/VX_config.h")
+
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_rsp_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_req_if.sv")
+
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_cache_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_memsys_if.sv")
+ 
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_rsp_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_req_if.sv")
+ 
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_rsp_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_req_if.sv")
+    
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_shared_mem.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_core_rsp_merge.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_tag_access.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_core_req_bank_sel.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_bank.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_data_access.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_flush_ctrl.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_nc_bypass.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_miss_resrv.sv")
+    addResource("/vsrc/vortex/hw/rtl/cache/VX_cache.sv")
+
+}

From e50903ed42833aebc8aa967f54a8d91d4cd100ef Mon Sep 17 00:00:00 2001
From: Vamber Yang <vamber@berkeley.edu>
Date: Mon, 16 Oct 2023 10:31:31 -0700
Subject: [PATCH 2/3] VX_FatBank runs in SoC Config with Coalescer till
 termination. Issues addressed: 1. FatBank ack to downstream coalescer with
 the correct size on ChannelD; 2. FatBank ack to downstream coalescer
 immediately after W Req; 3. FatBank generates unique ID for W Req to L2;
 4. Allows coalescer to configure max Coal to L1 ReadSize at compile time

Ongoing issues:
1. Magic Number
2. Verification
3. Multi-Bank Integration
---
 src/main/scala/rocket/VortexFatBank.scala     | 359 ++++++++++++++----
 .../scala/tilelink/CanHaveMemtraceCore.scala  |  35 +-
 src/main/scala/tilelink/Coalescing.scala      |   1 +
 3 files changed, 313 insertions(+), 82 deletions(-)

diff --git a/src/main/scala/rocket/VortexFatBank.scala b/src/main/scala/rocket/VortexFatBank.scala
index c1b6709..9481af8 100644
--- a/src/main/scala/rocket/VortexFatBank.scala
+++ b/src/main/scala/rocket/VortexFatBank.scala
@@ -1,4 +1,5 @@
-package rocket
+//package freechips.rocketchip.rocket
+package freechips.rocketchip.tilelink
 
 import chisel3._
 import chisel3.util._
@@ -6,125 +7,261 @@ import chisel3.experimental._
 import org.chipsalliance.cde.config.Parameters
 import freechips.rocketchip.diplomacy._
 import freechips.rocketchip.tilelink._
+import org.chipsalliance.cde.config.{Parameters, Field}
+
 
 // VortexTile has dmemNodes, imemNodes
 
-class VortexFatBank (
+//Param and Key are used during SoC Generation
 
-) (implicit p: Parameters) extends LazyModule {
+case class VortexFatBankParam(wordSize: Int = 16, busWidthInBytes: Int = 8)
+case object VortexFatBankKey extends Field[Option[VortexFatBankConfig]](None /*default*/)
+
+case class VortexFatBankConfig(
+    wordSize: Int,      //This is the read/write granularity of the L1 cache
+    cacheLineSize: Int,
+    coreTagWidth: Int,
+) {
+    def coreTagPlusSizeWidth: Int = {
+        log2Ceil(wordSize) + coreTagWidth
+    }
+}
+
+object defaultFatBankConfig extends VortexFatBankConfig(
+    wordSize = 16,
+    cacheLineSize = 16,
+    coreTagWidth = 8,
+)
+
+
+class VortexFatBank (config: VortexFatBankConfig) (implicit p: Parameters) extends LazyModule {
 
     val clientParam = Seq(TLMasterPortParameters.v1(
         clients = Seq(
             TLMasterParameters.v1(
                 name = "VortexFatBank",
                 sourceId = IdRange(0, 1 << 14), // FIXME: magic number
-                supportsProbe = TransferSizes(1, 16),
-                supportsGet = TransferSizes(1, 16),
-                supportsPutFull = TransferSizes(1, 16),
-                supportsPutPartial = TransferSizes(1, 16)
+                supportsProbe = TransferSizes(1, config.wordSize),
+                supportsGet = TransferSizes(1, config.wordSize),
+                supportsPutFull = TransferSizes(1, config.wordSize),
+                supportsPutPartial = TransferSizes(1, config.wordSize)
             )
         )
     ))
 
     val managerParam = Seq(TLSlavePortParameters.v1(
-        beatBytes = 16,
+        beatBytes = config.wordSize,
         managers = Seq(
             TLSlaveParameters.v1(
-                address            = Seq(AddressSet(0x0, 0xffffff)), // intercept all requests (it does not like all F for some reason)
+                address            = Seq(AddressSet(0x80000000L, 0xfffffff)), // 0x80000000 -> 0x90000000 are possible address tracer can emit
                 regionType         = RegionType.IDEMPOTENT, // idk what this does
                 executable         = false,
-                supportsGet        = TransferSizes(1, 16),
-                supportsPutPartial = TransferSizes(1, 16),
-                supportsPutFull    = TransferSizes(1, 16),
+                supportsGet        = TransferSizes(1, config.wordSize),
+                supportsPutPartial = TransferSizes(1, config.wordSize),
+                supportsPutFull    = TransferSizes(1, config.wordSize),
                 fifoId             = Some(0)
             )
         )
     ))
 
-    val coalToBankNode = TLManagerNode(managerParam)
-    val bankToL2Node = TLClientNode(clientParam)
-    lazy val module = new VortexFatBankImp(this);
+    val coalToVxCacheNode = TLManagerNode(managerParam)
+    val vxCacheToL2Node = TLIdentityNode()
+    val vxCacheFetchNode = TLClientNode(clientParam)
+    
+    //We need this widthWidget here, because whenever the fatBank is performing
+    //read and write to Mem, it must have the illusion that dataWidth is as big as
+    //as its cacheline size
+    vxCacheToL2Node := TLWidthWidget(config.cacheLineSize) := vxCacheFetchNode
+    lazy val module = new VortexFatBankImp(this, config);
 }
 
 class VortexFatBankImp (
-    outer: VortexFatBank
+    outer: VortexFatBank,
+    config: VortexFatBankConfig
 ) extends LazyModuleImp(outer) {
-    val bank = Module(new VX_Cache());
 
-    bank.io.clk := clock
-    bank.io.reset := reset
+    val vxCache = Module(new VX_cache(
+        WORD_SIZE=config.wordSize, 
+        CACHE_LINE_SIZE=config.cacheLineSize,
+        CORE_TAG_WIDTH= config.coreTagPlusSizeWidth
+        )
+    );
 
-    // Translate TL request from Coalescer to requests for VX_Cache
+    vxCache.io.clk := clock
+    vxCache.io.reset := reset
+
+    class WriteReqInfo extends Bundle {
+        val id = UInt(32.W)
+        val size = UInt(32.W)
+    }
+    
+
+    //<FIXME> assuming this is never full
+    val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, 64, true, false))
+    
+    class ReadReqInfo(config: VortexFatBankConfig) extends Bundle {
+        val size = UInt(log2Ceil(config.wordSize).W)
+        val id   = UInt(config.coreTagWidth.W)
+    }
+
+    val readReqInfo = Wire(new ReadReqInfo(config))
+
+    // Translate TL request from Coalescer to requests for VX_cache
     def TLReq2VXReq = {
-        val (coalToBankBundle, _) = outer.coalToBankNode.in.head
-        // coal -> bank request on channel A
+        val (coalToBankBundle, _) = outer.coalToVxCacheNode.in.head
+        // coal -> vxCache request on channel A
         val coalToBankA = coalToBankBundle.a;
         
-        coalToBankA.ready := bank.io.core_req_ready
-        bank.io.core_req_valid := coalToBankA.valid
+        coalToBankA.ready := vxCache.io.core_req_ready
+        vxCache.io.core_req_valid := coalToBankA.valid
 
         // read = 0, write = 1
-        bank.io.core_req_rw     := !(coalToBankA.bits.opcode === TLMessages.Get)
-        bank.io.core_req_addr   := coalToBankA.bits.address(31, 4)
-        bank.io.core_req_byteen := coalToBankA.bits.mask
-        bank.io.core_req_data   := coalToBankA.bits.data
-        bank.io.core_req_tag    := coalToBankA.bits.source
-
+        vxCache.io.core_req_rw     := !(coalToBankA.bits.opcode === TLMessages.Get)
+        //4 is also hardcoded, it should be log2WordSize
+        vxCache.io.core_req_addr   := coalToBankA.bits.address(31, 4)
+        vxCache.io.core_req_byteen := coalToBankA.bits.mask
+        vxCache.io.core_req_data   := coalToBankA.bits.data
+        
+        readReqInfo.id   := coalToBankA.bits.source
+        readReqInfo.size := coalToBankA.bits.size
+        vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)
+        
+        
         // we ignore param, size, corrupt fields
 
-        // bank -> coal response on channel D
+        // vxCache -> coal response on channel D
+        // ok ... this part is a little tricky, the downstream coalescer requires the L1 cache
+        // to send ack and dataAck, this is how coalescer knows when an inflight ID has retired
+        // if we don't send ack, the coalescer will run out of IDs, and can't generate new request
+
+        // for read request, we send AckData when the FatBank has a valid output
+        // for write request, we can immediate Ack on the next clock cycle (not the same clock cycle, otherwise critical path too long)
+        // It's possible that on the same cycle, we need to do both "AckData" and "Ack"
+        //    in this case, we always priorize "Ack", this makes the design easier
+
+        //I think this just shows the flaws of Tilelink. CPU never waits for an Ack upon regular write request
+        //the Core should unconditionally move forward after every regular write request
+
         val coalToBankD = coalToBankBundle.d;
 
-        bank.io.core_rsp_ready := coalToBankD.ready
-        coalToBankD.valid := bank.io.core_rsp_valid
 
-        // Cache is not TL compliant since we don't generate AccessAcks on successful writes
-        // but VortexCore is not expecting them anyways
-        coalToBankD.bits.opcode  := TLMessages.AccessAckData
+        //<FIXME> currently assuming below buffer is never full
+        rcvWriteReqInfo.io.enq.valid     := !(coalToBankA.bits.opcode === TLMessages.Get) && coalToBankA.valid && coalToBankA.ready
+        rcvWriteReqInfo.io.enq.bits.id   := coalToBankA.bits.source
+        rcvWriteReqInfo.io.enq.bits.size := coalToBankA.bits.size
+
+
+        rcvWriteReqInfo.io.deq.ready := coalToBankD.ready
+        
+        //if we "need" to do Ack
+        //we unconditionally set the vxCache.ready to be false, so it gets delayed
+        vxCache.io.core_rsp_ready := Mux(
+            rcvWriteReqInfo.io.deq.valid,
+            false.B,
+            coalToBankD.ready
+        )
+
+        coalToBankD.valid := Mux(
+            rcvWriteReqInfo.io.deq.valid,
+            true.B,
+            vxCache.io.core_rsp_valid
+        )
+
+        coalToBankD.bits.source := Mux(
+            rcvWriteReqInfo.io.deq.valid,
+            rcvWriteReqInfo.io.deq.bits.id,
+            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id
+        )
+
+        coalToBankD.bits.opcode  := Mux(
+            rcvWriteReqInfo.io.deq.valid,
+            TLMessages.AccessAck,
+            TLMessages.AccessAckData
+        )
+
+        coalToBankD.bits.size := Mux(
+            rcvWriteReqInfo.io.deq.valid,
+            rcvWriteReqInfo.io.deq.bits.size,
+            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size
+        )
+
         coalToBankD.bits.param   := 0.U
-        coalToBankD.bits.size    := 4.U
         coalToBankD.bits.sink    := 0.U
         coalToBankD.bits.denied  := false.B
         coalToBankD.bits.corrupt := false.B
 
-        coalToBankD.bits.data   := bank.io.core_rsp_data
-        coalToBankD.bits.source := bank.io.core_rsp_tag
+        coalToBankD.bits.data   := vxCache.io.core_rsp_data
     }
 
-    // Translate VX_Cache mem request to a TL request to be sent to L2
+
+    //Using Hansung's Source Generator
+    //Why do we need to do this, what is the issue ?
+    //Tilelink requires all inflight Read and Write Request to have a unique source_ID
+    //vx_cache can indeed guarantee that all active read operation has unique ID
+    //However, since the cache is write_through, so it can't ensure unique ID for write operation
+    //Therefore, we need our own internal source_ID generator for all write operation
+    //
+    //Now, we allocate id range: 0-15 for all write operation
+    //                    and    16-> above for read operation
+    val sourceGen = Module( new SourceGenerator(log2Ceil(16), ignoreInUse = false))
+
+
+    // Translate VX_cache mem request to a TL request to be sent to L2
     def VXReq2TLReq = {
-        val (bankToL2Bundle, _) = outer.bankToL2Node.out.head
-        // bank -> L2 request on channel A
-        val bankToL2A = bankToL2Bundle.a;
+        val (vxCacheToL2Bundle, _) = outer.vxCacheFetchNode.out.head
+        // vxCache -> L2 request on channel A
+        val vxCacheToL2A = vxCacheToL2Bundle.a;
         
 
-        bank.io.mem_req_ready := bankToL2A.ready
-        bankToL2A.valid := bank.io.mem_req_valid 
+        //Read Operation is ready as long as downstream L2 is ready
 
-        bankToL2A.bits.opcode := Mux(
-            bank.io.mem_req_rw, 
-            Mux(bank.io.mem_req_byteen.andR, TLMessages.PutFullData, TLMessages.PutPartialData), 
+        vxCache.io.mem_req_ready := vxCacheToL2A.ready
+
+        vxCacheToL2A.valid := Mux(
+            vxCache.io.mem_req_rw,
+            vxCache.io.mem_req_valid && sourceGen.io.id.valid,
+            vxCache.io.mem_req_valid
+        )
+
+        vxCacheToL2A.bits.opcode := Mux(
+            vxCache.io.mem_req_rw, 
+            Mux(vxCache.io.mem_req_byteen.andR, TLMessages.PutFullData, TLMessages.PutPartialData), 
             TLMessages.Get
         )
-        bankToL2A.bits.address := Cat(bank.io.mem_req_addr, 0.U(4.W))
-        bankToL2A.bits.mask    := bank.io.mem_req_byteen
-        bankToL2A.bits.data    := bank.io.mem_req_data
-        bankToL2A.bits.source  := bank.io.mem_req_tag
 
-        bankToL2A.bits.param   := 0.U
-        bankToL2A.bits.size    := 4.U
-        bankToL2A.bits.corrupt := false.B
+        vxCacheToL2A.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W))
+        vxCacheToL2A.bits.mask    := Mux(
+            vxCache.io.mem_req_rw, 
+            vxCache.io.mem_req_byteen,
+            0xFFFF.U
+        )
+        vxCacheToL2A.bits.data    := vxCache.io.mem_req_data
+
+        
+        vxCacheToL2A.bits.source  := Mux(
+            vxCache.io.mem_req_rw,
+            sourceGen.io.id.bits,
+            vxCache.io.mem_req_tag + 16.U
+        )
+        //mark current source_id as in-use
+        sourceGen.io.gen := vxCache.io.mem_req_rw && vxCacheToL2A.ready && vxCacheToL2A.valid
+
+        vxCacheToL2A.bits.param   := 0.U
+        vxCacheToL2A.bits.size    := 4.U
+        vxCacheToL2A.bits.corrupt := false.B
 
         // we ignore param, size, corrupt fields
 
-        // L2 -> bank response on channel D
-        val bankToL2D = bankToL2Bundle.d;
+        // L2 -> vxCache response on channel D
+        val vxCacheToL2D = vxCacheToL2Bundle.d;
+        vxCacheToL2D.ready := vxCache.io.mem_rsp_ready
 
-        bankToL2D.ready := bank.io.mem_rsp_ready
-        bank.io.mem_rsp_valid := (bankToL2D.valid && (bankToL2D.bits.opcode === TLMessages.AccessAckData)) // need to ignore AccessAcks
+        vxCache.io.mem_rsp_valid := vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAckData
+        vxCache.io.mem_rsp_tag   := vxCacheToL2D.bits.source - 16.U // -16 for read resp, we can safely do this, since write-ack wouldn't pass through
+        vxCache.io.mem_rsp_data := vxCacheToL2D.bits.data
 
-        bank.io.mem_rsp_tag  := bankToL2D.bits.source
-        bank.io.mem_rsp_data := bankToL2D.bits.data
+        sourceGen.io.reclaim.valid := vxCacheToL2D.ready && vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAck
+        sourceGen.io.reclaim.bits := vxCacheToL2D.bits.source
 
     }
 
@@ -133,7 +270,7 @@ class VortexFatBankImp (
 
 }
 
-class VX_Cache (
+class VX_cache (
     CACHE_ID: Int = 0,
     CACHE_SIZE: Int = 16384,
     CACHE_LINE_SIZE: Int = 16,
@@ -145,10 +282,10 @@ class VX_Cache (
     MRSQ_SIZE: Int = 0,
     MREQ_SIZE: Int = 4,
     WRITE_ENABLE: Int = 1,
-    CORE_TAG_WIDTH: Int = 10, // source ID ranges from 0 to 1 << 10
+    CORE_TAG_WIDTH: Int = 10, // source ID ranges from 0 to 1 << 10, we need to allocate upper bits to save size
     CORE_TAG_ID_BITS: Int = 5, // no idea what this is, just match it with default L1 dcache
     BANK_ADDR_OFFSET: Int = 0,
-    NC_ENABLE: Int = 1, // Unsure what this does, but it's elaborated as 1 in default L1 setup so hopefully this is ok
+    NC_ENABLE: Int = 0, //NC_ENABLE=1 means the cache becomes a passthrough
     WORD_ADDR_WIDTH: Int = 28, // 16 byte "word" = 4 bits
     MEM_TAG_WIDTH: Int = 14, // Elaborated value is also completely different from (32 - log2Ceil(CACHE_LINE_SIZE)). This should match with sourceIds on client node associated with this cache
     MEM_ADDR_WIDTH: Int = 28 // 16 byte cache line = 4 bits
@@ -210,8 +347,33 @@ class VX_Cache (
         val mem_rsp_ready = Output(Bool())
     })
 
+
+    addResource("/vsrc/vortex/hw/rtl/VX_dispatch.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_issue.sv")
     addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_define.vh")
-    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_mux.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_warp_sched.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_sat.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_stride.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_lerp.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_addr.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_mem.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_format.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_sampler.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_unit.sv")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_define.vh")
+    addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_wrap.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_scope.vh")
+    addResource("/vsrc/vortex/hw/rtl/VX_fpu_unit.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_scoreboard.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_writeback.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_muldiv.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_decode.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_ibuffer.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_icache_stage.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_gpu_unit.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_trace_instr.vh")
+    addResource("/vsrc/vortex/hw/rtl/VX_gpu_types.vh")
+    addResource("/vsrc/vortex/hw/rtl/VX_config.vh")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_lzc.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_fifo_queue.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_scan.sv")
@@ -219,7 +381,6 @@ class VX_Cache (
     addResource("/vsrc/vortex/hw/rtl/libs/VX_multiplier.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_bits_remove.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_register.sv")
-    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_mux.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_encoder.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_reset_relay.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_popcount.sv")
@@ -230,36 +391,80 @@ class VX_Cache (
     addResource("/vsrc/vortex/hw/rtl/libs/VX_index_buffer.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_encoder.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_matrix_arbiter.sv")
-    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_divider.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_dp_ram.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_elastic_buffer.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_rr_arbiter.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_arbiter.sv")
-    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_bypass_buffer.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_sp_ram.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_demux.sv")
-    
-    // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_index_queue.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_serial_div.sv")
     addResource("/vsrc/vortex/hw/rtl/libs/VX_fair_arbiter.sv")
-
+    addResource("/vsrc/vortex/hw/rtl/libs/VX_pending_size.sv")
     addResource("/vsrc/vortex/hw/rtl/VX_define.vh")
+    addResource("/vsrc/vortex/hw/rtl/VX_csr_data.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_cache_arb.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_ipdom_stack.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_gpr_stage.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_execute.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_fetch.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_alu_unit.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_platform.vh")
+    addResource("/vsrc/vortex/hw/rtl/VX_commit.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_pipeline.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_lsu_unit.sv")
+    addResource("/vsrc/vortex/hw/rtl/VX_csr_unit.sv")
     addResource("/vsrc/vortex/hw/VX_config.h")
-
+    addResource("/vsrc/vortex/sim/common/rvfloats.h")
+    addResource("/vsrc/vortex/sim/common/rvfloats.cpp")
+    addResource("/csrc/softfloat/include/internals.h")
+    addResource("/csrc/softfloat/include/primitives.h")
+    addResource("/csrc/softfloat/include/primitiveTypes.h")
+    addResource("/csrc/softfloat/include/softfloat.h")
+    addResource("/csrc/softfloat/include/softfloat_types.h")
+    addResource("/csrc/softfloat/RISCV/specialize.h")
+    addResource("/vsrc/vortex/hw/dpi/float_dpi.cpp")
+    addResource("/vsrc/vortex/hw/dpi/float_dpi.vh")
+    addResource("/vsrc/vortex/hw/dpi/util_dpi.cpp")
+    addResource("/vsrc/vortex/hw/dpi/util_dpi.vh")
+    addResource("/vsrc/vortex/hw/rtl/fp_cores/VX_fpu_dpi.sv")
+    addResource("/vsrc/vortex/hw/rtl/fp_cores/VX_fpu_define.vh")
+    addResource("/vsrc/vortex/hw/rtl/fp_cores/VX_fpu_types.vh")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_rsp_if.sv")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_req_if.sv")
-
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_tex_csr_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_join_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_ifetch_req_if.sv")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_cache_if.sv")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_memsys_if.sv")
- 
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_gpr_req_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_decode_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_writeback_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_gpu_req_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_pipeline_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_gpr_rsp_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_cmt_to_csr_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_csr_to_alu_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_ifetch_rsp_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_alu_req_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_csr_req_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_ibuffer_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_branch_ctl_if.sv")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_rsp_if.sv")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_req_if.sv")
- 
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_lsu_req_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_wstall_if.sv")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_rsp_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_fpu_to_csr_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_commit_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_tex_req_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_warp_ctl_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_tex_rsp_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_fetch_to_csr_if.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_tex_if.sv")
     addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_req_if.sv")
-    
-    addResource("/vsrc/vortex/hw/rtl/cache/VX_shared_mem.sv")
+    addResource("/vsrc/vortex/hw/rtl/interfaces/VX_fpu_req_if.sv")
+    //addResource("/vsrc/vortex/hw/rtl/cache/VX_shared_mem.sv")
     addResource("/vsrc/vortex/hw/rtl/cache/VX_core_rsp_merge.sv")
     addResource("/vsrc/vortex/hw/rtl/cache/VX_tag_access.sv")
     addResource("/vsrc/vortex/hw/rtl/cache/VX_core_req_bank_sel.sv")
diff --git a/src/main/scala/tilelink/CanHaveMemtraceCore.scala b/src/main/scala/tilelink/CanHaveMemtraceCore.scala
index 0bcbd70..51e45c6 100644
--- a/src/main/scala/tilelink/CanHaveMemtraceCore.scala
+++ b/src/main/scala/tilelink/CanHaveMemtraceCore.scala
@@ -3,6 +3,8 @@ package freechips.rocketchip.tilelink
 import freechips.rocketchip.diplomacy.LazyModule
 import freechips.rocketchip.subsystem.BaseSubsystem
 import org.chipsalliance.cde.config.Parameters
+import freechips.rocketchip.rocket
+
 
 // TODO: possibly move to somewhere closer to CoalescingUnit
 // TODO: separate coalescer config from CanHaveMemtraceCore
@@ -44,16 +46,39 @@ trait CanHaveMemtraceCore { this: BaseSubsystem =>
       }
       case None => tracer.node
     }
-    val upstream = p(CoalXbarKey) match {
+    val coalXbar = p(CoalXbarKey) match {
       case Some(xbarParam) =>{
-        val priorityXbar = LazyModule(new CoalescerTLPriortyXBar)
-        println(s"============ Using Priority XBar for Coalescer Requests ")
-        priorityXbar.node :=* coalescerNode
-        priorityXbar.node
+        val coXbar = LazyModule(new TLXbar)
+        println(s"============ Using TLXBar for Coalescer Requests ")
+        coXbar.node :=* coalescerNode
+        coXbar.node
       }
       case None => coalescerNode
     }
+
+    val vortexBank = p(VortexFatBankKey) match {
+      case Some(fatBankParam) =>{
+        val vx_fatbank = LazyModule(new VortexFatBank(fatBankParam))
+        println(s"============ Using Vortex FatBank as L1 ")
+        vx_fatbank.coalToVxCacheNode :=* coalXbar
+        vx_fatbank.vxCacheToL2Node
+      }
+      case None => coalXbar
+    }
     
+    
+    // If there is only one bank, the extra TLXbar below is redundant
+    val upstream = p(CoalXbarKey) match {
+      case Some(xbarParam) =>{
+        val tileXbar = LazyModule(new TLXbar)
+        println(s"============ Using TLXBar for L1 Requests ")
+        tileXbar.node :=* vortexBank
+        tileXbar.node
+      }
+      case None => vortexBank
+    }
+  
+
     sbus.coupleFrom(s"gpu-tracer") { _ :=* upstream }
   }
 }
diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 1b24c20..217049a 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -774,6 +774,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
   println(s"    numNewSrcIds: ${config.numNewSrcIds}")
   println(s"    reqQueueDepth: ${config.queueDepth}")
   println(s"    respQueueDepth: ${config.respQueueDepth}")
+  println(s"    addressWidth: ${config.addressWidth}")
   println(s"}")
 
   require(

From 60a63d4e1116c8c22715b54b74621e3fcc2611c3 Mon Sep 17 00:00:00 2001
From: Vamber Yang <vamber@berkeley.edu>
Date: Sun, 22 Oct 2023 17:03:37 -0700
Subject: [PATCH 3/3] FatBank Integration Improvements: 1. ensure FatBank
 prioritize Ack read over Ack write to downstream    coalescer 2. Between
 FatBank and L2, use the new sourceGenerator to allow both Read and    Write
 Reqs sharing the same pool of available src_ids

---
 src/main/scala/rocket/VortexFatBank.scala | 180 +++++++++++++++-------
 1 file changed, 124 insertions(+), 56 deletions(-)

diff --git a/src/main/scala/rocket/VortexFatBank.scala b/src/main/scala/rocket/VortexFatBank.scala
index 9481af8..dddf348 100644
--- a/src/main/scala/rocket/VortexFatBank.scala
+++ b/src/main/scala/rocket/VortexFatBank.scala
@@ -10,6 +10,88 @@ import freechips.rocketchip.tilelink._
 import org.chipsalliance.cde.config.{Parameters, Field}
 
 
+
+
+//<FIXME> Delete the following NewSourceGenerator when merging with origin/graphics;
+//we should just use the one in Coalescing.scala written by hansung.
+
+class NewSourceGenerator[T <: Data]( // allocates source ids from an in-use table and frees them again on reclaim
+  sourceWidth: Int, // bit width of an id; the table tracks 1 << sourceWidth ids
+  metadata: Option[T] = None, // when defined, one value of this type is stored per allocated id
+  ignoreInUse: Boolean = false // when true, io.id.valid is tied high regardless of the table
+) extends Module {
+  def getMetadataType = metadata match { // hardware type of the meta/peek ports and of the table's meta field
+    case Some(gen) => gen.cloneType
+    case None => UInt(0.W)
+  }
+  val io = IO(new Bundle {
+    val gen = Input(Bool()) // request an id this cycle
+    val reclaim = Input(Valid(UInt(sourceWidth.W))) // id being returned to the pool
+    val id = Output(Valid(UInt(sourceWidth.W))) // id offered this cycle
+    // `meta` and `peek` are only meaningful when `metadata` is defined.
+    // `meta` is the input value written into the table row of the id that is
+    // handed out in a cycle where id generation succeeds.
+    // `peek` is the value read back from the row addressed by `reclaim.bits`,
+    // i.e. the metadata saved for the request that is now being reclaimed.
+    // Neither port is wrapped in ValidIO; the original author considers this
+    // safe because any response coming back should have allocated a table
+    // entry when its request went out.
+    val meta = Input(getMetadataType)
+    val peek = Output(getMetadataType)
+    // for debugging; high while at least one id is outstanding (not yet
+    // reclaimed) or while `gen` is asserted in the current cycle
+    val inflight = Output(Bool())
+  })
+  val head = RegInit(UInt(sourceWidth.W), 0.U) // NOTE(review): never read outside its own update below - looks unused
+  head := Mux(io.gen, head + 1.U, head)
+  // number of ids allocated but not yet reclaimed; drives io.inflight and the assertions at the bottom
+  val outstanding = RegInit(UInt((sourceWidth + 1).W), 0.U)
+  io.inflight := (outstanding > 0.U) || io.gen
+  // one table row per source id: the stored metadata plus an in-use flag (id.valid)
+  val numSourceId = 1 << sourceWidth
+  val row = new Bundle {
+    val meta = getMetadataType
+    val id = Valid(UInt(sourceWidth.W))
+  }
+  // valid: in use, invalid: available
+  // val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
+  val occupancyTable = Mem(numSourceId, row)
+  when(reset.asBool) { // mark every id as available while reset is asserted
+    (0 until numSourceId).foreach { occupancyTable(_).id.valid := false.B }
+  }
+  val frees = (0 until numSourceId).map(!occupancyTable(_).id.valid)
+  val lowestFree = PriorityEncoder(frees)
+  val lowestFreeRow = occupancyTable(lowestFree)
+  // offer the id picked by the priority encoder; it is valid only if that row is really free (or always, with ignoreInUse)
+  io.id.valid := (if (ignoreInUse) true.B else !lowestFreeRow.id.valid)
+  io.id.bits := lowestFree
+  when(io.gen && io.id.valid /* fire */ ) {
+    occupancyTable(io.id.bits).id.valid := true.B // mark in use
+    if (metadata.isDefined) {
+      occupancyTable(io.id.bits).meta := io.meta
+    }
+  }
+  when(io.reclaim.valid) {
+    // @perf: would this require multiple write ports?
+    occupancyTable(io.reclaim.bits).id.valid := false.B // mark freed
+  }
+  io.peek := { // metadata stored for the id on the reclaim port (constant 0 when no metadata type is given)
+    if (metadata.isDefined) occupancyTable(io.reclaim.bits).meta else 0.U
+  }
+  // bookkeeping: +1 on an allocation alone, -1 on a reclaim without an allocation, unchanged when both happen together
+  when(io.gen && io.id.valid) {
+    when (!io.reclaim.valid) {
+      assert(outstanding < (1 << sourceWidth).U)
+      outstanding := outstanding + 1.U
+    }
+  }.elsewhen(io.reclaim.valid) {
+    assert(outstanding > 0.U)
+    outstanding := outstanding - 1.U
+  }
+  dontTouch(outstanding)
+}
+
+
 // VortexTile has dmemNodes, imemNodes
 
 //Param and Key are used during SoC Generation
@@ -21,6 +103,8 @@ case class VortexFatBankConfig(
     wordSize: Int,      //This is the read/write granularity of the L1 cache
     cacheLineSize: Int,
     coreTagWidth: Int,
+    writeInfoReqQSize: Int,
+    mshrSize: Int
 ) {
     def coreTagPlusSizeWidth: Int = {
         log2Ceil(wordSize) + coreTagWidth
@@ -31,6 +115,8 @@ object defaultFatBankConfig extends VortexFatBankConfig(
     wordSize = 16,
     cacheLineSize = 16,
     coreTagWidth = 8,
+    writeInfoReqQSize = 16,
+    mshrSize = 8
 )
 
 
@@ -83,7 +169,8 @@ class VortexFatBankImp (
     val vxCache = Module(new VX_cache(
         WORD_SIZE=config.wordSize, 
         CACHE_LINE_SIZE=config.cacheLineSize,
-        CORE_TAG_WIDTH= config.coreTagPlusSizeWidth
+        CORE_TAG_WIDTH= config.coreTagPlusSizeWidth,
+        MSHR_SIZE= config.mshrSize
         )
     );
 
@@ -94,16 +181,13 @@ class VortexFatBankImp (
         val id = UInt(32.W)
         val size = UInt(32.W)
     }
-    
 
-    //<FIXME> assuming this is never full
-    val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, 64, true, false))
-    
     class ReadReqInfo(config: VortexFatBankConfig) extends Bundle {
         val size = UInt(log2Ceil(config.wordSize).W)
         val id   = UInt(config.coreTagWidth.W)
     }
 
+    val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, config.writeInfoReqQSize, true, false))
     val readReqInfo = Wire(new ReadReqInfo(config))
 
     // Translate TL request from Coalescer to requests for VX_cache
@@ -112,16 +196,17 @@ class VortexFatBankImp (
         // coal -> vxCache request on channel A
         val coalToBankA = coalToBankBundle.a;
         
-        coalToBankA.ready := vxCache.io.core_req_ready
-        vxCache.io.core_req_valid := coalToBankA.valid
+        coalToBankA.ready := vxCache.io.core_req_ready && rcvWriteReqInfo.io.enq.ready //not optimal: reads are stalled too while the write-info queue is full
+        vxCache.io.core_req_valid := coalToBankA.valid && rcvWriteReqInfo.io.enq.ready //gate valid like ready, else the cache accepts a beat the coalescer will re-send
 
         // read = 0, write = 1
         vxCache.io.core_req_rw     := !(coalToBankA.bits.opcode === TLMessages.Get)
         //4 is also hardcoded, it should be log2WordSize
-        vxCache.io.core_req_addr   := coalToBankA.bits.address(31, 4)
+        vxCache.io.core_req_addr   := coalToBankA.bits.address(31, log2Ceil(config.wordSize))
         vxCache.io.core_req_byteen := coalToBankA.bits.mask
         vxCache.io.core_req_data   := coalToBankA.bits.data
         
+        //combine size and tag field into one big wire, to put into vxCache.io.core_req_tag
         readReqInfo.id   := coalToBankA.bits.source
         readReqInfo.size := coalToBankA.bits.size
         vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)
@@ -135,9 +220,7 @@ class VortexFatBankImp (
         // if we don't send ack, the coalescer will run out of IDs, and can't generate new request
 
         // for read request, we send AckData when the FatBank has a valid output
-        // for write request, we can immediate Ack on the next clock cycle (not the same clock cycle, otherwise critical path too long)
-        // It's possible that on the same cycle, we need to do both "AckData" and "Ack"
-        //    in this case, we always priorize "Ack", this makes the design easier
+        // for write request, we can ack whenever we have a valid entry in rcvWriteReqInfo Queue
 
         //I think this just shows the flaws of Tilelink. CPU never waits for an Ack upon regular write request
         //the Core should unconditionally move forward after every regular write request
@@ -150,39 +233,29 @@ class VortexFatBankImp (
         rcvWriteReqInfo.io.enq.bits.id   := coalToBankA.bits.source
         rcvWriteReqInfo.io.enq.bits.size := coalToBankA.bits.size
 
-
-        rcvWriteReqInfo.io.deq.ready := coalToBankD.ready
+        //prioritize Ack for Read: only dequeue from rcvWriteReqInfo when there is no read response waiting to be acked
+        //(vxCache.io.core_rsp_valid means a read AccessAckData is pending)
+        rcvWriteReqInfo.io.deq.ready := coalToBankD.ready && ~vxCache.io.core_rsp_valid 
         
-        //if we "need" to do Ack
-        //we unconditionally set the vxCache.ready to be false, so it gets delayed
-        vxCache.io.core_rsp_ready := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            false.B,
-            coalToBankD.ready
-        )
-
-        coalToBankD.valid := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            true.B,
-            vxCache.io.core_rsp_valid
-        )
+        vxCache.io.core_rsp_ready := coalToBankD.ready
+        coalToBankD.valid := vxCache.io.core_rsp_valid || rcvWriteReqInfo.io.deq.valid
 
         coalToBankD.bits.source := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            rcvWriteReqInfo.io.deq.bits.id,
-            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id
+            vxCache.io.core_rsp_valid,
+            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id,
+            rcvWriteReqInfo.io.deq.bits.id
         )
 
         coalToBankD.bits.opcode  := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            TLMessages.AccessAck,
-            TLMessages.AccessAckData
+            vxCache.io.core_rsp_valid,
+            TLMessages.AccessAckData,
+            TLMessages.AccessAck
         )
 
         coalToBankD.bits.size := Mux(
-            rcvWriteReqInfo.io.deq.valid,
-            rcvWriteReqInfo.io.deq.bits.size,
-            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size
+            vxCache.io.core_rsp_valid,
+            vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size,
+            rcvWriteReqInfo.io.deq.bits.size
         )
 
         coalToBankD.bits.param   := 0.U
@@ -190,7 +263,7 @@ class VortexFatBankImp (
         coalToBankD.bits.denied  := false.B
         coalToBankD.bits.corrupt := false.B
 
-        coalToBankD.bits.data   := vxCache.io.core_rsp_data
+        coalToBankD.bits.data    := vxCache.io.core_rsp_data
     }
 
 
@@ -200,10 +273,10 @@ class VortexFatBankImp (
     //vx_cache can indeed guarantee that all active read operation has unique ID
     //However, since the cache is write_through, so it can't ensure unique ID for write operation
     //Therefore, we need our own internal source_ID generator for all write operation
-    //
-    //Now, we allocate id range: 0-15 for all write operation
-    //                    and    16-> above for read operation
-    val sourceGen = Module( new SourceGenerator(log2Ceil(16), ignoreInUse = false))
+    
+
+    val sourceGen = Module( new NewSourceGenerator(log2Ceil(config.mshrSize), metadata = Some(UInt(32.W)), ignoreInUse = false))
+    
 
 
     // Translate VX_cache mem request to a TL request to be sent to L2
@@ -215,13 +288,9 @@ class VortexFatBankImp (
 
         //Read Operation is ready as long as downstream L2 is ready
 
-        vxCache.io.mem_req_ready := vxCacheToL2A.ready
+        vxCache.io.mem_req_ready := vxCacheToL2A.ready && sourceGen.io.id.valid // valid is gated on a free source id, so ready must be too, else the cache sees a fire that L2 never sees
 
-        vxCacheToL2A.valid := Mux(
-            vxCache.io.mem_req_rw,
-            vxCache.io.mem_req_valid && sourceGen.io.id.valid,
-            vxCache.io.mem_req_valid
-        )
+        vxCacheToL2A.valid := vxCache.io.mem_req_valid && sourceGen.io.id.valid
 
         vxCacheToL2A.bits.opcode := Mux(
             vxCache.io.mem_req_rw, 
@@ -230,21 +299,19 @@ class VortexFatBankImp (
         )
 
         vxCacheToL2A.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W))
+
         vxCacheToL2A.bits.mask    := Mux(
             vxCache.io.mem_req_rw, 
             vxCache.io.mem_req_byteen,
             0xFFFF.U
         )
+
         vxCacheToL2A.bits.data    := vxCache.io.mem_req_data
 
-        
-        vxCacheToL2A.bits.source  := Mux(
-            vxCache.io.mem_req_rw,
-            sourceGen.io.id.bits,
-            vxCache.io.mem_req_tag + 16.U
-        )
-        //mark current source_id as in-use
-        sourceGen.io.gen := vxCache.io.mem_req_rw && vxCacheToL2A.ready && vxCacheToL2A.valid
+        vxCacheToL2A.bits.source  := sourceGen.io.id.bits
+
+        sourceGen.io.gen  := vxCacheToL2A.ready && vxCacheToL2A.valid
+        sourceGen.io.meta := vxCache.io.mem_req_tag //save the old read id
 
         vxCacheToL2A.bits.param   := 0.U
         vxCacheToL2A.bits.size    := 4.U
@@ -257,10 +324,11 @@ class VortexFatBankImp (
         vxCacheToL2D.ready := vxCache.io.mem_rsp_ready
 
         vxCache.io.mem_rsp_valid := vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAckData
-        vxCache.io.mem_rsp_tag   := vxCacheToL2D.bits.source - 16.U // -16 for read resp, we can safely do this, since write-ack wouldn't pass through
-        vxCache.io.mem_rsp_data := vxCacheToL2D.bits.data
+        vxCache.io.mem_rsp_tag   := sourceGen.io.peek
+        vxCache.io.mem_rsp_data  := vxCacheToL2D.bits.data
 
-        sourceGen.io.reclaim.valid := vxCacheToL2D.ready && vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAck
+        // all ids need to be reclaimed, so reclaim on every D-channel response (read or write)
+        sourceGen.io.reclaim.valid := vxCacheToL2D.ready && vxCacheToL2D.valid
         sourceGen.io.reclaim.bits := vxCacheToL2D.bits.source
 
     }