From 127d7613e1414e374b525188226a1b6bb5de7c32 Mon Sep 17 00:00:00 2001 From: joshua Date: Mon, 9 Oct 2023 14:49:57 -0700 Subject: [PATCH 1/3] add vortex fat bank + test (not compiling atm) --- src/main/resources/vsrc/vortex | 2 +- src/main/scala/rocket/VortexFatBank.scala | 273 ++++++++++++++++++++++ 2 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 src/main/scala/rocket/VortexFatBank.scala diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 1081f8a..0ddf415 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 1081f8a485565d3a6db0cfa416a5bd9869185f1f +Subproject commit 0ddf4152ee0d4d69e5bd42213f5e51bda4e74ac1 diff --git a/src/main/scala/rocket/VortexFatBank.scala b/src/main/scala/rocket/VortexFatBank.scala new file mode 100644 index 0000000..c1b6709 --- /dev/null +++ b/src/main/scala/rocket/VortexFatBank.scala @@ -0,0 +1,273 @@ +package rocket + +import chisel3._ +import chisel3.util._ +import chisel3.experimental._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.tilelink._ + +// VortexTile has dmemNodes, imemNodes + +class VortexFatBank ( + +) (implicit p: Parameters) extends LazyModule { + + val clientParam = Seq(TLMasterPortParameters.v1( + clients = Seq( + TLMasterParameters.v1( + name = "VortexFatBank", + sourceId = IdRange(0, 1 << 14), // FIXME: magic number + supportsProbe = TransferSizes(1, 16), + supportsGet = TransferSizes(1, 16), + supportsPutFull = TransferSizes(1, 16), + supportsPutPartial = TransferSizes(1, 16) + ) + ) + )) + + val managerParam = Seq(TLSlavePortParameters.v1( + beatBytes = 16, + managers = Seq( + TLSlaveParameters.v1( + address = Seq(AddressSet(0x0, 0xffffff)), // intercept all requests (it does not like all F for some reason) + regionType = RegionType.IDEMPOTENT, // idk what this does + executable = false, + supportsGet = TransferSizes(1, 16), + 
supportsPutPartial = TransferSizes(1, 16), + supportsPutFull = TransferSizes(1, 16), + fifoId = Some(0) + ) + ) + )) + + val coalToBankNode = TLManagerNode(managerParam) + val bankToL2Node = TLClientNode(clientParam) + lazy val module = new VortexFatBankImp(this); +} + +class VortexFatBankImp ( + outer: VortexFatBank +) extends LazyModuleImp(outer) { + val bank = Module(new VX_Cache()); + + bank.io.clk := clock + bank.io.reset := reset + + // Translate TL request from Coalescer to requests for VX_Cache + def TLReq2VXReq = { + val (coalToBankBundle, _) = outer.coalToBankNode.in.head + // coal -> bank request on channel A + val coalToBankA = coalToBankBundle.a; + + coalToBankA.ready := bank.io.core_req_ready + bank.io.core_req_valid := coalToBankA.valid + + // read = 0, write = 1 + bank.io.core_req_rw := !(coalToBankA.bits.opcode === TLMessages.Get) + bank.io.core_req_addr := coalToBankA.bits.address(31, 4) + bank.io.core_req_byteen := coalToBankA.bits.mask + bank.io.core_req_data := coalToBankA.bits.data + bank.io.core_req_tag := coalToBankA.bits.source + + // we ignore param, size, corrupt fields + + // bank -> coal response on channel D + val coalToBankD = coalToBankBundle.d; + + bank.io.core_rsp_ready := coalToBankD.ready + coalToBankD.valid := bank.io.core_rsp_valid + + // Cache is not TL compliant since we don't generate AccessAcks on successful writes + // but VortexCore is not expecting them anyways + coalToBankD.bits.opcode := TLMessages.AccessAckData + coalToBankD.bits.param := 0.U + coalToBankD.bits.size := 4.U + coalToBankD.bits.sink := 0.U + coalToBankD.bits.denied := false.B + coalToBankD.bits.corrupt := false.B + + coalToBankD.bits.data := bank.io.core_rsp_data + coalToBankD.bits.source := bank.io.core_rsp_tag + } + + // Translate VX_Cache mem request to a TL request to be sent to L2 + def VXReq2TLReq = { + val (bankToL2Bundle, _) = outer.bankToL2Node.out.head + // bank -> L2 request on channel A + val bankToL2A = bankToL2Bundle.a; + + + 
bank.io.mem_req_ready := bankToL2A.ready + bankToL2A.valid := bank.io.mem_req_valid + + bankToL2A.bits.opcode := Mux( + bank.io.mem_req_rw, + Mux(bank.io.mem_req_byteen.andR, TLMessages.PutFullData, TLMessages.PutPartialData), + TLMessages.Get + ) + bankToL2A.bits.address := Cat(bank.io.mem_req_addr, 0.U(4.W)) + bankToL2A.bits.mask := bank.io.mem_req_byteen + bankToL2A.bits.data := bank.io.mem_req_data + bankToL2A.bits.source := bank.io.mem_req_tag + + bankToL2A.bits.param := 0.U + bankToL2A.bits.size := 4.U + bankToL2A.bits.corrupt := false.B + + // we ignore param, size, corrupt fields + + // L2 -> bank response on channel D + val bankToL2D = bankToL2Bundle.d; + + bankToL2D.ready := bank.io.mem_rsp_ready + bank.io.mem_rsp_valid := (bankToL2D.valid && (bankToL2D.bits.opcode === TLMessages.AccessAckData)) // need to ignore AccessAcks + + bank.io.mem_rsp_tag := bankToL2D.bits.source + bank.io.mem_rsp_data := bankToL2D.bits.data + + } + + TLReq2VXReq + VXReq2TLReq + +} + +class VX_Cache ( + CACHE_ID: Int = 0, + CACHE_SIZE: Int = 16384, + CACHE_LINE_SIZE: Int = 16, + NUM_PORTS: Int = 1, + WORD_SIZE: Int = 16, // hack - one "word" is enough to satisfy all 4 warps after decoalescing. + CREQ_SIZE: Int = 0, + CRSQ_SIZE: Int = 2, + MSHR_SIZE: Int = 8, + MRSQ_SIZE: Int = 0, + MREQ_SIZE: Int = 4, + WRITE_ENABLE: Int = 1, + CORE_TAG_WIDTH: Int = 10, // source ID ranges from 0 to 1 << 10 + CORE_TAG_ID_BITS: Int = 5, // no idea what this is, just match it with default L1 dcache + BANK_ADDR_OFFSET: Int = 0, + NC_ENABLE: Int = 1, // Unsure what this does, but it's elaborated as 1 in default L1 setup so hopefully this is ok + WORD_ADDR_WIDTH: Int = 28, // 16 byte "word" = 4 bits + MEM_TAG_WIDTH: Int = 14, // Elaborated value is also completely different from (32 - log2Ceil(CACHE_LINE_SIZE)). 
This should match with sourceIds on client node associated with this cache + MEM_ADDR_WIDTH: Int = 28 // 16 byte cache line = 4 bits +) extends BlackBox ( + Map( + "CACHE_ID" -> CACHE_ID, + "NUM_REQS" -> 1, //Force NUM_REQS to be 1, we use their Cache as our individual Bank + "CACHE_SIZE" -> CACHE_SIZE, + "CACHE_LINE_SIZE" -> CACHE_LINE_SIZE, + "NUM_PORTS" -> NUM_PORTS, + "WORD_SIZE" -> WORD_SIZE, + "CREQ_SIZE" -> CREQ_SIZE, + "CRSQ_SIZE" -> CRSQ_SIZE, + "MSHR_SIZE" -> MSHR_SIZE, + "MRSQ_SIZE" -> MRSQ_SIZE, + "MREQ_SIZE" -> MREQ_SIZE, + "WRITE_ENABLE" -> WRITE_ENABLE, + "CORE_TAG_WIDTH" -> CORE_TAG_WIDTH, + "CORE_TAG_ID_BITS" -> CORE_TAG_ID_BITS, + "MEM_TAG_WIDTH" -> MEM_TAG_WIDTH, + "BANK_ADDR_OFFSET" -> BANK_ADDR_OFFSET, + "NC_ENABLE" -> NC_ENABLE, + ) +) with HasBlackBoxResource { + + val io = IO(new Bundle { + val clk = Input(Clock()) + val reset = Input(Reset()) + + // We should be able to turn the following into TileLink easily + + // CACHE <> CORE + val core_req_valid = Input(Bool()) + val core_req_rw = Input(Bool()) + val core_req_addr = Input(UInt(WORD_ADDR_WIDTH.W)) + val core_req_byteen = Input(UInt(WORD_SIZE.W)) + val core_req_data = Input(UInt((WORD_SIZE * 8).W)) + val core_req_tag = Input(UInt(CORE_TAG_WIDTH.W)) + val core_req_ready = Output(Bool()) + + val core_rsp_valid = Output(Bool()) // 1 bit wide + val core_rsp_tmask = Output(Bool()) // 1 bit wide, probably can ignore (check waveform) + val core_rsp_data = Output(UInt((WORD_SIZE * 8).W)) + val core_rsp_tag = Output(UInt(CORE_TAG_WIDTH.W)) + val core_rsp_ready = Input(Bool()) + + // CACHE <> L2 + val mem_req_valid = Output(Bool()) + val mem_req_rw = Output(Bool()) + val mem_req_byteen = Output(UInt(CACHE_LINE_SIZE.W)) + val mem_req_addr = Output(UInt(MEM_ADDR_WIDTH.W)) + val mem_req_data = Output(UInt((CACHE_LINE_SIZE * 8).W)) + val mem_req_tag = Output(UInt(MEM_TAG_WIDTH.W)) + val mem_req_ready = Input(Bool()) + + val mem_rsp_valid = Input(Bool()) + val mem_rsp_data = Input(UInt((CACHE_LINE_SIZE 
* 8).W)) + val mem_rsp_tag = Input(UInt(MEM_TAG_WIDTH.W)) + val mem_rsp_ready = Output(Bool()) + }) + + addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_define.vh") + // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_mux.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_lzc.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_fifo_queue.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_scan.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_find_first.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_multiplier.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_bits_remove.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_register.sv") + // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_mux.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_encoder.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_reset_relay.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_popcount.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_bits_insert.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_skid_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_fixed_arbiter.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_shift_register.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_index_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_encoder.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_matrix_arbiter.sv") + // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_divider.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_dp_ram.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_elastic_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_rr_arbiter.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_arbiter.sv") + // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_bypass_buffer.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_sp_ram.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_demux.sv") + + // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_index_queue.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_serial_div.sv") + 
addResource("/vsrc/vortex/hw/rtl/libs/VX_fair_arbiter.sv") + + addResource("/vsrc/vortex/hw/rtl/VX_define.vh") + addResource("/vsrc/vortex/hw/VX_config.h") + + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_rsp_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_req_if.sv") + + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_cache_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_memsys_if.sv") + + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_rsp_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_req_if.sv") + + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_rsp_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_req_if.sv") + + addResource("/vsrc/vortex/hw/rtl/cache/VX_shared_mem.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_core_rsp_merge.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_tag_access.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_core_req_bank_sel.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_bank.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_data_access.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_flush_ctrl.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_nc_bypass.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_miss_resrv.sv") + addResource("/vsrc/vortex/hw/rtl/cache/VX_cache.sv") + +} From e50903ed42833aebc8aa967f54a8d91d4cd100ef Mon Sep 17 00:00:00 2001 From: Vamber Yang Date: Mon, 16 Oct 2023 10:31:31 -0700 Subject: [PATCH 2/3] VX_FatBank runs in SoC Config with Coalescer till termination Issues addressed: 1. FatBank ack to downstream coalescer with the correct size on ChannelD 2. FatBank ack to downstream coalescer immediately after W Req 3. FatBank generates unique ID for W Req to L2 4. Allows coalescer to config max Coal to L1 ReadSize at compile time Ongoing issues: 1. Magic Number 2. Verification 3. 
Multi-Bank Integration --- src/main/scala/rocket/VortexFatBank.scala | 359 ++++++++++++++---- .../scala/tilelink/CanHaveMemtraceCore.scala | 35 +- src/main/scala/tilelink/Coalescing.scala | 1 + 3 files changed, 313 insertions(+), 82 deletions(-) diff --git a/src/main/scala/rocket/VortexFatBank.scala b/src/main/scala/rocket/VortexFatBank.scala index c1b6709..9481af8 100644 --- a/src/main/scala/rocket/VortexFatBank.scala +++ b/src/main/scala/rocket/VortexFatBank.scala @@ -1,4 +1,5 @@ -package rocket +//package freechips.rocketchip.rocket +package freechips.rocketchip.tilelink import chisel3._ import chisel3.util._ @@ -6,125 +7,261 @@ import chisel3.experimental._ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tilelink._ +import org.chipsalliance.cde.config.{Parameters, Field} + // VortexTile has dmemNodes, imemNodes -class VortexFatBank ( +//Param and Key are used during SoC Generation -) (implicit p: Parameters) extends LazyModule { +case class VortexFatBankParam(wordSize: Int = 16, busWidthInBytes: Int = 8) +case object VortexFatBankKey extends Field[Option[VortexFatBankConfig]](None /*default*/) + +case class VortexFatBankConfig( + wordSize: Int, //This is the read/write granularity of the L1 cache + cacheLineSize: Int, + coreTagWidth: Int, +) { + def coreTagPlusSizeWidth: Int = { + log2Ceil(wordSize) + coreTagWidth + } +} + +object defaultFatBankConfig extends VortexFatBankConfig( + wordSize = 16, + cacheLineSize = 16, + coreTagWidth = 8, +) + + +class VortexFatBank (config: VortexFatBankConfig) (implicit p: Parameters) extends LazyModule { val clientParam = Seq(TLMasterPortParameters.v1( clients = Seq( TLMasterParameters.v1( name = "VortexFatBank", sourceId = IdRange(0, 1 << 14), // FIXME: magic number - supportsProbe = TransferSizes(1, 16), - supportsGet = TransferSizes(1, 16), - supportsPutFull = TransferSizes(1, 16), - supportsPutPartial = TransferSizes(1, 16) + supportsProbe = 
TransferSizes(1, config.wordSize), + supportsGet = TransferSizes(1, config.wordSize), + supportsPutFull = TransferSizes(1, config.wordSize), + supportsPutPartial = TransferSizes(1, config.wordSize) ) ) )) val managerParam = Seq(TLSlavePortParameters.v1( - beatBytes = 16, + beatBytes = config.wordSize, managers = Seq( TLSlaveParameters.v1( - address = Seq(AddressSet(0x0, 0xffffff)), // intercept all requests (it does not like all F for some reason) + address = Seq(AddressSet(0x80000000L, 0xfffffff)), // 0x80000000 -> 0x90000000 are possible address tracer can emit regionType = RegionType.IDEMPOTENT, // idk what this does executable = false, - supportsGet = TransferSizes(1, 16), - supportsPutPartial = TransferSizes(1, 16), - supportsPutFull = TransferSizes(1, 16), + supportsGet = TransferSizes(1, config.wordSize), + supportsPutPartial = TransferSizes(1, config.wordSize), + supportsPutFull = TransferSizes(1, config.wordSize), fifoId = Some(0) ) ) )) - val coalToBankNode = TLManagerNode(managerParam) - val bankToL2Node = TLClientNode(clientParam) - lazy val module = new VortexFatBankImp(this); + val coalToVxCacheNode = TLManagerNode(managerParam) + val vxCacheToL2Node = TLIdentityNode() + val vxCacheFetchNode = TLClientNode(clientParam) + + //We need this widthWidget here, because whenever the fatBank is performing + //read and write to Mem, it must have the illusion that dataWidth is as big as + //as its cacheline size + vxCacheToL2Node := TLWidthWidget(config.cacheLineSize) := vxCacheFetchNode + lazy val module = new VortexFatBankImp(this, config); } class VortexFatBankImp ( - outer: VortexFatBank + outer: VortexFatBank, + config: VortexFatBankConfig ) extends LazyModuleImp(outer) { - val bank = Module(new VX_Cache()); - bank.io.clk := clock - bank.io.reset := reset + val vxCache = Module(new VX_cache( + WORD_SIZE=config.wordSize, + CACHE_LINE_SIZE=config.cacheLineSize, + CORE_TAG_WIDTH= config.coreTagPlusSizeWidth + ) + ); - // Translate TL request from Coalescer to 
requests for VX_Cache + vxCache.io.clk := clock + vxCache.io.reset := reset + + class WriteReqInfo extends Bundle { + val id = UInt(32.W) + val size = UInt(32.W) + } + + + // assuming this is never full + val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, 64, true, false)) + + class ReadReqInfo(config: VortexFatBankConfig) extends Bundle { + val size = UInt(log2Ceil(config.wordSize).W) + val id = UInt(config.coreTagWidth.W) + } + + val readReqInfo = Wire(new ReadReqInfo(config)) + + // Translate TL request from Coalescer to requests for VX_cache def TLReq2VXReq = { - val (coalToBankBundle, _) = outer.coalToBankNode.in.head - // coal -> bank request on channel A + val (coalToBankBundle, _) = outer.coalToVxCacheNode.in.head + // coal -> vxCache request on channel A val coalToBankA = coalToBankBundle.a; - coalToBankA.ready := bank.io.core_req_ready - bank.io.core_req_valid := coalToBankA.valid + coalToBankA.ready := vxCache.io.core_req_ready + vxCache.io.core_req_valid := coalToBankA.valid // read = 0, write = 1 - bank.io.core_req_rw := !(coalToBankA.bits.opcode === TLMessages.Get) - bank.io.core_req_addr := coalToBankA.bits.address(31, 4) - bank.io.core_req_byteen := coalToBankA.bits.mask - bank.io.core_req_data := coalToBankA.bits.data - bank.io.core_req_tag := coalToBankA.bits.source - + vxCache.io.core_req_rw := !(coalToBankA.bits.opcode === TLMessages.Get) + //4 is also hardcoded, it should be log2WordSize + vxCache.io.core_req_addr := coalToBankA.bits.address(31, 4) + vxCache.io.core_req_byteen := coalToBankA.bits.mask + vxCache.io.core_req_data := coalToBankA.bits.data + + readReqInfo.id := coalToBankA.bits.source + readReqInfo.size := coalToBankA.bits.size + vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag) + + // we ignore param, size, corrupt fields - // bank -> coal response on channel D + // vxCache -> coal response on channel D + // ok ... 
this part is a little tricky, the downstream coalescer requires the L1 cache + // to send ack and dataAck, this is how coalescer knows when an inflight ID has retired + // if we don't send ack, the coalescer will run out of IDs, and can't generate new request + + // for read request, we send AckData when the FatBank has a valid output + // for write request, we can immediate Ack on the next clock cycle (not the same clock cycle, otherwise critical path too long) + // It's possible that on the same cycle, we need to do both "AckData" and "Ack" + // in this case, we always priorize "Ack", this makes the design easier + + //I think this just shows the flaws of Tilelink. CPU never waits for an Ack upon regular write request + //the Core should unconditionally move forward after every regular write request + val coalToBankD = coalToBankBundle.d; - bank.io.core_rsp_ready := coalToBankD.ready - coalToBankD.valid := bank.io.core_rsp_valid - // Cache is not TL compliant since we don't generate AccessAcks on successful writes - // but VortexCore is not expecting them anyways - coalToBankD.bits.opcode := TLMessages.AccessAckData + // currently assuming below buffer is never full + rcvWriteReqInfo.io.enq.valid := !(coalToBankA.bits.opcode === TLMessages.Get) && coalToBankA.valid && coalToBankA.ready + rcvWriteReqInfo.io.enq.bits.id := coalToBankA.bits.source + rcvWriteReqInfo.io.enq.bits.size := coalToBankA.bits.size + + + rcvWriteReqInfo.io.deq.ready := coalToBankD.ready + + //if we "need" to do Ack + //we unconditionally set the vxCache.ready to be false, so it gets delayed + vxCache.io.core_rsp_ready := Mux( + rcvWriteReqInfo.io.deq.valid, + false.B, + coalToBankD.ready + ) + + coalToBankD.valid := Mux( + rcvWriteReqInfo.io.deq.valid, + true.B, + vxCache.io.core_rsp_valid + ) + + coalToBankD.bits.source := Mux( + rcvWriteReqInfo.io.deq.valid, + rcvWriteReqInfo.io.deq.bits.id, + vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id + ) + + coalToBankD.bits.opcode := Mux( + 
rcvWriteReqInfo.io.deq.valid, + TLMessages.AccessAck, + TLMessages.AccessAckData + ) + + coalToBankD.bits.size := Mux( + rcvWriteReqInfo.io.deq.valid, + rcvWriteReqInfo.io.deq.bits.size, + vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size + ) + coalToBankD.bits.param := 0.U - coalToBankD.bits.size := 4.U coalToBankD.bits.sink := 0.U coalToBankD.bits.denied := false.B coalToBankD.bits.corrupt := false.B - coalToBankD.bits.data := bank.io.core_rsp_data - coalToBankD.bits.source := bank.io.core_rsp_tag + coalToBankD.bits.data := vxCache.io.core_rsp_data } - // Translate VX_Cache mem request to a TL request to be sent to L2 + + //Using Hansung's Source Generator + //Why do we need to do this, what is the issue ? + //Tilelink requires all inflight Read and Write Request to have a unique source_ID + //vx_cache can indeed guarantee that all active read operation has unique ID + //However, since the cache is write_through, so it can't ensure unique ID for write operation + //Therefore, we need our own internal source_ID generator for all write operation + // + //Now, we allocate id range: 0-15 for all write operation + // and 16-> above for read operation + val sourceGen = Module( new SourceGenerator(log2Ceil(16), ignoreInUse = false)) + + + // Translate VX_cache mem request to a TL request to be sent to L2 def VXReq2TLReq = { - val (bankToL2Bundle, _) = outer.bankToL2Node.out.head - // bank -> L2 request on channel A - val bankToL2A = bankToL2Bundle.a; + val (vxCacheToL2Bundle, _) = outer.vxCacheFetchNode.out.head + // vxCache -> L2 request on channel A + val vxCacheToL2A = vxCacheToL2Bundle.a; - bank.io.mem_req_ready := bankToL2A.ready - bankToL2A.valid := bank.io.mem_req_valid + //Read Operation is ready as long as downstream L2 is ready - bankToL2A.bits.opcode := Mux( - bank.io.mem_req_rw, - Mux(bank.io.mem_req_byteen.andR, TLMessages.PutFullData, TLMessages.PutPartialData), + vxCache.io.mem_req_ready := vxCacheToL2A.ready + + vxCacheToL2A.valid := Mux( + 
vxCache.io.mem_req_rw, + vxCache.io.mem_req_valid && sourceGen.io.id.valid, + vxCache.io.mem_req_valid + ) + + vxCacheToL2A.bits.opcode := Mux( + vxCache.io.mem_req_rw, + Mux(vxCache.io.mem_req_byteen.andR, TLMessages.PutFullData, TLMessages.PutPartialData), TLMessages.Get ) - bankToL2A.bits.address := Cat(bank.io.mem_req_addr, 0.U(4.W)) - bankToL2A.bits.mask := bank.io.mem_req_byteen - bankToL2A.bits.data := bank.io.mem_req_data - bankToL2A.bits.source := bank.io.mem_req_tag - bankToL2A.bits.param := 0.U - bankToL2A.bits.size := 4.U - bankToL2A.bits.corrupt := false.B + vxCacheToL2A.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W)) + vxCacheToL2A.bits.mask := Mux( + vxCache.io.mem_req_rw, + vxCache.io.mem_req_byteen, + 0xFFFF.U + ) + vxCacheToL2A.bits.data := vxCache.io.mem_req_data + + + vxCacheToL2A.bits.source := Mux( + vxCache.io.mem_req_rw, + sourceGen.io.id.bits, + vxCache.io.mem_req_tag + 16.U + ) + //mark current source_id as in-use + sourceGen.io.gen := vxCache.io.mem_req_rw && vxCacheToL2A.ready && vxCacheToL2A.valid + + vxCacheToL2A.bits.param := 0.U + vxCacheToL2A.bits.size := 4.U + vxCacheToL2A.bits.corrupt := false.B // we ignore param, size, corrupt fields - // L2 -> bank response on channel D - val bankToL2D = bankToL2Bundle.d; + // L2 -> vxCache response on channel D + val vxCacheToL2D = vxCacheToL2Bundle.d; + vxCacheToL2D.ready := vxCache.io.mem_rsp_ready - bankToL2D.ready := bank.io.mem_rsp_ready - bank.io.mem_rsp_valid := (bankToL2D.valid && (bankToL2D.bits.opcode === TLMessages.AccessAckData)) // need to ignore AccessAcks + vxCache.io.mem_rsp_valid := vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAckData + vxCache.io.mem_rsp_tag := vxCacheToL2D.bits.source - 16.U // -16 for read resp, we can safely do this, since write-ack wouldn't pass through + vxCache.io.mem_rsp_data := vxCacheToL2D.bits.data - bank.io.mem_rsp_tag := bankToL2D.bits.source - bank.io.mem_rsp_data := bankToL2D.bits.data + sourceGen.io.reclaim.valid 
:= vxCacheToL2D.ready && vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAck + sourceGen.io.reclaim.bits := vxCacheToL2D.bits.source } @@ -133,7 +270,7 @@ class VortexFatBankImp ( } -class VX_Cache ( +class VX_cache ( CACHE_ID: Int = 0, CACHE_SIZE: Int = 16384, CACHE_LINE_SIZE: Int = 16, @@ -145,10 +282,10 @@ class VX_Cache ( MRSQ_SIZE: Int = 0, MREQ_SIZE: Int = 4, WRITE_ENABLE: Int = 1, - CORE_TAG_WIDTH: Int = 10, // source ID ranges from 0 to 1 << 10 + CORE_TAG_WIDTH: Int = 10, // source ID ranges from 0 to 1 << 10, we need to allocate upper bits to save size CORE_TAG_ID_BITS: Int = 5, // no idea what this is, just match it with default L1 dcache BANK_ADDR_OFFSET: Int = 0, - NC_ENABLE: Int = 1, // Unsure what this does, but it's elaborated as 1 in default L1 setup so hopefully this is ok + NC_ENABLE: Int = 0, //NC_ENABLE=1 means the cache becomes a passthrough WORD_ADDR_WIDTH: Int = 28, // 16 byte "word" = 4 bits MEM_TAG_WIDTH: Int = 14, // Elaborated value is also completely different from (32 - log2Ceil(CACHE_LINE_SIZE)). 
This should match with sourceIds on client node associated with this cache MEM_ADDR_WIDTH: Int = 28 // 16 byte cache line = 4 bits @@ -210,8 +347,33 @@ class VX_Cache ( val mem_rsp_ready = Output(Bool()) }) + + addResource("/vsrc/vortex/hw/rtl/VX_dispatch.sv") + addResource("/vsrc/vortex/hw/rtl/VX_issue.sv") addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_define.vh") - // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_mux.sv") + addResource("/vsrc/vortex/hw/rtl/VX_warp_sched.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_sat.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_stride.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_lerp.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_addr.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_mem.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_format.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_sampler.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_unit.sv") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_define.vh") + addResource("/vsrc/vortex/hw/rtl/tex_unit/VX_tex_wrap.sv") + addResource("/vsrc/vortex/hw/rtl/VX_scope.vh") + addResource("/vsrc/vortex/hw/rtl/VX_fpu_unit.sv") + addResource("/vsrc/vortex/hw/rtl/VX_scoreboard.sv") + addResource("/vsrc/vortex/hw/rtl/VX_writeback.sv") + addResource("/vsrc/vortex/hw/rtl/VX_muldiv.sv") + addResource("/vsrc/vortex/hw/rtl/VX_decode.sv") + addResource("/vsrc/vortex/hw/rtl/VX_ibuffer.sv") + addResource("/vsrc/vortex/hw/rtl/VX_icache_stage.sv") + addResource("/vsrc/vortex/hw/rtl/VX_gpu_unit.sv") + addResource("/vsrc/vortex/hw/rtl/VX_trace_instr.vh") + addResource("/vsrc/vortex/hw/rtl/VX_gpu_types.vh") + addResource("/vsrc/vortex/hw/rtl/VX_config.vh") addResource("/vsrc/vortex/hw/rtl/libs/VX_lzc.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_fifo_queue.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_scan.sv") @@ -219,7 +381,6 @@ class VX_Cache ( addResource("/vsrc/vortex/hw/rtl/libs/VX_multiplier.sv") 
addResource("/vsrc/vortex/hw/rtl/libs/VX_bits_remove.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_pipe_register.sv") - // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_mux.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_priority_encoder.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_reset_relay.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_popcount.sv") @@ -230,36 +391,80 @@ class VX_Cache ( addResource("/vsrc/vortex/hw/rtl/libs/VX_index_buffer.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_onehot_encoder.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_matrix_arbiter.sv") - // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_divider.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_dp_ram.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_elastic_buffer.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_rr_arbiter.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_arbiter.sv") - // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_bypass_buffer.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_sp_ram.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_stream_demux.sv") - - // unused addResource("/vsrc/vortex/hw/rtl/libs/VX_index_queue.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_serial_div.sv") addResource("/vsrc/vortex/hw/rtl/libs/VX_fair_arbiter.sv") - + addResource("/vsrc/vortex/hw/rtl/libs/VX_pending_size.sv") addResource("/vsrc/vortex/hw/rtl/VX_define.vh") + addResource("/vsrc/vortex/hw/rtl/VX_csr_data.sv") + addResource("/vsrc/vortex/hw/rtl/VX_cache_arb.sv") + addResource("/vsrc/vortex/hw/rtl/VX_ipdom_stack.sv") + addResource("/vsrc/vortex/hw/rtl/VX_gpr_stage.sv") + addResource("/vsrc/vortex/hw/rtl/VX_execute.sv") + addResource("/vsrc/vortex/hw/rtl/VX_fetch.sv") + addResource("/vsrc/vortex/hw/rtl/VX_alu_unit.sv") + addResource("/vsrc/vortex/hw/rtl/VX_platform.vh") + addResource("/vsrc/vortex/hw/rtl/VX_commit.sv") + addResource("/vsrc/vortex/hw/rtl/VX_pipeline.sv") + addResource("/vsrc/vortex/hw/rtl/VX_lsu_unit.sv") + 
addResource("/vsrc/vortex/hw/rtl/VX_csr_unit.sv") addResource("/vsrc/vortex/hw/VX_config.h") - + addResource("/vsrc/vortex/sim/common/rvfloats.h") + addResource("/vsrc/vortex/sim/common/rvfloats.cpp") + addResource("/csrc/softfloat/include/internals.h") + addResource("/csrc/softfloat/include/primitives.h") + addResource("/csrc/softfloat/include/primitiveTypes.h") + addResource("/csrc/softfloat/include/softfloat.h") + addResource("/csrc/softfloat/include/softfloat_types.h") + addResource("/csrc/softfloat/RISCV/specialize.h") + addResource("/vsrc/vortex/hw/dpi/float_dpi.cpp") + addResource("/vsrc/vortex/hw/dpi/float_dpi.vh") + addResource("/vsrc/vortex/hw/dpi/util_dpi.cpp") + addResource("/vsrc/vortex/hw/dpi/util_dpi.vh") + addResource("/vsrc/vortex/hw/rtl/fp_cores/VX_fpu_dpi.sv") + addResource("/vsrc/vortex/hw/rtl/fp_cores/VX_fpu_define.vh") + addResource("/vsrc/vortex/hw/rtl/fp_cores/VX_fpu_types.vh") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_rsp_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_req_if.sv") - + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_tex_csr_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_join_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_ifetch_req_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_cache_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_memsys_if.sv") - + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_gpr_req_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_decode_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_writeback_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_gpu_req_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_pipeline_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_gpr_rsp_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_cmt_to_csr_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_csr_to_alu_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_ifetch_rsp_if.sv") + 
addResource("/vsrc/vortex/hw/rtl/interfaces/VX_alu_req_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_csr_req_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_ibuffer_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_branch_ctl_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_dcache_rsp_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_icache_req_if.sv") - + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_lsu_req_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_wstall_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_rsp_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_fpu_to_csr_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_commit_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_tex_req_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_warp_ctl_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_tex_rsp_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_fetch_to_csr_if.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_perf_tex_if.sv") addResource("/vsrc/vortex/hw/rtl/interfaces/VX_mem_req_if.sv") - - addResource("/vsrc/vortex/hw/rtl/cache/VX_shared_mem.sv") + addResource("/vsrc/vortex/hw/rtl/interfaces/VX_fpu_req_if.sv") + //addResource("/vsrc/vortex/hw/rtl/cache/VX_shared_mem.sv") addResource("/vsrc/vortex/hw/rtl/cache/VX_core_rsp_merge.sv") addResource("/vsrc/vortex/hw/rtl/cache/VX_tag_access.sv") addResource("/vsrc/vortex/hw/rtl/cache/VX_core_req_bank_sel.sv") diff --git a/src/main/scala/tilelink/CanHaveMemtraceCore.scala b/src/main/scala/tilelink/CanHaveMemtraceCore.scala index 0bcbd70..51e45c6 100644 --- a/src/main/scala/tilelink/CanHaveMemtraceCore.scala +++ b/src/main/scala/tilelink/CanHaveMemtraceCore.scala @@ -3,6 +3,8 @@ package freechips.rocketchip.tilelink import freechips.rocketchip.diplomacy.LazyModule import freechips.rocketchip.subsystem.BaseSubsystem import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.rocket + // TODO: 
possibly move to somewhere closer to CoalescingUnit // TODO: separate coalescer config from CanHaveMemtraceCore @@ -44,16 +46,39 @@ trait CanHaveMemtraceCore { this: BaseSubsystem => } case None => tracer.node } - val upstream = p(CoalXbarKey) match { + val coalXbar = p(CoalXbarKey) match { case Some(xbarParam) =>{ - val priorityXbar = LazyModule(new CoalescerTLPriortyXBar) - println(s"============ Using Priority XBar for Coalescer Requests ") - priorityXbar.node :=* coalescerNode - priorityXbar.node + val coXbar = LazyModule(new TLXbar) + println(s"============ Using TLXBar for Coalescer Requests ") + coXbar.node :=* coalescerNode + coXbar.node } case None => coalescerNode } + + val vortexBank = p(VortexFatBankKey) match { + case Some(fatBankParam) =>{ + val vx_fatbank = LazyModule(new VortexFatBank(fatBankParam)) + println(s"============ Using Vortex FatBank as L1 ") + vx_fatbank.coalToVxCacheNode :=* coalXbar + vx_fatbank.vxCacheToL2Node + } + case None => coalXbar + } + + //If there is only 1 bank, the code below is useless + val upstream = p(CoalXbarKey) match { + case Some(xbarParam) =>{ + val tileXbar = LazyModule(new TLXbar) + println(s"============ Using TLXBar for L1 Requests ") + tileXbar.node :=* vortexBank + tileXbar.node + } + case None => vortexBank + } + + sbus.coupleFrom(s"gpu-tracer") { _ :=* upstream } } } diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 1b24c20..217049a 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -774,6 +774,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) println(s" numNewSrcIds: ${config.numNewSrcIds}") println(s" reqQueueDepth: ${config.queueDepth}") println(s" respQueueDepth: ${config.respQueueDepth}") + println(s" addressWidth: ${config.addressWidth}") println(s"}") require( From 60a63d4e1116c8c22715b54b74621e3fcc2611c3 Mon Sep 17 00:00:00 2001 From: Vamber Yang Date: Sun, 22 Oct 2023 
17:03:37 -0700
Subject: [PATCH 3/3] FatBank Integration Improvements:
 1. ensure FatBank prioritze Ack read over Ack write to downstream coalescer
 2. Between FatBank and L2, use the new sourceGenerator to allow both Read and Write Reqs sharing the same pool of available src_ids

---
 src/main/scala/rocket/VortexFatBank.scala | 180 +++++++++++++++-------
 1 file changed, 124 insertions(+), 56 deletions(-)

diff --git a/src/main/scala/rocket/VortexFatBank.scala b/src/main/scala/rocket/VortexFatBank.scala
index 9481af8..dddf348 100644
--- a/src/main/scala/rocket/VortexFatBank.scala
+++ b/src/main/scala/rocket/VortexFatBank.scala
@@ -10,6 +10,88 @@ import freechips.rocketchip.tilelink._
 import org.chipsalliance.cde.config.{Parameters, Field}
+
+
+// TODO: delete this NewSourceGenerator when merging with origin/graphics;
+// use the generator already written by hansung in Coalescing.scala instead.
+
+class NewSourceGenerator[T <: Data](
+    sourceWidth: Int,
+    metadata: Option[T] = None,
+    ignoreInUse: Boolean = false
+) extends Module {
+  def getMetadataType = metadata match {
+    case Some(gen) => gen.cloneType
+    case None => UInt(0.W)
+  }
+  val io = IO(new Bundle {
+    val gen = Input(Bool())
+    val reclaim = Input(Valid(UInt(sourceWidth.W)))
+    val id = Output(Valid(UInt(sourceWidth.W)))
+    // The two ports below only carry information when `metadata` is defined.
+    // `meta`: input; when `gen` is asserted together with a valid id, its value
+    // is written into the table row of the id being handed out.
+    // `peek`: output; the metadata stored in the row addressed by
+    // `reclaim.bits`, i.e. what was saved when that id was allocated.
+    // Although these do not use ValidIO, it is safe because any in-flight
+    // response coming back should have allocated a valid entry in the table
+    // when it went out.
+    val meta = Input(getMetadataType)
+    val peek = Output(getMetadataType)
+    // for debugging; high while `outstanding` is non-zero or `gen` is asserted,
+    // i.e. there is at least one request that hasn't been reclaimed yet
+    val inflight = Output(Bool())
+  })
+  val head = RegInit(UInt(sourceWidth.W), 0.U)
+  head := Mux(io.gen, head + 1.U, head)
+
+  val outstanding = RegInit(UInt((sourceWidth + 1).W), 0.U)
+  io.inflight := (outstanding > 0.U) || io.gen
+
+  val numSourceId = 1 << sourceWidth
+  val row = new Bundle {
+    val meta = getMetadataType
+    val id = Valid(UInt(sourceWidth.W))
+  }
+  // row.id.valid set: the id is in use; cleared: the id is available
+  // val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
+  val occupancyTable = Mem(numSourceId, row)
+  when(reset.asBool) {
+    (0 until numSourceId).foreach { occupancyTable(_).id.valid := false.B }
+  }
+  val frees = (0 until numSourceId).map(!occupancyTable(_).id.valid)
+  val lowestFree = PriorityEncoder(frees)
+  val lowestFreeRow = occupancyTable(lowestFree)
+
+  io.id.valid := (if (ignoreInUse) true.B else !lowestFreeRow.id.valid)
+  io.id.bits := lowestFree
+  when(io.gen && io.id.valid /* fire */ ) {
+    occupancyTable(io.id.bits).id.valid := true.B // mark the handed-out id as in use
+    if (metadata.isDefined) {
+      occupancyTable(io.id.bits).meta := io.meta
+    }
+  }
+  when(io.reclaim.valid) {
+    // @perf: would this require multiple write ports?
+    occupancyTable(io.reclaim.bits).id.valid := false.B // mark the returned id as free
+  }
+  io.peek := {
+    if (metadata.isDefined) occupancyTable(io.reclaim.bits).meta else 0.U
+  }
+
+  when(io.gen && io.id.valid) {
+    when (!io.reclaim.valid) {
+      assert(outstanding < (1 << sourceWidth).U)
+      outstanding := outstanding + 1.U
+    }
+  }.elsewhen(io.reclaim.valid) {
+    assert(outstanding > 0.U)
+    outstanding := outstanding - 1.U
+  }
+  dontTouch(outstanding)
+}
+
+
 // VortexTile has dmemNodes, imemNodes
 //Param and Key are used during SoC Generation
@@ -21,6 +103,8 @@ case class VortexFatBankConfig(
   wordSize: Int, //This is the read/write granularity of the L1 cache
   cacheLineSize: Int,
   coreTagWidth: Int,
+  writeInfoReqQSize: Int,
+  mshrSize: Int
 ) {
   def coreTagPlusSizeWidth: Int = {
     log2Ceil(wordSize) + coreTagWidth
@@ -31,6 +115,8 @@ object defaultFatBankConfig extends VortexFatBankConfig(
   wordSize = 16,
   cacheLineSize = 16,
   coreTagWidth = 8,
+  writeInfoReqQSize = 16,
+  mshrSize = 8
 )
@@ -83,7 +169,8 @@ class VortexFatBankImp (
   val vxCache = Module(new VX_cache(
     WORD_SIZE=config.wordSize,
     CACHE_LINE_SIZE=config.cacheLineSize,
-    CORE_TAG_WIDTH= config.coreTagPlusSizeWidth
+    CORE_TAG_WIDTH= config.coreTagPlusSizeWidth,
+    MSHR_SIZE= config.mshrSize
     )
   );
@@ -94,16 +181,13 @@ class VortexFatBankImp (
     val id = UInt(32.W)
     val size = UInt(32.W)
   }
-
-  // assuming this is never full
-  val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, 64, true, false))
-
   class ReadReqInfo(config: VortexFatBankConfig) extends Bundle {
     val size = UInt(log2Ceil(config.wordSize).W)
     val id = UInt(config.coreTagWidth.W)
   }
+  val rcvWriteReqInfo = Module(new Queue((new WriteReqInfo).cloneType, config.writeInfoReqQSize, true, false))
   val readReqInfo = Wire(new ReadReqInfo(config))
   // Translate TL request from Coalescer to requests for VX_cache
@@ -112,16 +196,17 @@ class VortexFatBankImp (
     // coal -> vxCache request on channel A
     val coalToBankA = coalToBankBundle.a;
-    coalToBankA.ready :=
vxCache.io.core_req_ready
+    coalToBankA.ready := vxCache.io.core_req_ready && rcvWriteReqInfo.io.enq.ready //not optimal
-    vxCache.io.core_req_valid := coalToBankA.valid
+    vxCache.io.core_req_valid := coalToBankA.valid && rcvWriteReqInfo.io.enq.ready //gate valid exactly like ready: otherwise the cache accepts a request while channel A is stalled by the queue, and the replayed request is processed twice
     // read = 0, write = 1
     vxCache.io.core_req_rw := !(coalToBankA.bits.opcode === TLMessages.Get)
-    //4 is also hardcoded, it should be log2WordSize
-    vxCache.io.core_req_addr := coalToBankA.bits.address(31, 4)
+    //drop the low log2Ceil(wordSize) address bits, i.e. the byte offset inside one cache word
+    vxCache.io.core_req_addr := coalToBankA.bits.address(31, log2Ceil(config.wordSize))
     vxCache.io.core_req_byteen := coalToBankA.bits.mask
     vxCache.io.core_req_data := coalToBankA.bits.data
+    //combine size and tag field into one big wire, to put into vxCache.io.core_req_tag
     readReqInfo.id := coalToBankA.bits.source
     readReqInfo.size := coalToBankA.bits.size
     vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)
@@ -135,9 +220,7 @@ class VortexFatBankImp (
     // if we don't send ack, the coalescer will run out of IDs, and can't generate new request
     // for read request, we send AckData when the FatBank has a valid output
-    // for write request, we can immediate Ack on the next clock cycle (not the same clock cycle, otherwise critical path too long)
-    // It's possible that on the same cycle, we need to do both "AckData" and "Ack"
-    // in this case, we always priorize "Ack", this makes the design easier
+    // for write request, we can ack whenever we have a valid entry in rcvWriteReqInfo Queue
     //I think this just shows the flaws of Tilelink.
CPU never waits for an Ack upon regular write request
     //the Core should unconditionally move forward after every regular write request
@@ -150,39 +233,29 @@ class VortexFatBankImp (
     rcvWriteReqInfo.io.enq.bits.id := coalToBankA.bits.source
     rcvWriteReqInfo.io.enq.bits.size := coalToBankA.bits.size
-
-    rcvWriteReqInfo.io.deq.ready := coalToBankD.ready
+    //prioritize Ack for Read: a write-ack entry is only dequeued in a cycle where the cache has no read response to return
+    //vxCache.io.core_rsp_valid means readDataAck
+    rcvWriteReqInfo.io.deq.ready := coalToBankD.ready && ~vxCache.io.core_rsp_valid
 
-    //if we "need" to do Ack
-    //we unconditionally set the vxCache.ready to be false, so it gets delayed
-    vxCache.io.core_rsp_ready := Mux(
-      rcvWriteReqInfo.io.deq.valid,
-      false.B,
-      coalToBankD.ready
-    )
-
-    coalToBankD.valid := Mux(
-      rcvWriteReqInfo.io.deq.valid,
-      true.B,
-      vxCache.io.core_rsp_valid
-    )
+    vxCache.io.core_rsp_ready := coalToBankD.ready
+    coalToBankD.valid := vxCache.io.core_rsp_valid || rcvWriteReqInfo.io.deq.valid
     coalToBankD.bits.source := Mux(
-      rcvWriteReqInfo.io.deq.valid,
-      rcvWriteReqInfo.io.deq.bits.id,
-      vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id
+      vxCache.io.core_rsp_valid,
+      vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id,
+      rcvWriteReqInfo.io.deq.bits.id
     )
     coalToBankD.bits.opcode := Mux(
-      rcvWriteReqInfo.io.deq.valid,
-      TLMessages.AccessAck,
-      TLMessages.AccessAckData
+      vxCache.io.core_rsp_valid,
+      TLMessages.AccessAckData,
+      TLMessages.AccessAck
     )
     coalToBankD.bits.size := Mux(
-      rcvWriteReqInfo.io.deq.valid,
-      rcvWriteReqInfo.io.deq.bits.size,
-      vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size
+      vxCache.io.core_rsp_valid,
+      vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size,
+      rcvWriteReqInfo.io.deq.bits.size
     )
     coalToBankD.bits.param := 0.U
@@ -190,7 +263,7 @@ class VortexFatBankImp (
     coalToBankD.bits.denied := false.B
     coalToBankD.bits.corrupt := false.B
 
-    coalToBankD.bits.data :=
vxCache.io.core_rsp_data } @@ -200,10 +273,10 @@ class VortexFatBankImp ( //vx_cache can indeed guarantee that all active read operation has unique ID //However, since the cache is write_through, so it can't ensure unique ID for write operation //Therefore, we need our own internal source_ID generator for all write operation - // - //Now, we allocate id range: 0-15 for all write operation - // and 16-> above for read operation - val sourceGen = Module( new SourceGenerator(log2Ceil(16), ignoreInUse = false)) + + + val sourceGen = Module( new NewSourceGenerator(log2Ceil(config.mshrSize), metadata = Some(UInt(32.W)), ignoreInUse = false)) + // Translate VX_cache mem request to a TL request to be sent to L2 @@ -215,13 +288,9 @@ class VortexFatBankImp ( //Read Operation is ready as long as downstream L2 is ready - vxCache.io.mem_req_ready := vxCacheToL2A.ready + vxCache.io.mem_req_ready := vxCacheToL2A.ready - vxCacheToL2A.valid := Mux( - vxCache.io.mem_req_rw, - vxCache.io.mem_req_valid && sourceGen.io.id.valid, - vxCache.io.mem_req_valid - ) + vxCacheToL2A.valid := vxCache.io.mem_req_valid && sourceGen.io.id.valid vxCacheToL2A.bits.opcode := Mux( vxCache.io.mem_req_rw, @@ -230,21 +299,19 @@ class VortexFatBankImp ( ) vxCacheToL2A.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W)) + vxCacheToL2A.bits.mask := Mux( vxCache.io.mem_req_rw, vxCache.io.mem_req_byteen, 0xFFFF.U ) + vxCacheToL2A.bits.data := vxCache.io.mem_req_data - - vxCacheToL2A.bits.source := Mux( - vxCache.io.mem_req_rw, - sourceGen.io.id.bits, - vxCache.io.mem_req_tag + 16.U - ) - //mark current source_id as in-use - sourceGen.io.gen := vxCache.io.mem_req_rw && vxCacheToL2A.ready && vxCacheToL2A.valid + vxCacheToL2A.bits.source := sourceGen.io.id.bits + + sourceGen.io.gen := vxCacheToL2A.ready && vxCacheToL2A.valid + sourceGen.io.meta := vxCache.io.mem_req_tag //save the old read id vxCacheToL2A.bits.param := 0.U vxCacheToL2A.bits.size := 4.U @@ -257,10 +324,11 @@ class VortexFatBankImp ( 
vxCacheToL2D.ready := vxCache.io.mem_rsp_ready vxCache.io.mem_rsp_valid := vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAckData - vxCache.io.mem_rsp_tag := vxCacheToL2D.bits.source - 16.U // -16 for read resp, we can safely do this, since write-ack wouldn't pass through - vxCache.io.mem_rsp_data := vxCacheToL2D.bits.data + vxCache.io.mem_rsp_tag := sourceGen.io.peek + vxCache.io.mem_rsp_data := vxCacheToL2D.bits.data - sourceGen.io.reclaim.valid := vxCacheToL2D.ready && vxCacheToL2D.valid && vxCacheToL2D.bits.opcode === TLMessages.AccessAck + // all ids needs to be reclaimed + sourceGen.io.reclaim.valid := vxCacheToL2D.ready && vxCacheToL2D.valid sourceGen.io.reclaim.bits := vxCacheToL2D.bits.source }