From 6674e8c44a74dd314c8ce580a68bc4acf0faa38b Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 28 Aug 2021 21:34:06 -0700 Subject: [PATCH] cache bank area optimization + multi-porting fix for l2/l3 caches --- hw/rtl/VX_config.vh | 12 +- hw/rtl/Vortex.v | 1 + hw/rtl/cache/VX_bank.v | 146 +++++++++-------- hw/rtl/cache/VX_cache.v | 90 +++++++---- hw/rtl/cache/VX_cache_define.vh | 6 +- hw/rtl/cache/VX_core_req_bank_sel.v | 11 +- hw/rtl/cache/VX_core_rsp_merge.v | 234 ++++++++++++++++++++-------- hw/rtl/cache/VX_nc_bypass.v | 122 +++++++-------- hw/rtl/cache/VX_tag_access.v | 2 - 9 files changed, 388 insertions(+), 236 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index edc3e37e..5124116e 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -291,7 +291,7 @@ `define DNUM_BANKS `NUM_THREADS `endif -// Number of bank ports +// Number of ports per bank `ifndef DNUM_PORTS `define DNUM_PORTS 1 `endif @@ -361,6 +361,11 @@ `define L2NUM_BANKS `MIN(`NUM_CORES, 4) `endif +// Number of ports per bank +`ifndef L2NUM_PORTS +`define L2NUM_PORTS 1 +`endif + // Core Request Queue Size `ifndef L2CREQ_SIZE `define L2CREQ_SIZE 0 @@ -398,6 +403,11 @@ `define L3NUM_BANKS `MIN(`NUM_CLUSTERS, 4) `endif +// Number of ports per bank +`ifndef L3NUM_PORTS +`define L3NUM_PORTS 1 +`endif + // Core Request Queue Size `ifndef L3CREQ_SIZE `define L3CREQ_SIZE 0 diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 4d871e0e..f1be995d 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -85,6 +85,7 @@ module Vortex ( .CACHE_SIZE (`L3CACHE_SIZE), .CACHE_LINE_SIZE (`L3CACHE_LINE_SIZE), .NUM_BANKS (`L3NUM_BANKS), + .NUM_PORTS (`L3NUM_PORTS), .WORD_SIZE (`L3WORD_SIZE), .NUM_REQS (`L3NUM_REQS), .CREQ_SIZE (`L3CREQ_SIZE), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 6dda9a93..933f189e 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -39,7 +39,8 @@ module VX_bank #( // bank offset from beginning of index range parameter 
BANK_ADDR_OFFSET = 0, - localparam MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE) + localparam MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE), + localparam WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS) ) ( `SCOPE_IO_VX_bank @@ -56,13 +57,13 @@ module VX_bank #( // Core Request input wire core_req_valid, input wire [NUM_PORTS-1:0] core_req_pmask, - input wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] core_req_wsel, + input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] core_req_wsel, input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] core_req_byteen, input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_req_data, input wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_req_tid, + input wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, input wire core_req_rw, input wire [`LINE_ADDR_WIDTH-1:0] core_req_addr, - input wire [CORE_TAG_WIDTH-1:0] core_req_tag, output wire core_req_ready, // Core Response @@ -70,16 +71,17 @@ module VX_bank #( output wire [NUM_PORTS-1:0] core_rsp_pmask, output wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_rsp_tid, output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_rsp_data, - output wire [CORE_TAG_WIDTH-1:0] core_rsp_tag, + output wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire core_rsp_ready, // Memory request output wire mem_req_valid, output wire mem_req_rw, - output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen, + output wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen, + output wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel, output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr, output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, - output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data, + output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data, input wire mem_req_ready, // Memory response @@ -104,18 +106,18 @@ module VX_bank #( `endif wire [NUM_PORTS-1:0] creq_pmask; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] creq_wsel; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] creq_byteen; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] 
creq_data; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] creq_tid; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] creq_tag; wire creq_rw; wire [`LINE_ADDR_WIDTH-1:0] creq_addr; - wire [CORE_TAG_WIDTH-1:0] creq_tag; - + wire creq_valid, creq_ready; VX_elastic_buffer #( - .DATAW (CORE_TAG_WIDTH + 1 + `LINE_ADDR_WIDTH + (1 + `UP(`WORD_SELECT_BITS) + WORD_SIZE + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), + .DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)), .SIZE (CREQ_SIZE), .OUTPUT_REG (CREQ_SIZE > 2) ) core_req_queue ( @@ -123,8 +125,8 @@ module VX_bank #( .reset (reset), .ready_in (core_req_ready), .valid_in (core_req_valid), - .data_in ({core_req_tag, core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid}), - .data_out ({creq_tag, creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid}), + .data_in ({core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid, core_req_tag}), + .data_out ({creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid, creq_tag}), .ready_out (creq_ready), .valid_out (creq_valid) ); @@ -134,35 +136,33 @@ module VX_bank #( wire mshr_valid; wire [MSHR_ADDR_WIDTH-1:0] mshr_dequeue_id; wire [`LINE_ADDR_WIDTH-1:0] mshr_addr; - wire [CORE_TAG_WIDTH-1:0] mshr_tag; - wire [NUM_PORTS-1:0] mshr_pmask; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] mshr_wsel; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] mshr_tag; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mshr_wsel; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] mshr_tid; + wire [NUM_PORTS-1:0] mshr_pmask; wire [`LINE_ADDR_WIDTH-1:0] addr_st0, addr_st1; - wire mem_rw_st0, mem_rw_st1; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] wsel_st0, wsel_st1; + wire write_st0, write_st1; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel_st0, wsel_st1; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen_st0, byteen_st1; wire 
[NUM_PORTS-1:0][`REQS_BITS-1:0] req_tid_st0, req_tid_st1; wire [NUM_PORTS-1:0] pmask_st0, pmask_st1; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] tag_st0, tag_st1; wire [`CACHE_LINE_WIDTH-1:0] rdata_st1; wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1; wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1; - wire [CORE_TAG_WIDTH-1:0] tag_st0, tag_st1; wire valid_st0, valid_st1; wire is_fill_st0, is_fill_st1; wire is_mshr_st0, is_mshr_st1; wire miss_st0, miss_st1; - wire writeen_unqual_st1; wire is_flush_st0; wire mshr_pending_st0, mshr_pending_st1; wire crsq_valid, crsq_ready, crsq_stall; wire mreq_alm_full; - - wire creq_fire = creq_valid && creq_ready; - - wire fill_in_st0 = valid_st0 && is_fill_st0; + + wire rdw_fill_hazard = valid_st0 && is_fill_st0; + wire rdw_write_hazard = valid_st0 && write_st0 && ~creq_rw; // determine which queue to pop next in priority order wire mshr_grant = 1; @@ -174,24 +174,25 @@ module VX_bank #( wire creq_grant = !mshr_enable && !mrsq_enable && !flush_enable; wire mshr_ready = mshr_grant - && !fill_in_st0 // prevent tag read-during-write with fill - && !crsq_stall; // ensure core response ready + && !rdw_fill_hazard // prevent read-during-write + && !crsq_stall; // ensure core response ready assign mem_rsp_ready = mrsq_grant - && !crsq_stall; // ensure core response ready + && !crsq_stall; // ensure core response ready - assign creq_ready = creq_grant - && !mreq_alm_full // ensure memory request ready - && !mshr_alm_full // ensure mshr enqueue ready - && !crsq_stall; // ensure core response ready - - wire mshr_fire = mshr_valid && mshr_ready; + assign creq_ready = creq_grant + && !rdw_write_hazard // prevent read-during-write + && !mreq_alm_full // ensure memory request ready + && !mshr_alm_full // ensure mshr enqueue ready + && !crsq_stall; // ensure core response ready + wire mshr_fire = mshr_valid && mshr_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; + wire creq_fire = creq_valid && creq_ready; `ifdef 
DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[`CACHE_REQ_INFO_RNG] : creq_tag[`CACHE_REQ_INFO_RNG]; + assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG]; end else begin assign {debug_wid_sel, debug_pc_sel} = 0; end @@ -219,7 +220,7 @@ module VX_bank #( end VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + MSHR_ADDR_WIDTH), + .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH), .RESETW (1) ) pipe_reg0 ( .clk (clk), @@ -230,7 +231,7 @@ module VX_bank #( flush_enable, mrsq_enable || flush_enable, mshr_enable, - mshr_enable ? 1'b0 : creq_rw, + creq_fire && creq_rw, mshr_enable ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)), (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data : creq_line_data, mshr_enable ? mshr_wsel : creq_wsel, @@ -240,12 +241,12 @@ module VX_bank #( mshr_enable ? mshr_tag : creq_tag, mshr_enable ? mshr_dequeue_id : (mem_rsp_valid ? 
mem_rsp_id : mshr_alloc_id) }), - .data_out ({valid_st0, is_flush_st0, is_fill_st0, is_mshr_st0, mem_rw_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) + .data_out ({valid_st0, is_flush_st0, is_fill_st0, is_mshr_st0, write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) ); `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st0, debug_pc_st0} = tag_st0[`CACHE_REQ_INFO_RNG]; + assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG]; end else begin assign {debug_wid_st0, debug_pc_st0} = 0; end @@ -286,35 +287,33 @@ module VX_bank #( assign miss_st0 = !is_fill_st0 && !tag_match_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + MSHR_ADDR_WIDTH + 1), + .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (!crsq_stall), - .data_in ({valid_st0, is_fill_st0, is_mshr_st0, is_fill_st0, miss_st0, mem_rw_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0, mshr_pending_st0}), - .data_out ({valid_st1, is_fill_st1, is_mshr_st1, writeen_unqual_st1, miss_st1, mem_rw_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) + .data_in ({valid_st0, is_fill_st0, is_mshr_st0, miss_st0, write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0, mshr_pending_st0}), + .data_out ({valid_st1, is_fill_st1, is_mshr_st1, miss_st1, write_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) ); `ifdef DBG_CACHE_REQ_INFO if 
(CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st1, debug_pc_st1} = tag_st1[`CACHE_REQ_INFO_RNG]; + assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG]; end else begin assign {debug_wid_st1, debug_pc_st1} = 0; end `endif - wire writeen_st1 = (WRITE_ENABLE && !is_fill_st1 && mem_rw_st1 && !miss_st1) - || writeen_unqual_st1; + wire read_st1 = !is_fill_st1 && !write_st1; - wire readen_st1 = !is_fill_st1 && !mem_rw_st1; + wire writeen_st1 = (WRITE_ENABLE && write_st1 && !miss_st1) + || is_fill_st1; - wire crsq_push_st1 = readen_st1 && !miss_st1; - - wire do_writeback_st1 = !is_fill_st1 && mem_rw_st1; + wire crsq_push_st1 = read_st1 && !miss_st1; - wire mreq_push_st1 = (readen_st1 && miss_st1 && !mshr_pending_st1) - || do_writeback_st1; + wire mreq_push_st1 = (read_st1 && miss_st1 && !mshr_pending_st1) + || write_st1; wire [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] line_byteen_st1; @@ -356,7 +355,7 @@ module VX_bank #( .addr (addr_st1), // reading - .readen (valid_st1 && readen_st1), + .readen (valid_st1 && read_st1), .rdata (rdata_st1), // writing @@ -368,8 +367,8 @@ module VX_bank #( wire mshr_allocate = creq_fire && ~creq_rw; wire mshr_replay = do_fill_st0 && ~crsq_stall; - wire mshr_lookup = valid_st0 && !is_fill_st0 && ~is_mshr_st0 && ~mem_rw_st0 && ~crsq_stall; - wire mshr_release = valid_st1 && readen_st1 && ~is_mshr_st1 && ~miss_st1 && ~crsq_stall; + wire mshr_lookup = valid_st0 && ~write_st0 && ~is_mshr_st0 && ~crsq_stall; + wire mshr_release = valid_st1 && read_st1 && ~is_mshr_st1 && ~miss_st1 && ~crsq_stall; wire mshr_not_full; @@ -433,7 +432,7 @@ module VX_bank #( wire [NUM_PORTS-1:0] crsq_pmask; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] crsq_data; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] crsq_tid; - wire [CORE_TAG_WIDTH-1:0] crsq_tag; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] crsq_tag; assign crsq_valid = valid_st1 && crsq_push_st1; assign crsq_stall = crsq_valid && !crsq_ready; @@ -451,7 +450,7 @@ module 
VX_bank #( end VX_elastic_buffer #( - .DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), + .DATAW ((CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), .SIZE (CRSQ_SIZE), .OUTPUT_REG (1 == NUM_BANKS) ) core_rsp_req ( @@ -467,24 +466,37 @@ module VX_bank #( // Enqueue memory request - wire [CACHE_LINE_SIZE-1:0] mreq_byteen; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mreq_data; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mreq_byteen; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mreq_wsel; wire [`LINE_ADDR_WIDTH-1:0] mreq_addr; wire [MSHR_ADDR_WIDTH-1:0] mreq_id; - wire [`CACHE_LINE_WIDTH-1:0] mreq_data; + wire mreq_push, mreq_pop, mreq_empty, mreq_rw; assign mreq_push = valid_st1 && mreq_push_st1; assign mreq_pop = mem_req_valid && mem_req_ready; - assign mreq_rw = WRITE_ENABLE && do_writeback_st1; - assign mreq_byteen = mreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}}; - assign mreq_addr = addr_st1; - assign mreq_id = mshr_id_st1; - assign mreq_data = wdata_st1; + assign mreq_rw = WRITE_ENABLE && write_st1; + assign mreq_addr = addr_st1; + assign mreq_id = mshr_id_st1; + assign mreq_wsel = wsel_st1; + + if (NUM_PORTS > 1) begin + for (genvar p = 0; p < NUM_PORTS; ++p) begin + assign mreq_byteen[p] = pmask_st1[p] ? 
byteen_st1[p] : WORD_SIZE'(0); + end + end else begin + assign mreq_byteen[0] = byteen_st1[0]; + end + + for (genvar p = 0; p < NUM_PORTS; ++p) begin + assign mreq_data[p] = wdata_st1[wsel_st1[p] * `WORD_WIDTH +: `WORD_WIDTH]; + end VX_fifo_queue #( - .DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + `CACHE_LINE_WIDTH), + .DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), .SIZE (MREQ_SIZE), .ALM_FULL (MREQ_SIZE-2) ) mem_req_queue ( @@ -492,8 +504,8 @@ module VX_bank #( .reset (reset), .push (mreq_push), .pop (mreq_pop), - .data_in ({mreq_rw, mreq_byteen, mreq_addr, mreq_id, mreq_data}), - .data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_id, mem_req_data}), + .data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}), + .data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}), .empty (mreq_empty), .alm_full (mreq_alm_full), `UNUSED_PIN (full), @@ -515,8 +527,8 @@ module VX_bank #( `SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); `ifdef PERF_ENABLE - assign perf_read_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && !mem_rw_st1; - assign perf_write_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && mem_rw_st1; + assign perf_read_misses = valid_st1 && read_st1 && !is_mshr_st1 && miss_st1; + assign perf_write_misses = valid_st1 && write_st1 && !is_mshr_st1 && miss_st1; assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full; `endif @@ -550,7 +562,7 @@ module VX_bank #( dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); end if (mreq_push) begin - if (do_writeback_st1) + if (write_st1) dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, 
data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1); else dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1); diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index eab2004e..2429d5a6 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -44,7 +44,9 @@ module VX_cache #( parameter BANK_ADDR_OFFSET = 0, // enable bypass for non-cacheable addresses - parameter NC_ENABLE = 0 + parameter NC_ENABLE = 0, + + localparam WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS) ) ( `SCOPE_IO_VX_cache @@ -105,6 +107,29 @@ module VX_cache #( /////////////////////////////////////////////////////////////////////////// + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_p; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_p; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_p; + + reg [CACHE_LINE_SIZE-1:0] mem_req_byteen_r; + reg [`CACHE_LINE_WIDTH-1:0] mem_req_data_r; + + always @(*) begin + mem_req_byteen_r = 0; + mem_req_data_r = 'x; + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (mem_req_byteen_p[p] != 0) begin + mem_req_byteen_r[mem_req_wsel_p[p] * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p[p]; + mem_req_data_r[mem_req_wsel_p[p] * `WORD_WIDTH +: `WORD_WIDTH] = mem_req_data_p[p]; + end + end + end + + assign mem_req_byteen = mem_req_byteen_r; + assign mem_req_data = mem_req_data_r; + + /////////////////////////////////////////////////////////////////////////// + // Core request wire [NUM_REQS-1:0] core_req_valid_nc; wire [NUM_REQS-1:0] core_req_rw_nc; @@ -124,9 +149,10 @@ module VX_cache #( // Memory request wire mem_req_valid_nc; wire mem_req_rw_nc; - wire [CACHE_LINE_SIZE-1:0] mem_req_byteen_nc; wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc; - wire [`CACHE_LINE_WIDTH-1:0] mem_req_data_nc; + wire 
[NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc; wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc; wire mem_req_ready_nc; @@ -138,6 +164,7 @@ module VX_cache #( if (NC_ENABLE) begin VX_nc_bypass #( + .NUM_PORTS (NUM_PORTS), .NUM_REQS (NUM_REQS), .NUM_RSP_TAGS (`CORE_RSP_TAGS), .NC_TAG_BIT (0), @@ -147,7 +174,7 @@ module VX_cache #( .CORE_TAG_IN_WIDTH (CORE_TAG_WIDTH), .MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH), - .MEM_DATA_SIZE (CACHE_LINE_SIZE), + .MEM_DATA_SIZE (CACHE_LINE_SIZE), .MEM_TAG_IN_WIDTH (MEM_TAG_IN_WIDTH), .MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH) ) nc_bypass ( @@ -188,19 +215,21 @@ module VX_cache #( // Memory request in .mem_req_valid_in (mem_req_valid_nc), - .mem_req_rw_in (mem_req_rw_nc), - .mem_req_byteen_in (mem_req_byteen_nc), + .mem_req_rw_in (mem_req_rw_nc), .mem_req_addr_in (mem_req_addr_nc), + .mem_req_byteen_in (mem_req_byteen_nc), + .mem_req_wsel_in (mem_req_wsel_nc), .mem_req_data_in (mem_req_data_nc), .mem_req_tag_in (mem_req_tag_nc), .mem_req_ready_in (mem_req_ready_nc), // Memory request out .mem_req_valid_out (mem_req_valid), - .mem_req_rw_out (mem_req_rw), - .mem_req_byteen_out (mem_req_byteen), + .mem_req_rw_out (mem_req_rw), .mem_req_addr_out (mem_req_addr), - .mem_req_data_out (mem_req_data), + .mem_req_byteen_out (mem_req_byteen_p), + .mem_req_wsel_out (mem_req_wsel_p), + .mem_req_data_out (mem_req_data_p), .mem_req_tag_out (mem_req_tag), .mem_req_ready_out (mem_req_ready), @@ -234,8 +263,9 @@ module VX_cache #( assign mem_req_valid = mem_req_valid_nc; assign mem_req_rw = mem_req_rw_nc; assign mem_req_addr = mem_req_addr_nc; - assign mem_req_byteen = mem_req_byteen_nc; - assign mem_req_data = mem_req_data_nc; + assign mem_req_byteen_p = mem_req_byteen_nc; + assign mem_req_wsel_p = mem_req_wsel_nc; + assign mem_req_data_p = mem_req_data_nc; assign mem_req_tag = mem_req_tag_nc; assign mem_req_ready_nc = mem_req_ready; @@ -293,28 
+323,29 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_core_req_valid; wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_pmask; - wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] per_bank_core_req_wsel; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_req_tag; wire [NUM_BANKS-1:0] per_bank_core_req_rw; - wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr; - wire [NUM_BANKS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_req_tag; + wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr; wire [NUM_BANKS-1:0] per_bank_core_req_ready; wire [NUM_BANKS-1:0] per_bank_core_rsp_valid; wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid; - wire [NUM_BANKS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_rsp_tag; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_rsp_tag; wire [NUM_BANKS-1:0] per_bank_core_rsp_ready; wire [NUM_BANKS-1:0] per_bank_mem_req_valid; wire [NUM_BANKS-1:0] per_bank_mem_req_rw; - wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_mem_req_byteen; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] per_bank_mem_req_wsel; wire [NUM_BANKS-1:0][`MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr; wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id; - wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_mem_req_data; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_mem_req_data; wire [NUM_BANKS-1:0] 
per_bank_mem_req_ready; wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready; @@ -365,28 +396,29 @@ module VX_cache #( for (genvar i = 0; i < NUM_BANKS; i++) begin wire curr_bank_core_req_valid; wire [NUM_PORTS-1:0] curr_bank_core_req_pmask; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] curr_bank_core_req_wsel; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] curr_bank_core_req_wsel; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] curr_bank_core_req_byteen; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_core_req_data; - wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_req_tid; + wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_req_tid; + wire [NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] curr_bank_core_req_tag; wire curr_bank_core_req_rw; - wire [`LINE_ADDR_WIDTH-1:0] curr_bank_core_req_addr; - wire [CORE_TAG_X_WIDTH-1:0] curr_bank_core_req_tag; + wire [`LINE_ADDR_WIDTH-1:0] curr_bank_core_req_addr; wire curr_bank_core_req_ready; wire curr_bank_core_rsp_valid; wire [NUM_PORTS-1:0] curr_bank_core_rsp_pmask; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_core_rsp_data; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_rsp_tid; - wire [CORE_TAG_X_WIDTH-1:0] curr_bank_core_rsp_tag; + wire [NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] curr_bank_core_rsp_tag; wire curr_bank_core_rsp_ready; wire curr_bank_mem_req_valid; wire curr_bank_mem_req_rw; - wire [CACHE_LINE_SIZE-1:0] curr_bank_mem_req_byteen; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] curr_bank_mem_req_byteen; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] curr_bank_mem_req_wsel; wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr; wire [MSHR_ADDR_WIDTH-1:0] curr_bank_mem_req_id; - wire[`CACHE_LINE_WIDTH-1:0] curr_bank_mem_req_data; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_mem_req_data; wire curr_bank_mem_req_ready; wire curr_bank_mem_rsp_valid; @@ -419,6 +451,7 @@ module VX_cache #( assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid; assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw; assign per_bank_mem_req_byteen[i] = 
curr_bank_mem_req_byteen; + assign per_bank_mem_req_wsel[i] = curr_bank_mem_req_wsel; if (NUM_BANKS == 1) begin assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr; end else begin @@ -496,6 +529,7 @@ module VX_cache #( .mem_req_valid (curr_bank_mem_req_valid), .mem_req_rw (curr_bank_mem_req_rw), .mem_req_byteen (curr_bank_mem_req_byteen), + .mem_req_wsel (curr_bank_mem_req_wsel), .mem_req_addr (curr_bank_mem_req_addr), .mem_req_id (curr_bank_mem_req_id), .mem_req_data (curr_bank_mem_req_data), @@ -538,9 +572,9 @@ module VX_cache #( .core_rsp_ready (core_rsp_ready_nc) ); - wire [NUM_BANKS-1:0][(MEM_TAG_IN_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in; + wire [NUM_BANKS-1:0][(MEM_TAG_IN_WIDTH + 1 + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in; for (genvar i = 0; i < NUM_BANKS; i++) begin - assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_id[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_data[i]}; + assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_id[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_wsel[i], per_bank_mem_req_data[i]}; end wire [MSHR_ADDR_WIDTH-1:0] mem_req_id; @@ -549,7 +583,7 @@ module VX_cache #( VX_stream_arbiter #( .NUM_REQS (NUM_BANKS), - .DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH), + .DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), .BUFFERED (1) ) mem_req_arb ( .clk (clk), @@ -558,7 +592,7 @@ module VX_cache #( .data_in (data_in), .ready_in (per_bank_mem_req_ready), .valid_out (mem_req_valid_nc), - .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_byteen_nc, mem_req_data_nc}), + .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}), .ready_out (mem_req_ready_nc) ); diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 
52f4f06a..4679c642 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -9,8 +9,10 @@ `define REQS_BITS `LOG2UP(NUM_REQS) -// tag valid tid word_sel -`define MSHR_DATA_WIDTH (CORE_TAG_WIDTH + (1 + `REQS_BITS + `UP(`WORD_SELECT_BITS)) * NUM_PORTS) +`define PORTS_BITS `LOG2UP(NUM_PORTS) + +// tag valid tid word_sel +`define MSHR_DATA_WIDTH ((CORE_TAG_WIDTH + 1 + `REQS_BITS + `UP(`WORD_SELECT_BITS)) * NUM_PORTS) `define WORD_WIDTH (8 * WORD_SIZE) diff --git a/hw/rtl/cache/VX_core_req_bank_sel.v b/hw/rtl/cache/VX_core_req_bank_sel.v index 2ff9616e..f09aaea2 100644 --- a/hw/rtl/cache/VX_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_core_req_bank_sel.v @@ -43,7 +43,7 @@ module VX_core_req_bank_sel #( output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen, output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data, output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid, - output wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag, + output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag, input wire [`BANK_READY_COUNT-1:0] per_bank_core_req_ready ); `UNUSED_PARAM (CACHE_ID) @@ -80,9 +80,9 @@ module VX_core_req_bank_sel #( reg [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen_r; reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data_r; reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid_r; + reg [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r; reg [NUM_BANKS-1:0] per_bank_core_req_rw_r; reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr_r; - reg [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r; reg [NUM_REQS-1:0] core_req_ready_r; if (NUM_REQS > 1) begin @@ -129,10 +129,9 @@ module VX_core_req_bank_sel #( per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i]; 
per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i]; per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i); + per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i]; per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i]; per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i]; - per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i]; - req_select_table_r[core_req_bid[i]][i % NUM_PORTS] = (1 << i); end end @@ -177,9 +176,9 @@ module VX_core_req_bank_sel #( per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i]; per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i]; per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i); + per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i]; per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i]; - per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i]; - per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i]; + per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i]; end end end diff --git a/hw/rtl/cache/VX_core_rsp_merge.v b/hw/rtl/cache/VX_core_rsp_merge.v index 14823b0d..6fe84690 100644 --- a/hw/rtl/cache/VX_core_rsp_merge.v +++ b/hw/rtl/cache/VX_core_rsp_merge.v @@ -24,7 +24,7 @@ module VX_core_rsp_merge #( input wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask, input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data, input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid, - input wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag, + input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag, output wire [NUM_BANKS-1:0] per_bank_core_rsp_ready, // Core Response @@ -40,7 +40,7 @@ module VX_core_rsp_merge #( reg [NUM_REQS-1:0] core_rsp_valid_unqual; reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual; - reg [NUM_BANKS-1:0] 
core_rsp_bank_select; + reg [NUM_BANKS-1:0] per_bank_core_rsp_ready_r; if (CORE_TAG_ID_BITS != 0) begin @@ -51,61 +51,101 @@ module VX_core_rsp_merge #( reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; wire core_rsp_ready_unqual; - always @(*) begin - core_rsp_tag_unqual = 'x; - for (integer i = NUM_BANKS-1; i >= 0; --i) begin - if (per_bank_core_rsp_valid[i]) begin - core_rsp_tag_unqual = per_bank_core_rsp_tag[i]; - end - end - end - if (NUM_PORTS > 1) begin - always @(*) begin - core_rsp_valid_unqual = 0; - core_rsp_data_unqual = 'x; - core_rsp_bank_select = 0; - - for (integer i = 0; i < NUM_BANKS; i++) begin - for (integer p = 0; p < NUM_PORTS; p++) begin - if (per_bank_core_rsp_valid[i] - && per_bank_core_rsp_pmask[i][p] - && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin - core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1; - core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p]; - core_rsp_bank_select[i] = core_rsp_ready_unqual; + reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n; + + for (genvar i = 0; i < NUM_BANKS; ++i) begin + assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i]; + end + + always @(posedge clk) begin + if (reset) begin + per_bank_core_rsp_sent_r <= '0; + end else begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin + per_bank_core_rsp_sent_r[i] <= '0; + end else begin + per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i]; end end end end + always @(*) begin + core_rsp_tag_unqual = 'x; + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (per_bank_core_rsp_valid[i] + && per_bank_core_rsp_pmask[i][p] + && !per_bank_core_rsp_sent_r[i][p]) begin + core_rsp_tag_unqual = per_bank_core_rsp_tag[i][p]; 
+ end + end + end + end + + always @(*) begin + core_rsp_valid_unqual = 0; + core_rsp_data_unqual = 'x; + per_bank_core_rsp_sent = 0; + + for (integer i = 0; i < NUM_BANKS; ++i) begin + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (per_bank_core_rsp_valid[i] + && per_bank_core_rsp_pmask[i][p] + && !per_bank_core_rsp_sent_r[i][p] + && (per_bank_core_rsp_tag[i][p][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1; + core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p]; + per_bank_core_rsp_sent[i][p] = core_rsp_ready_unqual; + end + end + end + end + + always @(*) begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]); + end + end + end else begin `UNUSED_VAR (per_bank_core_rsp_pmask) - - always @(*) begin - core_rsp_valid_unqual = 0; - core_rsp_data_unqual = 'x; - core_rsp_bank_select = 0; - - for (integer i = 0; i < NUM_BANKS; i++) begin - if (per_bank_core_rsp_valid[i] - && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin - core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; - core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - core_rsp_bank_select[i] = core_rsp_ready_unqual; + + always @(*) begin + core_rsp_tag_unqual = 'x; + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + if (per_bank_core_rsp_valid[i]) begin + core_rsp_tag_unqual = per_bank_core_rsp_tag[i]; end end end - - end + + always @(*) begin + core_rsp_valid_unqual = 0; + core_rsp_data_unqual = 'x; + per_bank_core_rsp_ready_r = 0; + + for (integer i = 0; i < NUM_BANKS; i++) begin + if (per_bank_core_rsp_valid[i] + && (per_bank_core_rsp_tag[i][0][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; + core_rsp_data_unqual[per_bank_core_rsp_tid[i]] 
= per_bank_core_rsp_data[i]; + per_bank_core_rsp_ready_r[i] = core_rsp_ready_unqual; + end + end + end + end wire core_rsp_valid_any = (| per_bank_core_rsp_valid); VX_skid_buffer #( .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)) - ) pipe_reg ( + ) skid_buf ( .clk (clk), .reset (reset), .valid_in (core_rsp_valid_any), @@ -118,40 +158,102 @@ module VX_core_rsp_merge #( end else begin - `UNUSED_VAR (per_bank_core_rsp_pmask) - reg [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; - reg [NUM_REQS-1:0][NUM_BANKS-1:0] bank_select_table; - wire [NUM_REQS-1:0] core_rsp_ready_unqual; - always @(*) begin - core_rsp_valid_unqual = 0; - core_rsp_tag_unqual = 'x; - core_rsp_data_unqual = 'x; - bank_select_table = 'x; - - for (integer i = NUM_BANKS-1; i >= 0; --i) begin - if (per_bank_core_rsp_valid[i]) begin - core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; - core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; - core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - bank_select_table[per_bank_core_rsp_tid[i]] = (1 << i); - end - end - end + if (NUM_PORTS > 1) begin - always @(*) begin - for (integer i = 0; i < NUM_BANKS; i++) begin - core_rsp_bank_select[i] = core_rsp_ready_unqual[per_bank_core_rsp_tid[i]] - && bank_select_table[per_bank_core_rsp_tid[i]][i]; - end + reg [NUM_REQS-1:0][(`PORTS_BITS + `BANK_SELECT_BITS)-1:0] bank_select_table; + + reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n; + + for (genvar i = 0; i < NUM_BANKS; ++i) begin + assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i]; + end + + always @(posedge clk) begin + if (reset) begin + per_bank_core_rsp_sent_r <= '0; + end else begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin + per_bank_core_rsp_sent_r[i] <= '0; + end else begin 
+ per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i]; + end + end + end + end + + always @(*) begin + core_rsp_valid_unqual = '0; + core_rsp_tag_unqual = 'x; + core_rsp_data_unqual = 'x; + bank_select_table = 'x; + + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (per_bank_core_rsp_valid[i] + && per_bank_core_rsp_pmask[i][p] + && !per_bank_core_rsp_sent_r[i][p]) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1; + core_rsp_tag_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_tag[i][p]; + core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p]; + bank_select_table[per_bank_core_rsp_tid[i][p]] = {`PORTS_BITS'(p), `BANK_SELECT_BITS'(i)}; + end + end + end + end + + always @(*) begin + per_bank_core_rsp_sent = '0; + for (integer i = 0; i < NUM_REQS; i++) begin + if (core_rsp_valid_unqual[i]) begin + per_bank_core_rsp_sent[bank_select_table[i][0 +: `BANK_SELECT_BITS]][bank_select_table[i][`BANK_SELECT_BITS +: `PORTS_BITS]] = core_rsp_ready_unqual[i]; + end + end + end + + always @(*) begin + for (integer i = 0; i < NUM_BANKS; i++) begin + per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]); + end + end + + end else begin + + `UNUSED_VAR (per_bank_core_rsp_pmask) + reg [NUM_REQS-1:0][NUM_BANKS-1:0] bank_select_table; + + always @(*) begin + core_rsp_valid_unqual = 0; + core_rsp_tag_unqual = 'x; + core_rsp_data_unqual = 'x; + bank_select_table = 'x; + + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + if (per_bank_core_rsp_valid[i]) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; + core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; + core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; + bank_select_table[per_bank_core_rsp_tid[i]] = (1 << i); + end + end + end + + always @(*) begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + per_bank_core_rsp_ready_r[i] =
core_rsp_ready_unqual[per_bank_core_rsp_tid[i]] + && bank_select_table[per_bank_core_rsp_tid[i]][i]; + end + end end for (genvar i = 0; i < NUM_REQS; i++) begin VX_skid_buffer #( .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH) - ) pipe_reg ( + ) skid_buf ( .clk (clk), .reset (reset), .valid_in (core_rsp_valid_unqual[i]), @@ -167,9 +269,7 @@ module VX_core_rsp_merge #( end - for (genvar i = 0; i < NUM_BANKS; i++) begin - assign per_bank_core_rsp_ready[i] = core_rsp_bank_select[i]; - end + assign per_bank_core_rsp_ready = per_bank_core_rsp_ready_r; end else begin diff --git a/hw/rtl/cache/VX_nc_bypass.v b/hw/rtl/cache/VX_nc_bypass.v index 93ef5847..f1e19df7 100644 --- a/hw/rtl/cache/VX_nc_bypass.v +++ b/hw/rtl/cache/VX_nc_bypass.v @@ -1,6 +1,7 @@ `include "VX_cache_define.vh" module VX_nc_bypass #( + parameter NUM_PORTS = 1, parameter NUM_REQS = 1, parameter NUM_RSP_TAGS = 0, parameter NC_TAG_BIT = 0, @@ -10,13 +11,14 @@ module VX_nc_bypass #( parameter CORE_TAG_IN_WIDTH = 1, parameter MEM_ADDR_WIDTH = 1, - parameter MEM_DATA_SIZE = 1, + parameter MEM_DATA_SIZE = 1, parameter MEM_TAG_IN_WIDTH = 1, parameter MEM_TAG_OUT_WIDTH = 1, - localparam CORE_DATA_WIDTH = CORE_DATA_SIZE * 8, - localparam MEM_DATA_WIDTH = MEM_DATA_SIZE * 8, - localparam CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - 1 + localparam CORE_DATA_WIDTH = CORE_DATA_SIZE * 8, + localparam MEM_DATA_WIDTH = MEM_DATA_SIZE * 8, + localparam CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - 1, + localparam MEM_SELECT_BITS = `UP(`CLOG2(MEM_DATA_SIZE / CORE_DATA_SIZE)) ) ( input wire clk, input wire reset, @@ -57,8 +59,9 @@ module VX_nc_bypass #( input wire mem_req_valid_in, input wire mem_req_rw_in, input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in, - input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in, - input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in, + input wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in, + input wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in, + input wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] 
mem_req_data_in, input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in, output wire mem_req_ready_in, @@ -66,8 +69,9 @@ module VX_nc_bypass #( output wire mem_req_valid_out, output wire mem_req_rw_out, output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out, - output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out, - output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out, + output wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_out, + output wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_out, + output wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_out, output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out, input wire mem_req_ready_out, @@ -148,7 +152,7 @@ module VX_nc_bypass #( assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid; assign mem_req_ready_in = mem_req_ready_out; - wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_nc; + wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_c; VX_bits_insert #( .N (MEM_TAG_IN_WIDTH), @@ -157,74 +161,66 @@ module VX_nc_bypass #( ) mem_req_tag_insert ( .data_in (mem_req_tag_in), .sel_in ('0), - .data_out (mem_req_tag_in_nc) + .data_out (mem_req_tag_in_c) ); + wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel; + wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel; + wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel; + wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel; + wire core_req_rw_in_sel; + if (NUM_REQS > 1) begin - - wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel; - wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel; - wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel; - wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel; - wire core_req_rw_in_sel; - wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in; for (genvar i = 0; i < NUM_REQS; ++i) begin assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]}; end assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = 
core_req_nc_mux_in[core_req_nc_tid]; - - assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel; - assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH]; - - for (genvar i = 0; i < P; ++i) begin - assign mem_req_data_out[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] = mem_req_valid_in ? - mem_req_data_in[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] : core_req_data_in_sel; - end - - if (D != 0) begin - wire [D-1:0] req_addr_idx = core_req_addr_in_sel[D-1:0]; - reg [MEM_DATA_SIZE-1:0] mem_req_byteen_in_r; - always @(*) begin - mem_req_byteen_in_r = 0; - mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in_sel; - end - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel}); - end else begin - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel}); - end end else begin - `UNUSED_VAR (core_req_nc_tid) + assign core_req_tag_in_sel = core_req_tag_in; + assign core_req_data_in_sel = core_req_data_in; + assign core_req_byteen_in_sel = core_req_byteen_in; + assign core_req_addr_in_sel = core_req_addr_in; + assign core_req_rw_in_sel = core_req_rw_in; + end + + assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel; + assign mem_req_addr_out = mem_req_valid_in ? 
mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH]; + + if (D != 0) begin + reg [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r; + reg [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in_r; + reg [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r; + + wire [D-1:0] req_addr_idx = core_req_addr_in_sel[D-1:0]; - assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in; - assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in[0][D +: MEM_ADDR_WIDTH]; + always @(*) begin + mem_req_byteen_in_r = 0; + mem_req_byteen_in_r[0] = core_req_byteen_in_sel; - for (genvar i = 0; i < P; ++i) begin - assign mem_req_data_out[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] = mem_req_valid_in ? - mem_req_data_in[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] : core_req_data_in; + mem_req_wsel_in_r = 'x; + mem_req_wsel_in_r[0] = req_addr_idx; + + mem_req_data_in_r = 'x; + mem_req_data_in_r[0] = core_req_data_in_sel; end - if (D != 0) begin - wire [D-1:0] req_addr_idx = core_req_addr_in[0][D-1:0]; - reg [MEM_DATA_SIZE-1:0] mem_req_byteen_in_r; - always @(*) begin - mem_req_byteen_in_r = 0; - mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in; - end - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({req_addr_idx, core_req_tag_in}); - end else begin - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'(core_req_tag_in); - end + assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; + assign mem_req_wsel_out = mem_req_valid_in ? mem_req_wsel_in : mem_req_wsel_in_r; + assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r; + assign mem_req_tag_out = mem_req_valid_in ? 
MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel}); + end else begin + `UNUSED_VAR (mem_req_wsel_in) + assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel; + assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel; + assign mem_req_wsel_out = 0; + assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel}); end // core response handling - wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out_unqual; + wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out_c; wire is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT]; @@ -236,7 +232,7 @@ module VX_nc_bypass #( ) core_rsp_tag_insert ( .data_in (core_rsp_tag_in[i]), .sel_in ('0), - .data_out (core_rsp_tag_out_unqual[i]) + .data_out (core_rsp_tag_out_c[i]) ); end @@ -262,14 +258,14 @@ module VX_nc_bypass #( for (genvar i = 0; i < NUM_REQS; ++i) begin assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in; end - end + end for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_out_unqual[i] : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; + assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_out_c[i] : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; end end else begin assign core_rsp_valid_out = core_rsp_valid_in || is_mem_rsp_nc; - assign core_rsp_tag_out = core_rsp_valid_in ? core_rsp_tag_out_unqual : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; + assign core_rsp_tag_out = core_rsp_valid_in ? 
core_rsp_tag_out_c : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; assign core_rsp_ready_in = core_rsp_ready_out; if (NUM_REQS > 1) begin diff --git a/hw/rtl/cache/VX_tag_access.v b/hw/rtl/cache/VX_tag_access.v index 708220ae..b0b4226a 100644 --- a/hw/rtl/cache/VX_tag_access.v +++ b/hw/rtl/cache/VX_tag_access.v @@ -48,8 +48,6 @@ module VX_tag_access #( VX_sp_ram #( .DATAW (`TAG_SELECT_BITS + 1), .SIZE (`LINES_PER_BANK), - .INIT_ENABLE (1), - .INIT_VALUE (0), .NO_RWCHECK (1) ) tag_store ( .clk( clk),