From 4b7d871d626c63c6ca1a83372c5a7969187b63da Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 21 Dec 2020 03:53:13 -0800 Subject: [PATCH] allowing partial cache request submissions, io bus support broken --- hw/rtl/VX_cluster.v | 89 +------- hw/rtl/VX_config.vh | 4 +- hw/rtl/VX_core.v | 50 +--- hw/rtl/VX_databus_arb.v | 188 +++++++-------- hw/rtl/VX_dcache_arb.v | 156 ------------- hw/rtl/VX_fpu_unit.v | 2 +- hw/rtl/VX_lsu_unit.v | 110 +++++---- hw/rtl/VX_mem_unit.v | 26 +-- hw/rtl/VX_mul_unit.v | 2 +- hw/rtl/VX_pipeline.v | 6 +- hw/rtl/Vortex.v | 89 +------- hw/rtl/afu/vortex_afu.sv | 15 -- hw/rtl/cache/VX_bank.v | 59 ++--- hw/rtl/cache/VX_bank_core_req_queue.v | 215 ------------------ hw/rtl/cache/VX_cache.v | 101 ++++---- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 87 +++---- hw/rtl/cache/VX_cache_core_rsp_merge.v | 1 - hw/rtl/cache/VX_miss_resrv.v | 2 +- hw/rtl/cache/VX_snp_forwarder.v | 2 +- hw/rtl/fp_cores/altera/acl_gen.log | 56 ++--- hw/rtl/fp_cores/altera/acl_gen.sh | 14 +- hw/rtl/interfaces/VX_cache_core_req_if.v | 14 +- .../{VX_cam_buffer.v => VX_index_buffer.v} | 14 +- hw/syn/quartus/top8/Makefile | 8 +- 24 files changed, 342 insertions(+), 968 deletions(-) delete mode 100644 hw/rtl/VX_dcache_arb.v delete mode 100644 hw/rtl/cache/VX_bank_core_req_queue.v rename hw/rtl/libs/{VX_cam_buffer.v => VX_index_buffer.v} (88%) diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 61a6a978..18cb2d7c 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -36,22 +36,7 @@ module VX_cluster #( output wire [`L2SNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, - // I/O request - output wire [`NUM_THREADS-1:0] io_req_valid, - output wire io_req_rw, - output wire [`NUM_THREADS-1:0][3:0] io_req_byteen, - output wire [`NUM_THREADS-1:0][29:0] io_req_addr, - output wire [`NUM_THREADS-1:0][31:0] io_req_data, - output wire [`L2CORE_TAG_WIDTH-1:0] io_req_tag, - input wire io_req_ready, - - // I/O response - input wire io_rsp_valid, - input wire [31:0] io_rsp_data, - input wire [`L2CORE_TAG_WIDTH-1:0] io_rsp_tag, - output wire io_rsp_ready, - - // CSR I/O Request + // CSR Request input wire csr_io_req_valid, input wire [`NC_BITS-1:0] csr_io_req_coreid, input wire [11:0] csr_io_req_addr, @@ -59,7 +44,7 @@ module VX_cluster #( input wire [31:0] csr_io_req_data, output wire csr_io_req_ready, - // CSR I/O Response + // CSR Response output wire csr_io_rsp_valid, output wire [31:0] csr_io_rsp_data, input wire csr_io_rsp_ready, @@ -91,19 +76,6 @@ module VX_cluster #( wire [`NUM_CORES-1:0][`DSNP_TAG_WIDTH-1:0] per_core_snp_rsp_tag; wire [`NUM_CORES-1:0] per_core_snp_rsp_ready; - wire [`NUM_CORES-1:0][`NUM_THREADS-1:0] per_core_io_req_valid; - wire [`NUM_CORES-1:0] per_core_io_req_rw; - wire [`NUM_CORES-1:0][`NUM_THREADS-1:0][3:0] per_core_io_req_byteen; - wire [`NUM_CORES-1:0][`NUM_THREADS-1:0][29:0] per_core_io_req_addr; - wire [`NUM_CORES-1:0][`NUM_THREADS-1:0][31:0] per_core_io_req_data; - wire [`NUM_CORES-1:0][`DCORE_TAG_WIDTH-1:0] per_core_io_req_tag; - wire [`NUM_CORES-1:0] per_core_io_req_ready; - - wire [`NUM_CORES-1:0] per_core_io_rsp_valid; - wire [`NUM_CORES-1:0][`DCORE_TAG_WIDTH-1:0] per_core_io_rsp_tag; - wire [`NUM_CORES-1:0][31:0] per_core_io_rsp_data; - wire [`NUM_CORES-1:0] per_core_io_rsp_ready; - wire [`NUM_CORES-1:0] per_core_csr_io_req_valid; wire [`NUM_CORES-1:0][11:0] per_core_csr_io_req_addr; wire [`NUM_CORES-1:0] per_core_csr_io_req_rw; @@ -149,19 +121,6 @@ module VX_cluster #( .snp_rsp_tag (per_core_snp_rsp_tag [i]), .snp_rsp_ready (per_core_snp_rsp_ready [i]), - .io_req_valid (per_core_io_req_valid [i]), - .io_req_rw (per_core_io_req_rw [i]), - .io_req_byteen (per_core_io_req_byteen [i]), - .io_req_addr (per_core_io_req_addr [i]), - .io_req_data (per_core_io_req_data [i]), - .io_req_tag (per_core_io_req_tag [i]), - .io_req_ready (per_core_io_req_ready [i]), - - .io_rsp_valid (per_core_io_rsp_valid [i]), - .io_rsp_data (per_core_io_rsp_data [i]), - .io_rsp_tag (per_core_io_rsp_tag [i]), - .io_rsp_ready (per_core_io_rsp_ready [i]), - .csr_io_req_valid (per_core_csr_io_req_valid[i]), .csr_io_req_rw (per_core_csr_io_req_rw [i]), .csr_io_req_addr (per_core_csr_io_req_addr [i]), @@ -175,49 +134,7 @@ module VX_cluster #( .busy (per_core_busy [i]), .ebreak (per_core_ebreak [i]) ); - end - - VX_databus_arb #( - .NUM_REQS (`NUM_CORES), - .WORD_SIZE (4), - .TAG_IN_WIDTH (`DCORE_TAG_WIDTH), - .TAG_OUT_WIDTH (`L2CORE_TAG_WIDTH), - .BUFFERED_REQ (`NUM_CORES >= 4), - .BUFFERED_RSP (1) - ) io_arb ( - .clk (clk), - .reset (reset), - - // input requests - .req_valid_in (per_core_io_req_valid), - .req_rw_in (per_core_io_req_rw), - .req_byteen_in (per_core_io_req_byteen), - .req_addr_in (per_core_io_req_addr), - .req_data_in (per_core_io_req_data), - .req_tag_in (per_core_io_req_tag), - .req_ready_in (per_core_io_req_ready), - - // output request - .req_valid_out (io_req_valid), - .req_rw_out (io_req_rw), - .req_byteen_out (io_req_byteen), - .req_addr_out (io_req_addr), - .req_data_out (io_req_data), - .req_tag_out (io_req_tag), - .req_ready_out (io_req_ready), - - // input response - .rsp_valid_in (io_rsp_valid), - .rsp_tag_in (io_rsp_tag), - .rsp_data_in (io_rsp_data), - .rsp_ready_in (io_rsp_ready), - - // output responses - .rsp_valid_out (per_core_io_rsp_valid), - .rsp_data_out (per_core_io_rsp_data), - .rsp_tag_out (per_core_io_rsp_tag), - .rsp_ready_out (per_core_io_rsp_ready) - ); + end VX_csr_io_arb #( .NUM_REQS (`NUM_CORES), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index a1b13b38..d3c7664a 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -289,7 +289,7 @@ // Size of cache in bytes `ifndef DCACHE_SIZE -`define DCACHE_SIZE 4096 +`define DCACHE_SIZE 8192 `endif // Number of banks @@ -336,7 +336,7 @@ // Size of cache in bytes `ifndef SMEM_SIZE -`define SMEM_SIZE 2048 +`define SMEM_SIZE 4096 `endif // Number of banks diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index 79d5ce9d..585c3fc9 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -35,21 +35,6 @@ module VX_core #( output wire [`DSNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, - // I/O request - output wire [`NUM_THREADS-1:0] io_req_valid, - output wire io_req_rw, - output wire [`NUM_THREADS-1:0][3:0] io_req_byteen, - output wire [`NUM_THREADS-1:0][29:0] io_req_addr, - output wire [`NUM_THREADS-1:0][31:0] io_req_data, - output wire [`DCORE_TAG_WIDTH-1:0] io_req_tag, - input wire io_req_ready, - - // I/O response - input wire io_rsp_valid, - input wire [31:0] io_rsp_data, - input wire [`DCORE_TAG_WIDTH-1:0] io_rsp_tag, - output wire io_rsp_ready, - // CSR I/O request input wire csr_io_req_valid, input wire [11:0] csr_io_req_addr, @@ -117,35 +102,6 @@ module VX_core #( //-- - VX_cache_core_req_if #( - .NUM_REQS(`DNUM_REQUESTS), - .WORD_SIZE(`DWORD_SIZE), - .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH), - .CORE_TAG_ID_BITS(`DCORE_TAG_ID_BITS) - ) io_req_if(); - - VX_cache_core_rsp_if #( - .NUM_REQS(`DNUM_REQUESTS), - .WORD_SIZE(`DWORD_SIZE), - .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH), - .CORE_TAG_ID_BITS(`DCORE_TAG_ID_BITS) - ) io_rsp_if(); - - assign io_req_valid = io_req_if.valid; - assign io_req_rw = io_req_if.rw; - assign io_req_byteen = io_req_if.byteen; - assign io_req_addr = io_req_if.addr; - assign io_req_data = io_req_if.data; - assign io_req_tag = io_req_if.tag; - assign io_req_if.ready = io_req_ready; - - assign io_rsp_if.valid = {{(`NUM_THREADS-1){1'b0}}, io_rsp_valid}; - assign io_rsp_if.data[0] = io_rsp_data; - assign io_rsp_if.tag = io_rsp_tag; - assign io_rsp_ready = io_rsp_if.ready; - - //-- - VX_cache_core_req_if #( .NUM_REQS(`DNUM_REQUESTS), .WORD_SIZE(`DWORD_SIZE), @@ -259,11 +215,7 @@ module VX_core #( // DRAM .dram_req_if (dram_req_if), - .dram_rsp_if (dram_rsp_if), - - // I/O - .io_req_if (io_req_if), - .io_rsp_if (io_rsp_if) + .dram_rsp_if (dram_rsp_if) ); endmodule diff --git a/hw/rtl/VX_databus_arb.v b/hw/rtl/VX_databus_arb.v index 6830d0f2..3bea8ea6 100644 --- a/hw/rtl/VX_databus_arb.v +++ b/hw/rtl/VX_databus_arb.v @@ -1,125 +1,107 @@ `include "VX_define.vh" -module VX_databus_arb #( - parameter NUM_REQS = 1, - parameter WORD_SIZE = 1, - parameter TAG_IN_WIDTH = 1, - parameter TAG_OUT_WIDTH = 1, - parameter BUFFERED_REQ = 0, - parameter BUFFERED_RSP = 0, +module VX_databus_arb ( + input wire clk, + input wire reset, - parameter WORD_WIDTH = WORD_SIZE * 8, - parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE), - parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) -) ( - input wire clk, - input wire reset, + // input request + VX_cache_core_req_if core_req_if, - // input requests - input wire [NUM_REQS-1:0][`NUM_THREADS-1:0] req_valid_in, - input wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] req_tag_in, - input wire [NUM_REQS-1:0][`NUM_THREADS-1:0][ADDR_WIDTH-1:0] req_addr_in, - input wire [NUM_REQS-1:0] req_rw_in, - input wire [NUM_REQS-1:0][`NUM_THREADS-1:0][WORD_SIZE-1:0] req_byteen_in, - input wire [NUM_REQS-1:0][`NUM_THREADS-1:0][WORD_WIDTH-1:0] req_data_in, - output wire [NUM_REQS-1:0] req_ready_in, + // output requests + VX_cache_core_req_if cache_req_if, + VX_cache_core_req_if smem_req_if, - // output request - output wire [`NUM_THREADS-1:0] req_valid_out, - output wire [TAG_OUT_WIDTH-1:0] req_tag_out, - output wire [`NUM_THREADS-1:0][ADDR_WIDTH-1:0] req_addr_out, - output wire req_rw_out, - output wire [`NUM_THREADS-1:0][WORD_SIZE-1:0] req_byteen_out, - output wire [`NUM_THREADS-1:0][WORD_WIDTH-1:0] req_data_out, - input wire req_ready_out, + // input responses + VX_cache_core_rsp_if cache_rsp_if, + VX_cache_core_rsp_if smem_rsp_if, - // input response - input wire rsp_valid_in, - input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, - input wire [WORD_WIDTH-1:0] rsp_data_in, - output wire rsp_ready_in, - - // output responses - output wire [NUM_REQS-1:0] rsp_valid_out, - output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, - output wire [NUM_REQS-1:0][WORD_WIDTH-1:0] rsp_data_out, - input wire [NUM_REQS-1:0] rsp_ready_out + // output response + VX_cache_core_rsp_if core_rsp_if ); - localparam REQ_DATAW = `NUM_THREADS + TAG_OUT_WIDTH + (`NUM_THREADS * ADDR_WIDTH) + 1 + (`NUM_THREADS * WORD_SIZE) + (`NUM_THREADS * WORD_WIDTH); - localparam RSP_DATAW = TAG_IN_WIDTH + WORD_WIDTH; + localparam REQ_ADDRW = 32 - `CLOG2(`DWORD_SIZE); + localparam REQ_DATAW = REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; - if (NUM_REQS > 1) begin + // + // handle requests + // - wire [NUM_REQS-1:0] valids; - wire [NUM_REQS-1:0][REQ_DATAW-1:0] data_in; - wire [`NUM_THREADS-1:0] req_tmask_out; - wire req_valid_out_unqual; + for (genvar i = 0; i < `NUM_THREADS; ++i) begin - for (genvar i = 0; i < NUM_REQS; i++) begin - assign valids[i] = (| req_valid_in[i]); - assign data_in[i] = {req_valid_in[i], {req_tag_in[i], LOG_NUM_REQS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; - end + wire cache_req_ready_in; + wire smem_req_ready_in; - VX_stream_arbiter #( - .NUM_REQS (NUM_REQS), - .DATAW (REQ_DATAW), - .BUFFERED (BUFFERED_REQ) - ) req_arb ( + // select shared memory bus + wire is_smem_addr = core_req_if.valid[i] && `SM_ENABLE + && (core_req_if.addr[i] >= REQ_ADDRW'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> 2)) + && (core_req_if.addr[i] < REQ_ADDRW'(`SHARED_MEM_BASE_ADDR >> 2)); + + VX_skid_buffer #( + .DATAW (REQ_DATAW), + .PASSTHRU (1) + ) cache_out_buffer ( .clk (clk), .reset (reset), - .valid_in (valids), - .data_in (data_in), - .ready_in (req_ready_in), - .valid_out (req_valid_out_unqual), - .data_out ({req_tmask_out, req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), - .ready_out (req_ready_out) + .valid_in (core_req_if.valid[i] && !is_smem_addr), + .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), + .ready_in (cache_req_ready_in), + .valid_out (cache_req_if.valid[i]), + .data_out ({cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), + .ready_out (cache_req_if.ready[i]) ); - assign req_valid_out = {`NUM_THREADS{req_valid_out_unqual}} & req_tmask_out; - - /////////////////////////////////////////////////////////////////////// - - wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[LOG_NUM_REQS-1:0]; - - wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_merged_data_out; - for (genvar i = 0; i < NUM_REQS; i++) begin - assign {rsp_tag_out[i], rsp_data_out[i]} = rsp_merged_data_out[i]; - end - - VX_stream_demux #( - .NUM_REQS (NUM_REQS), - .DATAW (RSP_DATAW), - .BUFFERED (BUFFERED_RSP) - ) rsp_demux ( + VX_skid_buffer #( + .DATAW (REQ_DATAW), + .PASSTHRU (1) + ) smem_out_buffer ( .clk (clk), .reset (reset), - .sel (rsp_sel), - .valid_in (rsp_valid_in), - .data_in ({rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH], rsp_data_in}), - .ready_in (rsp_ready_in), - .valid_out (rsp_valid_out), - .data_out (rsp_merged_data_out), - .ready_out (rsp_ready_out) + .valid_in (core_req_if.valid[i] && is_smem_addr), + .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), + .ready_in (smem_req_ready_in), + .valid_out (smem_req_if.valid[i]), + .data_out ({smem_req_if.addr[i], smem_req_if.rw[i], smem_req_if.byteen[i], smem_req_if.data[i], smem_req_if.tag[i]}), + .ready_out (smem_req_if.ready[i]) ); - - end else begin - - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) - - assign req_valid_out = req_valid_in; - assign req_tag_out = req_tag_in; - assign req_addr_out = req_addr_in; - assign req_rw_out = req_rw_in; - assign req_byteen_out = req_byteen_in; - assign req_data_out = req_data_in; - assign req_ready_in = req_ready_out; - - assign rsp_valid_out = rsp_valid_in; - assign rsp_tag_out = rsp_tag_in; - assign rsp_data_out = rsp_data_in; - assign rsp_ready_in = rsp_ready_out; + assign core_req_if.ready[i] = is_smem_addr ? smem_req_ready_in : cache_req_ready_in; end + // + // handle responses + // + + wire [1:0][RSP_DATAW-1:0] rsp_data_in; + wire [1:0] rsp_valid_in; + wire [1:0] rsp_ready_in; + + wire core_rsp_valid; + wire [`NUM_THREADS-1:0] core_rsp_tmask; + + assign rsp_data_in[0] = {cache_rsp_if.valid, cache_rsp_if.data, cache_rsp_if.tag}; + assign rsp_data_in[1] = {smem_rsp_if.valid, smem_rsp_if.data, smem_rsp_if.tag}; + + assign rsp_valid_in[0] = (| cache_rsp_if.valid); + assign rsp_valid_in[1] = (| smem_rsp_if.valid) & `SM_ENABLE; + + VX_stream_arbiter #( + .NUM_REQS (2), + .DATAW (RSP_DATAW), + .BUFFERED (0) + ) rsp_arb ( + .clk (clk), + .reset (reset), + .valid_in (rsp_valid_in), + .data_in (rsp_data_in), + .ready_in (rsp_ready_in), + .valid_out (core_rsp_valid), + .data_out ({core_rsp_tmask, core_rsp_if.data, core_rsp_if.tag}), + .ready_out (core_rsp_if.ready) + ); + + assign cache_rsp_if.ready = rsp_ready_in[0]; + assign smem_rsp_if.ready = rsp_ready_in[1]; + + assign core_rsp_if.valid = core_rsp_tmask & {`NUM_THREADS{core_rsp_valid}}; + endmodule \ No newline at end of file diff --git a/hw/rtl/VX_dcache_arb.v b/hw/rtl/VX_dcache_arb.v deleted file mode 100644 index 1926ceb7..00000000 --- a/hw/rtl/VX_dcache_arb.v +++ /dev/null @@ -1,156 +0,0 @@ -`include "VX_define.vh" - -module VX_dcache_arb ( - input wire clk, - input wire reset, - - // input request - VX_cache_core_req_if core_req_if, - - // output requests - VX_cache_core_req_if cache_req_if, - VX_cache_core_req_if smem_req_if, - VX_cache_core_req_if io_req_if, - - // input responses - VX_cache_core_rsp_if cache_rsp_if, - VX_cache_core_rsp_if smem_rsp_if, - VX_cache_core_rsp_if io_rsp_if, - - // output response - VX_cache_core_rsp_if core_rsp_if -); - localparam REQ_ADDRW = 32 - `CLOG2(`DWORD_SIZE); - localparam REQ_DATAW = `NUM_THREADS + 1 + `NUM_THREADS * `DWORD_SIZE + `NUM_THREADS * REQ_ADDRW + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; - localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; - - // - // select request - // - - // select shared memory bus - wire is_smem_addr = core_req_if.valid[0] && `SM_ENABLE - && (core_req_if.addr[0] >= REQ_ADDRW'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> 2)) - && (core_req_if.addr[0] < REQ_ADDRW'(`SHARED_MEM_BASE_ADDR >> 2)); - - // select io bus - wire is_io_addr = core_req_if.valid[0] - && (core_req_if.addr[0] >= REQ_ADDRW'(`IO_BUS_BASE_ADDR >> 2)); - - wire cache_req_valid_out; - wire [`NUM_THREADS-1:0] cache_req_tmask; - wire cache_req_ready_in; - - wire smem_req_valid_out; - wire [`NUM_THREADS-1:0] smem_req_tmask; - wire smem_req_ready_in; - - wire io_req_valid_out; - wire [`NUM_THREADS-1:0] io_req_tmask; - wire io_req_ready_in; - - reg [2:0] req_select; - reg req_ready; - - VX_skid_buffer #( - .DATAW (REQ_DATAW) - ) cache_out_buffer ( - .clk (clk), - .reset (reset), - .valid_in (req_select[0]), - .data_in ({core_req_if.valid, core_req_if.addr, core_req_if.rw, core_req_if.byteen, core_req_if.data, core_req_if.tag}), - .ready_in (cache_req_ready_in), - .valid_out (cache_req_valid_out), - .data_out ({cache_req_tmask, cache_req_if.addr, cache_req_if.rw, cache_req_if.byteen, cache_req_if.data, cache_req_if.tag}), - .ready_out (cache_req_if.ready) - ); - - assign cache_req_if.valid = cache_req_tmask & {`NUM_THREADS{cache_req_valid_out}}; - - VX_skid_buffer #( - .DATAW (REQ_DATAW) - ) smem_out_buffer ( - .clk (clk), - .reset (reset), - .valid_in (req_select[1]), - .data_in ({core_req_if.valid, core_req_if.addr, core_req_if.rw, core_req_if.byteen, core_req_if.data, core_req_if.tag}), - .ready_in (smem_req_ready_in), - .valid_out (smem_req_valid_out), - .data_out ({smem_req_tmask, smem_req_if.addr, smem_req_if.rw, smem_req_if.byteen, smem_req_if.data, smem_req_if.tag}), - .ready_out (smem_req_if.ready) - ); - - assign smem_req_if.valid = smem_req_tmask & {`NUM_THREADS{smem_req_valid_out}}; - - VX_skid_buffer #( - .DATAW (REQ_DATAW) - ) io_out_buffer ( - .clk (clk), - .reset (reset), - .valid_in (req_select[2]), - .data_in ({core_req_if.valid, core_req_if.addr, core_req_if.rw, core_req_if.byteen, core_req_if.data, core_req_if.tag}), - .ready_in (io_req_ready_in), - .valid_out (io_req_valid_out), - .data_out ({io_req_tmask, io_req_if.addr, io_req_if.rw, io_req_if.byteen, io_req_if.data, io_req_if.tag}), - .ready_out (io_req_if.ready) - ); - - assign io_req_if.valid = io_req_tmask & {`NUM_THREADS{io_req_valid_out}}; - - always @(*) begin - req_select = 0; - if (is_smem_addr) begin - req_select[1] = 1; - req_ready = smem_req_ready_in; - end else if (is_io_addr) begin - req_select[2] = 1; - req_ready = io_req_ready_in; - end else begin - req_select[0] = 1; - req_ready = cache_req_ready_in; - end - end - - assign core_req_if.ready = req_ready; - - // - // select response - // - - wire [2:0][RSP_DATAW-1:0] rsp_data_in; - wire [2:0] rsp_valid_in; - wire [2:0] rsp_ready_in; - - wire core_rsp_valid; - wire [`NUM_THREADS-1:0] core_rsp_tmask; - - assign rsp_data_in[0] = {cache_rsp_if.valid, cache_rsp_if.data, cache_rsp_if.tag}; - assign rsp_data_in[1] = {smem_rsp_if.valid, smem_rsp_if.data, smem_rsp_if.tag}; - assign rsp_data_in[2] = {io_rsp_if.valid, io_rsp_if.data, io_rsp_if.tag}; - - assign rsp_valid_in[0] = (| cache_rsp_if.valid); - assign rsp_valid_in[1] = (| smem_rsp_if.valid) & `SM_ENABLE; - assign rsp_valid_in[2] = (| io_rsp_if.valid); - - VX_stream_arbiter #( - .NUM_REQS (3), - .DATAW (RSP_DATAW), - .BUFFERED (1) - ) rsp_arb ( - .clk (clk), - .reset (reset), - .valid_in (rsp_valid_in), - .data_in (rsp_data_in), - .ready_in (rsp_ready_in), - .valid_out (core_rsp_valid), - .data_out ({core_rsp_tmask, core_rsp_if.data, core_rsp_if.tag}), - .ready_out (core_rsp_if.ready) - ); - - assign cache_rsp_if.ready = rsp_ready_in[0]; - assign smem_rsp_if.ready = rsp_ready_in[1]; - assign io_rsp_if.ready = rsp_ready_in[2]; - - assign core_rsp_if.valid = core_rsp_tmask & {`NUM_THREADS{core_rsp_valid}}; - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index 2604bce1..712f0da2 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -39,7 +39,7 @@ module VX_fpu_unit #( wire fpuq_push = fpu_req_if.valid && fpu_req_if.ready; wire fpuq_pop = valid_out && ready_out; - VX_cam_buffer #( + VX_index_buffer #( .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .SIZE (`FPUQ_SIZE), .FASTRAM (1) diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index ecacd1e2..f41872cd 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -72,7 +72,8 @@ module VX_lsu_unit #( reg [`LSUQ_SIZE-1:0][`DCORE_TAG_WIDTH-1:0] pending_tags; `IGNORE_WARNINGS_END - wire stall_in; + wire ready_in; + wire stall_in = ~ready_in & req_valid; VX_generic_register #( .N(1 + `NW_BITS + `NUM_THREADS + 32 + 1 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 2 + (`NUM_THREADS * (30 + 2 + 4 + 32))), @@ -86,79 +87,98 @@ module VX_lsu_unit #( .data_out ({req_valid, req_wid, req_tmask, req_pc, req_rw, req_rd, req_wb, req_address, req_sext, req_addr, req_offset, req_byteen, req_data}) ); + // Can accept new request? + assign lsu_req_if.ready = ~stall_in; + wire [`NW_BITS-1:0] rsp_wid; wire [31:0] rsp_pc; wire [`NR_BITS-1:0] rsp_rd; wire rsp_wb; wire [`NUM_THREADS-1:0][1:0] rsp_offset; wire [1:0] rsp_sext; + reg [`NUM_THREADS-1:0][31:0] rsp_data; + reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; + reg [`NUM_THREADS-1:0] req_sent_mask, rsp_rem_mask_n; + wire req_sent_all; - reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] mem_rsp_mask; + wire [`DCORE_TAG_ID_BITS-1:0] mbuf_waddr, mbuf_raddr; + wire mbuf_full; - wire [`DCORE_TAG_ID_BITS-1:0] req_tag, rsp_tag; - wire lsuq_full; + wire mbuf_push = (| dcache_req_if.valid) && (| dcache_req_if.ready) + && (0 == req_sent_mask) // first submission only + && (0 == req_rw); // loads only - wire lsuq_push = (| dcache_req_if.valid) && dcache_req_if.ready - && (0 == req_rw); // loads only + wire mbuf_pop_part = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; - wire lsuq_pop_part = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; + wire mbuf_pop = mbuf_pop_part && (0 == rsp_rem_mask_n); - assign rsp_tag = dcache_rsp_if.tag[0][`DCORE_TAG_ID_BITS-1:0]; + assign mbuf_raddr = dcache_rsp_if.tag[0][`DCORE_TAG_ID_BITS-1:0]; - wire [`NUM_THREADS-1:0] mem_rsp_mask_n = mem_rsp_mask[rsp_tag] & ~dcache_rsp_if.valid; - - wire lsuq_pop = lsuq_pop_part && (0 == mem_rsp_mask_n); - - VX_cam_buffer #( + VX_index_buffer #( .DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2), .SIZE (`LSUQ_SIZE), .FASTRAM (1) ) req_metadata_buf ( .clk (clk), .reset (reset), - .write_addr (req_tag), - .acquire_slot (lsuq_push), - .read_addr (rsp_tag), + .write_addr (mbuf_waddr), + .acquire_slot (mbuf_push), + .read_addr (mbuf_raddr), .write_data ({req_wid, req_pc, req_rd, req_wb, req_offset, req_sext}), .read_data ({rsp_wid, rsp_pc, rsp_rd, rsp_wb, rsp_offset, rsp_sext}), - .release_addr (rsp_tag), - .release_slot (lsuq_pop), - .full (lsuq_full) + .release_addr (mbuf_raddr), + .release_slot (mbuf_pop), + .full (mbuf_full) ); + assign req_sent_all = ((dcache_req_if.ready | req_sent_mask) & req_tmask) == req_tmask; always @(posedge clk) begin - if (lsuq_push) begin - mem_rsp_mask[req_tag] <= req_tmask; - pending_tags[req_tag] <= dcache_req_if.tag; + if (reset) begin + req_sent_mask <= 0; + end else begin + if (req_sent_all) + req_sent_mask <= 0; + else + req_sent_mask <= req_sent_mask | (dcache_req_if.valid & dcache_req_if.ready); + end + end + + // need to hold the acquired tag index until the full request is submitted + reg [`DCORE_TAG_ID_BITS-1:0] req_tag_hold; + wire [`DCORE_TAG_ID_BITS-1:0] req_tag = (0 == req_sent_mask) ? mbuf_waddr : req_tag_hold; + always @(posedge clk) begin + if (mbuf_push) + req_tag_hold <= mbuf_waddr; + end + + assign rsp_rem_mask_n = rsp_rem_mask[mbuf_raddr] & ~dcache_rsp_if.valid; + always @(posedge clk) begin + if (mbuf_push) begin + rsp_rem_mask[mbuf_waddr] <= req_tmask; + pending_tags[mbuf_waddr] <= dcache_req_if.tag[0]; end - if (lsuq_pop_part) begin - mem_rsp_mask[rsp_tag] <= mem_rsp_mask_n; + if (mbuf_pop_part) begin + rsp_rem_mask[mbuf_raddr] <= rsp_rem_mask_n; end end - wire load_req_stall = req_valid && !req_rw && lsuq_full; - wire store_req_stall = req_valid && req_rw && !st_commit_if.ready; + wire req_ready_dep = (!req_rw && !mbuf_full) || (req_rw && st_commit_if.ready); // Core Request - assign dcache_req_if.valid = {`NUM_THREADS{req_valid && !load_req_stall && !store_req_stall}} & req_tmask; - assign dcache_req_if.rw = req_rw; + assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask & ~req_sent_mask; + assign dcache_req_if.rw = {`NUM_THREADS{req_rw}}; assign dcache_req_if.byteen = req_byteen; assign dcache_req_if.addr = req_addr; assign dcache_req_if.data = req_data; `ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag = {req_pc, req_rd, req_wid, req_tag}; + assign dcache_req_if.tag = {`NUM_THREADS{{req_pc, req_rd, req_wid, req_tag}}}; `else - assign dcache_req_if.tag = req_tag; + assign dcache_req_if.tag = {`NUM_THREADS{req_tag}}; `endif - - assign stall_in = ~dcache_req_if.ready - || load_req_stall - || store_req_stall; - - // Can accept new request? - assign lsu_req_if.ready = ~stall_in; + + assign ready_in = req_ready_dep && req_sent_all; // Core Response for (genvar i = 0; i < `NUM_THREADS; i++) begin @@ -174,7 +194,7 @@ module VX_lsu_unit #( // send store commit - wire is_store_rsp = req_valid && req_rw && dcache_req_if.ready; + wire is_store_rsp = req_valid && req_rw && req_sent_all; assign st_commit_if.valid = is_store_rsp; assign st_commit_if.wid = req_wid; @@ -206,7 +226,7 @@ module VX_lsu_unit #( assign dcache_rsp_if.ready = ~load_rsp_stall; // scope registration - `SCOPE_ASSIGN (dcache_req_fire, dcache_req_if.valid & {`NUM_THREADS{dcache_req_if.ready}}); + `SCOPE_ASSIGN (dcache_req_fire, dcache_req_if.valid & dcache_req_if.ready); `SCOPE_ASSIGN (dcache_req_wid, req_wid); `SCOPE_ASSIGN (dcache_req_pc, req_pc); `SCOPE_ASSIGN (dcache_req_addr, req_address); @@ -216,23 +236,23 @@ module VX_lsu_unit #( `SCOPE_ASSIGN (dcache_req_tag, req_tag); `SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.valid & {`NUM_THREADS{dcache_rsp_if.ready}}); `SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data); - `SCOPE_ASSIGN (dcache_rsp_tag, rsp_tag); + `SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr); `ifdef DBG_PRINT_CORE_DCACHE always @(posedge clk) begin - if ((| dcache_req_if.valid) && dcache_req_if.ready) begin - if (dcache_req_if.rw) + if ((| dcache_req_if.valid) && (|dcache_req_if.ready)) begin + if (dcache_req_if.rw[0]) $display("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, data=%0h", - $time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data); + $time, CORE_ID, req_wid, req_pc, (dcache_req_if.valid & dcache_req_if.ready), req_address, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data); else $display("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, rd=%0d", - $time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, dcache_req_if.byteen, req_rd); + $time, CORE_ID, req_wid, req_pc, (dcache_req_if.valid & dcache_req_if.ready), req_address, dcache_req_if.tag, dcache_req_if.byteen, req_rd); end if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin $display("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h", $time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd, dcache_rsp_if.data); end - if (lsuq_full) begin + if (mbuf_full) begin $write("%t: D$%0d queue-full:", $time, CORE_ID); for (integer j = 0; j < `LSUQ_SIZE; j++) begin $write(" tag%0d=%0h", j, pending_tags[j]); diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index f4faa31a..d500b6bc 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -26,11 +26,7 @@ module VX_mem_unit # ( // DRAM VX_cache_dram_req_if dram_req_if, - VX_cache_dram_rsp_if dram_rsp_if, - - // I/O - VX_cache_core_req_if io_req_if, - VX_cache_core_rsp_if io_rsp_if + VX_cache_dram_rsp_if dram_rsp_if ); `ifdef PERF_ENABLE @@ -76,19 +72,17 @@ module VX_mem_unit # ( .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS) ) smem_rsp_if(); - VX_dcache_arb dcache_arb ( - .clk (clk), - .reset (reset), + VX_databus_arb databus_arb ( + .clk (clk), + .reset (reset), - .core_req_if (core_dcache_req_if), - .cache_req_if (dcache_req_if), - .smem_req_if (smem_req_if), - .io_req_if (io_req_if), + .core_req_if (core_dcache_req_if), + .cache_req_if (dcache_req_if), + .smem_req_if (smem_req_if), - .cache_rsp_if (dcache_rsp_if), - .smem_rsp_if (smem_rsp_if), - .io_rsp_if (io_rsp_if), - .core_rsp_if (core_dcache_rsp_if) + .cache_rsp_if (dcache_rsp_if), + .smem_rsp_if (smem_rsp_if), + .core_rsp_if (core_dcache_rsp_if) ); VX_cache #( diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index b0b9cb7c..a1651aeb 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -31,7 +31,7 @@ module VX_mul_unit #( wire mulq_push = mul_req_if.valid && mul_req_if.ready; wire mulq_pop = valid_out && ready_out; - VX_cam_buffer #( + VX_index_buffer #( .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .SIZE (`MULQ_SIZE), .FASTRAM (1) diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index b7072e7c..8d61f42e 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -11,12 +11,12 @@ module VX_pipeline #( // Dcache core request output wire [`NUM_THREADS-1:0] dcache_req_valid, - output wire dcache_req_rw, + output wire [`NUM_THREADS-1:0] dcache_req_rw, output wire [`NUM_THREADS-1:0][3:0] dcache_req_byteen, output wire [`NUM_THREADS-1:0][29:0] dcache_req_addr, output wire [`NUM_THREADS-1:0][31:0] dcache_req_data, - output wire [`DCORE_TAG_WIDTH-1:0] dcache_req_tag, - input wire dcache_req_ready, + output wire [`NUM_THREADS-1:0][`DCORE_TAG_WIDTH-1:0] dcache_req_tag, + input wire [`NUM_THREADS-1:0] dcache_req_ready, // Dcache core reponse input wire [`NUM_THREADS-1:0] dcache_rsp_valid, diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index dbac8308..92a33ef9 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -32,24 +32,9 @@ module Vortex ( // Snoop response output wire snp_rsp_valid, output wire [`VX_SNP_TAG_WIDTH-1:0] snp_rsp_tag, - input wire snp_rsp_ready, + input wire snp_rsp_ready, - // I/O request - output wire [`NUM_THREADS-1:0] io_req_valid, - output wire io_req_rw, - output wire [`NUM_THREADS-1:0][3:0] io_req_byteen, - output wire [`NUM_THREADS-1:0][29:0] io_req_addr, - output wire [`NUM_THREADS-1:0][31:0] io_req_data, - output wire [`VX_CORE_TAG_WIDTH-1:0] io_req_tag, - input wire io_req_ready, - - // I/O response - input wire io_rsp_valid, - input wire [31:0] io_rsp_data, - input wire [`VX_CORE_TAG_WIDTH-1:0] io_rsp_tag, - output wire io_rsp_ready, - - // CSR I/O Request + // CSR Request input wire csr_io_req_valid, input wire [`VX_CSR_ID_WIDTH-1:0] csr_io_req_coreid, input wire [11:0] csr_io_req_addr, @@ -57,7 +42,7 @@ module Vortex ( input wire [31:0] csr_io_req_data, output wire csr_io_req_ready, - // CSR I/O Response + // CSR Response output wire csr_io_rsp_valid, output wire [31:0] csr_io_rsp_data, input wire csr_io_rsp_ready, @@ -89,19 +74,6 @@ module Vortex ( wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] per_cluster_snp_rsp_tag; wire [`NUM_CLUSTERS-1:0] per_cluster_snp_rsp_ready; - wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0] per_cluster_io_req_valid; - wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_rw; - wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0][3:0] per_cluster_io_req_byteen; - wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0][29:0] per_cluster_io_req_addr; - wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0][31:0] per_cluster_io_req_data; - wire [`NUM_CLUSTERS-1:0][`L2CORE_TAG_WIDTH-1:0] per_cluster_io_req_tag; - wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_ready; - - wire [`NUM_CLUSTERS-1:0] per_cluster_io_rsp_valid; - wire [`NUM_CLUSTERS-1:0][`L2CORE_TAG_WIDTH-1:0] per_cluster_io_rsp_tag; - wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_io_rsp_data; - wire [`NUM_CLUSTERS-1:0] per_cluster_io_rsp_ready; - wire [`NUM_CLUSTERS-1:0] per_cluster_csr_io_req_valid; wire [`NUM_CLUSTERS-1:0][11:0] per_cluster_csr_io_req_addr; wire [`NUM_CLUSTERS-1:0] per_cluster_csr_io_req_rw; @@ -150,19 +122,6 @@ module Vortex ( .snp_rsp_tag (per_cluster_snp_rsp_tag [i]), .snp_rsp_ready (per_cluster_snp_rsp_ready [i]), - .io_req_valid (per_cluster_io_req_valid [i]), - .io_req_rw (per_cluster_io_req_rw [i]), - .io_req_byteen (per_cluster_io_req_byteen [i]), - .io_req_addr (per_cluster_io_req_addr [i]), - .io_req_data (per_cluster_io_req_data [i]), - .io_req_tag (per_cluster_io_req_tag [i]), - .io_req_ready (per_cluster_io_req_ready [i]), - - .io_rsp_valid (per_cluster_io_rsp_valid [i]), - .io_rsp_data (per_cluster_io_rsp_data [i]), - .io_rsp_tag (per_cluster_io_rsp_tag [i]), - .io_rsp_ready (per_cluster_io_rsp_ready [i]), - .csr_io_req_valid (per_cluster_csr_io_req_valid[i]), .csr_io_req_coreid (csr_io_core_id), .csr_io_req_rw (per_cluster_csr_io_req_rw [i]), @@ -179,48 +138,6 @@ module Vortex ( ); end - VX_databus_arb #( - .NUM_REQS (`NUM_CLUSTERS), - .WORD_SIZE (4), - .TAG_IN_WIDTH (`L2CORE_TAG_WIDTH), - .TAG_OUT_WIDTH (`L3CORE_TAG_WIDTH), - .BUFFERED_REQ (1), - .BUFFERED_RSP (`NUM_CLUSTERS >= 4) - ) io_arb ( - .clk (clk), - .reset (reset), - - // input requests - .req_valid_in (per_cluster_io_req_valid), - .req_rw_in (per_cluster_io_req_rw), - .req_byteen_in (per_cluster_io_req_byteen), - .req_addr_in (per_cluster_io_req_addr), - .req_data_in (per_cluster_io_req_data), - .req_tag_in (per_cluster_io_req_tag), - .req_ready_in (per_cluster_io_req_ready), - - // output request - .req_valid_out (io_req_valid), - .req_rw_out (io_req_rw), - .req_byteen_out (io_req_byteen), - .req_addr_out (io_req_addr), - .req_data_out (io_req_data), - .req_tag_out (io_req_tag), - .req_ready_out (io_req_ready), - - // input response - .rsp_valid_in (io_rsp_valid), - .rsp_tag_in (io_rsp_tag), - .rsp_data_in (io_rsp_data), - .rsp_ready_in (io_rsp_ready), - - // output responses - .rsp_valid_out (per_cluster_io_rsp_valid), - .rsp_data_out (per_cluster_io_rsp_data), - .rsp_tag_out (per_cluster_io_rsp_tag), - .rsp_ready_out (per_cluster_io_rsp_ready) - ); - VX_csr_io_arb #( .NUM_REQS (`NUM_CLUSTERS), .DATA_WIDTH (32), diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index 474a3cf3..46e7be0e 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -999,21 +999,6 @@ Vortex #() vortex ( .snp_rsp_tag (vx_snp_rsp_tag), .snp_rsp_ready (vx_snp_rsp_ready), - // I/O request - `UNUSED_PIN (io_req_valid), - `UNUSED_PIN (io_req_rw), - `UNUSED_PIN (io_req_byteen), - `UNUSED_PIN (io_req_addr), - `UNUSED_PIN (io_req_data), - `UNUSED_PIN (io_req_tag), - .io_req_ready (1'b1), - - // I/O response - .io_rsp_valid (1'b0), - .io_rsp_data (0), - .io_rsp_tag (0), - `UNUSED_PIN (io_rsp_ready), - // CSR I/O Request .csr_io_req_valid (vx_csr_io_req_valid), .csr_io_req_coreid(vx_csr_io_req_coreid), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 0eec3696..6a2a4d9c 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -55,13 +55,14 @@ module VX_bank #( input wire reset, // Core Request - input wire [NUM_REQS-1:0] core_req_valid, - input wire [`CORE_REQ_TAG_COUNT-1:0] core_req_rw, - input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, - input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, - input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data, - input wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, - output wire core_req_ready, + input wire core_req_valid, + input wire [`REQS_BITS-1:0] core_req_tid, + input wire core_req_rw, + input wire [WORD_SIZE-1:0] core_req_byteen, + input wire [`WORD_ADDR_WIDTH-1:0] core_req_addr, + input wire [`WORD_WIDTH-1:0] core_req_data, + input wire [CORE_TAG_WIDTH-1:0] core_req_tag, + output wire core_req_ready, // Core Response output wire core_rsp_valid, @@ -229,37 +230,21 @@ module VX_bank #( wire creq_push = (| core_req_valid) && core_req_ready; assign core_req_ready = !creq_full; - VX_bank_core_req_queue #( - .WORD_SIZE (WORD_SIZE), - .NUM_REQS (NUM_REQS), - .CREQ_SIZE (CREQ_SIZE), - .CORE_TAG_WIDTH (CORE_TAG_WIDTH), - .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS) + VX_generic_queue #( + .DATAW (CORE_TAG_WIDTH + `REQS_BITS + 1 + WORD_SIZE + `WORD_ADDR_WIDTH + `WORD_WIDTH), + .SIZE (CREQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) ) core_req_queue ( - .clk (clk), - .reset (reset), - - // Enqueue - .push (creq_push), - .tag_in (core_req_tag), - .valids_in (core_req_valid), - .rw_in (core_req_rw), - .byteen_in (core_req_byteen), - .addr_in (core_req_addr), - .wdata_in (core_req_data), - - // Dequeue - .pop (creq_pop), - .tag_out (creq_tag_st0), - .tid_out (creq_tid_st0), - .rw_out (creq_rw_st0), - .byteen_out (creq_byteen_st0), - .addr_out (creq_addr_st0), - .wdata_out (creq_writeword_st0), - - // States - .empty (creq_empty), - .full (creq_full) + .clk (clk), + .reset (reset), + .push (creq_push), + .pop (creq_pop), + .data_in ({core_req_tag, core_req_tid, core_req_rw, core_req_byteen, core_req_addr, core_req_data}), + .data_out({creq_tag_st0, creq_tid_st0, creq_rw_st0, creq_byteen_st0, creq_addr_st0, creq_writeword_st0}), + .empty (creq_empty), + .full (creq_full), + `UNUSED_PIN (size) ); reg [$clog2(MSHR_SIZE+1)-1:0] mshr_pending_size; diff --git a/hw/rtl/cache/VX_bank_core_req_queue.v b/hw/rtl/cache/VX_bank_core_req_queue.v deleted file mode 100644 index ba1b458f..00000000 --- a/hw/rtl/cache/VX_bank_core_req_queue.v +++ /dev/null @@ -1,215 +0,0 @@ -`include "VX_cache_config.vh" - -module VX_bank_core_req_queue #( - // Size of a word in bytes - parameter WORD_SIZE = 1, - // Number of Word requests per cycle - parameter NUM_REQS = 1, - // Core Request Queue Size - parameter CREQ_SIZE = 1, - // core request tag size - parameter CORE_TAG_WIDTH = 1, - // size of tag id in core request tag - parameter CORE_TAG_ID_BITS = 0 -) ( - input wire clk, - input wire reset, - - // Enqueue - input wire push, - input wire [NUM_REQS-1:0] valids_in, - input wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] tag_in, - input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] addr_in, - input wire [`CORE_REQ_TAG_COUNT-1:0] rw_in, - input wire [NUM_REQS-1:0][WORD_SIZE-1:0] byteen_in, - input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] wdata_in, - - // Dequeue - input wire pop, - output wire [CORE_TAG_WIDTH-1:0] tag_out, - output wire [`WORD_ADDR_WIDTH-1:0] addr_out, - output wire rw_out, - output wire [WORD_SIZE-1:0] byteen_out, - output wire [`WORD_WIDTH-1:0] wdata_out, - output wire [`REQS_BITS-1:0] tid_out, - - // States - output wire empty, - output wire full -); - - wire [NUM_REQS-1:0] q_valids; - wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] q_tag; - wire [`CORE_REQ_TAG_COUNT-1:0] q_rw; - wire [NUM_REQS-1:0][WORD_SIZE-1:0] q_byteen; - wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] q_addr; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] q_wdata; - wire q_push; - wire q_pop; - wire q_empty; - wire q_full; - - always @(*) begin - assert(!push || (| valids_in)); - assert(!push || !full); - assert(!pop || !empty); - end - - VX_generic_queue #( - .DATAW ($bits(valids_in) + $bits(tag_in) + $bits(addr_in) + $bits(rw_in) + $bits(byteen_in) + $bits(wdata_in)), - .SIZE (CREQ_SIZE), - .BUFFERED (1), - .FASTRAM (1) - ) req_queue ( - .clk (clk), - .reset (reset), - .push (q_push), - .pop (q_pop), - .data_in ({valids_in, tag_in, addr_in, rw_in, byteen_in, wdata_in}), - .data_out ({q_valids, q_tag, q_addr, q_rw, q_byteen, q_wdata}), - .empty (q_empty), - .full (q_full), - `UNUSED_PIN (size) - ); - - if (NUM_REQS > 1) begin - - reg [`REQS_BITS-1:0] sel_idx, sel_idx_r; - reg [CORE_TAG_WIDTH-1:0] sel_tag, sel_tag_r; - reg [`WORD_ADDR_WIDTH-1:0] sel_addr, sel_addr_r; - reg sel_rw, sel_rw_r; - reg [WORD_SIZE-1:0] sel_byteen, sel_byteen_r; - reg [`WORD_WIDTH-1:0] sel_wdata, sel_wdata_r; - - reg [$clog2(NUM_REQS+1)-1:0] q_valids_cnt_r; - wire [$clog2(NUM_REQS+1)-1:0] q_valids_cnt_n; - wire [$clog2(NUM_REQS+1)-1:0] q_valids_cnt; - - reg [NUM_REQS-1:0] pop_mask; - reg fast_track; - wire fast_track_n; - - reg req_eop; // request end of packet - reg empty_r; - - assign q_push = push; - assign q_pop = pop && req_eop; - - wire [NUM_REQS-1:0] requests = q_valids & ~pop_mask; - - always @(*) begin - sel_idx = 0; - sel_tag = 'x; - sel_addr = 'x; - sel_rw = 'x; - sel_byteen = 'x; - sel_wdata = 'x; - - for (integer i = 0; i < NUM_REQS; i++) begin - if (requests[i]) begin - sel_idx = `REQS_BITS'(i); - sel_addr = q_addr[i]; - if (0 == CORE_TAG_ID_BITS) begin - sel_tag = q_tag[i]; - sel_rw = q_rw[i]; - end - sel_byteen = q_byteen[i]; - sel_wdata = q_wdata[i]; - break; - end - end - end - - VX_countones #( - .N(NUM_REQS) - ) counter ( - .valids (q_valids), - .count (q_valids_cnt) - ); - - assign fast_track_n = (!q_empty && (empty_r || (pop && fast_track))) ? 0 : - pop ? (q_valids_cnt_r == 2) : - fast_track; - - assign q_valids_cnt_n = (!q_empty && (empty_r || (pop && fast_track))) ? q_valids_cnt : - pop ? (q_valids_cnt_r - 1) : - q_valids_cnt_r; - - always @(posedge clk) begin - if (reset) begin - pop_mask <= 0; - fast_track <= 0; - q_valids_cnt_r <= 0; - req_eop <= 0; - empty_r <= 1; - end else begin - if (!q_empty - && (empty_r || (pop && fast_track))) begin - pop_mask <= (NUM_REQS'(1) << sel_idx); - end else if (pop) begin - if (q_valids_cnt_r == 1 || q_valids_cnt_r == 2) begin - pop_mask <= 0; - end else begin - pop_mask[sel_idx] <= 1; - end - end - q_valids_cnt_r <= q_valids_cnt_n; - fast_track <= fast_track_n; - req_eop <= (q_valids_cnt_n == 1 || q_valids_cnt_n == 2) && !fast_track_n; - empty_r <= (0 == q_valids_cnt_n); - end - - if (empty_r || pop) begin - sel_idx_r <= sel_idx; - sel_byteen_r <= sel_byteen; - sel_addr_r <= sel_addr; - sel_wdata_r <= sel_wdata; - end - end - - if (CORE_TAG_ID_BITS != 0) begin - `UNUSED_VAR (sel_tag) - `UNUSED_VAR (sel_rw) - always @(posedge clk) begin - if (empty_r || pop) begin - sel_tag_r <= q_tag; - sel_rw_r <= q_rw; - end - end - end else begin - always @(posedge clk) begin - if (empty_r || pop) begin - sel_tag_r <= sel_tag; - sel_rw_r <= sel_rw; - end - end - end - - assign tag_out = sel_tag_r; - assign addr_out = sel_addr_r; - assign rw_out = sel_rw_r; - assign byteen_out = sel_byteen_r; - assign wdata_out = sel_wdata_r; - assign tid_out = sel_idx_r; - - assign full = q_full; - assign empty = empty_r; - - end else begin - `UNUSED_VAR (q_valids) - - assign q_push = push; - assign q_pop = pop; - - assign tag_out = q_tag; - assign addr_out = q_addr; - assign rw_out = q_rw; - assign byteen_out = q_byteen; - assign wdata_out = q_wdata; - assign tid_out = 0; - - assign empty = q_empty; - assign full = q_full; - end - -endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 7a7444e1..c3faec40 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -58,12 +58,12 @@ module VX_cache #( // Core request input wire [NUM_REQS-1:0] core_req_valid, - input wire [`CORE_REQ_TAG_COUNT-1:0] core_req_rw, + input wire [NUM_REQS-1:0] core_req_rw, input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data, - input wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, - output wire [`CORE_REQ_TAG_COUNT-1:0] core_req_ready, + input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, + output wire [NUM_REQS-1:0] core_req_ready, // Core response output wire [NUM_REQS-1:0] core_rsp_valid, @@ -108,8 +108,8 @@ module VX_cache #( `STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value")) - wire [NUM_BANKS-1:0][NUM_REQS-1:0] per_bank_valid; - + wire [NUM_BANKS-1:0] per_bank_core_req_valid; + wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid; wire [NUM_BANKS-1:0] per_bank_core_req_ready; wire [NUM_BANKS-1:0] per_bank_core_rsp_valid; @@ -155,14 +155,14 @@ module VX_cache #( .BANK_LINE_SIZE (BANK_LINE_SIZE), .NUM_BANKS (NUM_BANKS), .WORD_SIZE (WORD_SIZE), - .NUM_REQS (NUM_REQS), - .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS) + .NUM_REQS (NUM_REQS) ) cache_core_req_bank_sel ( - .core_req_valid (core_req_valid), - .core_req_addr (core_req_addr), - .core_req_ready (core_req_ready), - .per_bank_valid (per_bank_valid), - .per_bank_ready (per_bank_core_req_ready) + .core_req_valid (core_req_valid), + .core_req_addr (core_req_addr), + .core_req_ready (core_req_ready), + .per_bank_valid (per_bank_core_req_valid), + .per_bank_tid (per_bank_core_req_tid), + .per_bank_ready (per_bank_core_req_ready) ); assign dram_req_tag = dram_req_addr; @@ -173,51 +173,53 @@ module VX_cache #( end for (genvar i = 0; i < NUM_BANKS; i++) begin - wire [NUM_REQS-1:0] curr_bank_core_req_valid; - wire [`CORE_REQ_TAG_COUNT-1:0] curr_bank_core_req_rw; - wire [NUM_REQS-1:0][WORD_SIZE-1:0] curr_bank_core_req_byteen; - wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] curr_bank_core_req_addr; - wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] curr_bank_core_req_tag; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] curr_bank_core_req_data; - wire curr_bank_core_req_ready; + wire curr_bank_core_req_valid; + wire [`REQS_BITS-1:0] curr_bank_core_req_tid; + wire curr_bank_core_req_rw; + wire [WORD_SIZE-1:0] curr_bank_core_req_byteen; + wire [`WORD_ADDR_WIDTH-1:0] curr_bank_core_req_addr; + wire [CORE_TAG_WIDTH-1:0] curr_bank_core_req_tag; + wire [`WORD_WIDTH-1:0] curr_bank_core_req_data; + wire curr_bank_core_req_ready; - wire curr_bank_core_rsp_valid; - wire [`REQS_BITS-1:0] curr_bank_core_rsp_tid; - wire [`WORD_WIDTH-1:0] curr_bank_core_rsp_data; - wire [CORE_TAG_WIDTH-1:0] curr_bank_core_rsp_tag; - wire curr_bank_core_rsp_ready; + wire curr_bank_core_rsp_valid; + wire [`REQS_BITS-1:0] curr_bank_core_rsp_tid; + wire [`WORD_WIDTH-1:0] curr_bank_core_rsp_data; + wire [CORE_TAG_WIDTH-1:0] curr_bank_core_rsp_tag; + wire curr_bank_core_rsp_ready; - wire curr_bank_dram_req_valid; - wire curr_bank_dram_req_rw; - wire [BANK_LINE_SIZE-1:0] curr_bank_dram_req_byteen; - wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_req_addr; - wire[`BANK_LINE_WIDTH-1:0] curr_bank_dram_req_data; - wire curr_bank_dram_req_ready; + wire curr_bank_dram_req_valid; + wire curr_bank_dram_req_rw; + wire [BANK_LINE_SIZE-1:0] curr_bank_dram_req_byteen; + wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_req_addr; + wire[`BANK_LINE_WIDTH-1:0] curr_bank_dram_req_data; + wire curr_bank_dram_req_ready; - wire curr_bank_dram_rsp_valid; - wire [`BANK_LINE_WIDTH-1:0] curr_bank_dram_rsp_data; - wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_rsp_addr; - wire curr_bank_dram_rsp_ready; + wire curr_bank_dram_rsp_valid; + wire [`BANK_LINE_WIDTH-1:0] curr_bank_dram_rsp_data; + wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_rsp_addr; + wire curr_bank_dram_rsp_ready; - wire curr_bank_snp_req_valid; - wire [`LINE_ADDR_WIDTH-1:0] curr_bank_snp_req_addr; - wire curr_bank_snp_req_inv; - wire [SNP_TAG_WIDTH-1:0] curr_bank_snp_req_tag; - wire curr_bank_snp_req_ready; + wire curr_bank_snp_req_valid; + wire [`LINE_ADDR_WIDTH-1:0] curr_bank_snp_req_addr; + wire curr_bank_snp_req_inv; + wire [SNP_TAG_WIDTH-1:0] curr_bank_snp_req_tag; + wire curr_bank_snp_req_ready; - wire curr_bank_snp_rsp_valid; - wire [SNP_TAG_WIDTH-1:0] curr_bank_snp_rsp_tag; - wire curr_bank_snp_rsp_ready; + wire curr_bank_snp_rsp_valid; + wire [SNP_TAG_WIDTH-1:0] curr_bank_snp_rsp_tag; + wire curr_bank_snp_rsp_ready; - wire curr_bank_miss; + wire curr_bank_miss; // Core Req - assign curr_bank_core_req_valid = per_bank_valid[i]; - assign curr_bank_core_req_addr = core_req_addr; - assign curr_bank_core_req_rw = core_req_rw; - assign curr_bank_core_req_byteen = core_req_byteen; - assign curr_bank_core_req_data = core_req_data; - assign curr_bank_core_req_tag = core_req_tag; + assign curr_bank_core_req_valid = per_bank_core_req_valid[i]; + assign curr_bank_core_req_tid = per_bank_core_req_tid[i]; + assign curr_bank_core_req_addr = core_req_addr[per_bank_core_req_tid[i]]; + assign curr_bank_core_req_rw = core_req_rw[per_bank_core_req_tid[i]]; + assign curr_bank_core_req_byteen = core_req_byteen[per_bank_core_req_tid[i]]; + assign curr_bank_core_req_data = core_req_data[per_bank_core_req_tid[i]]; + assign curr_bank_core_req_tag = core_req_tag[per_bank_core_req_tid[i]]; assign per_bank_core_req_ready[i] = curr_bank_core_req_ready; // Core WB @@ -298,6 +300,7 @@ module VX_cache #( .reset (reset), // Core request .core_req_valid (curr_bank_core_req_valid), + .core_req_tid (curr_bank_core_req_tid), .core_req_rw (curr_bank_core_req_rw), .core_req_byteen (curr_bank_core_req_byteen), .core_req_addr (curr_bank_core_req_addr), diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index e7a2ce66..f3454d33 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -2,77 +2,64 @@ module VX_cache_core_req_bank_sel #( // Size of line inside a bank in bytes - parameter BANK_LINE_SIZE = 1, + parameter BANK_LINE_SIZE = 1, // Size of a word in bytes - parameter WORD_SIZE = 1, + parameter WORD_SIZE = 1, // Number of banks - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 1, // Number of Word requests per cycle - parameter NUM_REQS = 1, - // size of tag id in core request tag - parameter CORE_TAG_ID_BITS = 1 + parameter NUM_REQS = 1 ) ( input wire [NUM_REQS-1:0] core_req_valid, input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, - output wire [`CORE_REQ_TAG_COUNT-1:0] core_req_ready, - - output wire [NUM_BANKS-1:0][NUM_REQS-1:0] per_bank_valid, - input wire [NUM_BANKS-1:0] per_bank_ready + output wire [NUM_REQS-1:0] core_req_ready, + output wire [NUM_BANKS-1:0] per_bank_valid, + output wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid, + input wire [NUM_BANKS-1:0] per_bank_ready ); - if (NUM_BANKS > 1) begin + if (NUM_BANKS > 1) begin + reg [NUM_BANKS-1:0] per_bank_valid_r; + reg [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid_r; + reg [NUM_REQS-1:0] core_req_ready_r; + wire [NUM_REQS-1:0][`BANK_BITS-1:0] core_req_bid; - reg [NUM_BANKS-1:0][NUM_REQS-1:0] per_bank_valid_r; + for (genvar i = 0; i < NUM_REQS; ++i) begin + assign core_req_bid[i] = core_req_addr[i][`BANK_SELECT_ADDR_RNG]; + end always @(*) begin - per_bank_valid_r = 0; - for (integer i = 0; i < NUM_REQS; i++) begin - per_bank_valid_r[core_req_addr[i][`BANK_SELECT_ADDR_RNG]][i] = core_req_valid[i]; + per_bank_valid_r = 0; + per_bank_tid_r = 'x; + for (integer i = NUM_REQS-1; i >= 0; --i) begin + if (core_req_valid[i]) begin + per_bank_valid_r[core_req_bid[i]] = 1; + per_bank_tid_r[core_req_bid[i]] = `REQS_BITS'(i); + end end - end + end - if (CORE_TAG_ID_BITS != 0) begin - - reg [NUM_BANKS-1:0] per_bank_ready_other, per_bank_ready_ignore; - - always @(*) begin - per_bank_ready_other = {NUM_BANKS{1'b1}}; - per_bank_ready_ignore = {NUM_BANKS{1'b1}}; - - for (integer i = 0; i < NUM_REQS; i++) begin - per_bank_ready_ignore[core_req_addr[i][`BANK_SELECT_ADDR_RNG]] = 1'b0; - end - - for (integer i = 0; i < NUM_BANKS; i++) begin - for (integer j = 0; j < NUM_BANKS; j++) begin - if (i != j) begin - per_bank_ready_other[i] &= (per_bank_ready[j] | per_bank_ready_ignore[j]); - end + always @(*) begin + core_req_ready_r = 0; + for (integer j = 0; j < NUM_BANKS; ++j) begin + for (integer i = 0; i < NUM_REQS; ++i) begin + if (core_req_valid[i] && (core_req_bid[i] == `BANK_BITS'(j))) begin + core_req_ready_r[i] = per_bank_ready[j]; + break; end end - end - - for (genvar i = 0; i < NUM_BANKS; i++) begin - for (genvar j = 0; j < NUM_REQS; j++) begin - assign per_bank_valid[i][j] = per_bank_valid_r[i][j] && per_bank_ready_other[i]; - end end - - assign core_req_ready[0] = & (per_bank_ready | per_bank_ready_ignore); - - end else begin - - assign per_bank_valid = per_bank_valid_r; - - for (genvar i = 0; i < NUM_REQS; i++) begin - assign core_req_ready[i] = per_bank_ready[core_req_addr[i][`BANK_SELECT_ADDR_RNG]]; - end - end + assign per_bank_valid = per_bank_valid_r; + assign per_bank_tid = per_bank_tid_r; + assign core_req_ready = core_req_ready_r; + end else begin + `UNUSED_VAR (core_req_valid) `UNUSED_VAR (core_req_addr) assign per_bank_valid = core_req_valid; + assign per_bank_tid = 0; assign core_req_ready[0] = per_bank_ready; end diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index b670b3d3..41ed5282 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -106,7 +106,6 @@ module VX_cache_core_rsp_merge #( end for (genvar i = 0; i < NUM_REQS; i++) begin - assign stall[i] = ~core_rsp_ready[i] && core_rsp_valid[i]; VX_generic_register #( diff --git a/hw/rtl/cache/VX_miss_resrv.v b/hw/rtl/cache/VX_miss_resrv.v index 25556ec0..49dfca41 100644 --- a/hw/rtl/cache/VX_miss_resrv.v +++ b/hw/rtl/cache/VX_miss_resrv.v @@ -57,7 +57,7 @@ module VX_miss_resrv #( output wire [`MSHR_DATA_WIDTH-1:0] dequeue_data_st0, input wire dequeue_st3 ); - reg [`LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0]; + `USE_FAST_BRAM reg [`LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0]; reg [MSHR_SIZE-1:0] valid_table; reg [MSHR_SIZE-1:0] ready_table; diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index 1b273539..d0842a2a 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -68,7 +68,7 @@ module VX_snp_forwarder #( wire sfq_acquire = snp_req_valid && snp_req_ready; wire sfq_release = snp_rsp_valid_unqual && snp_rsp_ready_unqual; - VX_cam_buffer #( + VX_index_buffer #( .DATAW (SRC_ADDR_WIDTH + 1 + TAG_IN_WIDTH), .SIZE (SREQ_SIZE), .FASTRAM (1) diff --git a/hw/rtl/fp_cores/altera/acl_gen.log b/hw/rtl/fp_cores/altera/acl_gen.log index 3e694b39..eb65a009 100644 --- a/hw/rtl/fp_cores/altera/acl_gen.log +++ b/hw/rtl/fp_cores/altera/acl_gen.log @@ -5,15 +5,15 @@ Generation context: HardFP is enabled enabling set to true Faithful rounding constraint detected Will not generate valid and channel signals - The new component name is acl_fdiv + The new component name is acl_s10_fdiv Frequency 250MHz - Deployment FPGA Arria10 -Estimated resources LUTs 539, DSPs 5, RAMBits 32768, RAMBlocks 3 -The pipeline depth of the block is 15 cycle(s) + Deployment FPGA Stratix10 +Estimated resources LUTs 681, DSPs 5, RAMBits 32768, RAMBlocks 3 +The pipeline depth of the block is 25 cycle(s) @@start @name FPDiv@ -@latency 15@ -@LUT 539@ +@latency 25@ +@LUT 681@ @DSP 5@ @RAMBits 32768@ @RAMBlockUsage 3@ @@ -34,15 +34,15 @@ Generation context: HardFP is enabled enabling set to true Faithful rounding constraint detected Will not generate valid and channel signals - The new component name is acl_fsqrt + The new component name is acl_s10_fsqrt Frequency 250MHz - Deployment FPGA Arria10 -Estimated resources LUTs 271, DSPs 3, RAMBits 15872, RAMBlocks 3 -The pipeline depth of the block is 10 cycle(s) + Deployment FPGA Stratix10 +Estimated resources LUTs 349, DSPs 3, RAMBits 15872, RAMBlocks 3 +The pipeline depth of the block is 17 cycle(s) @@start @name FPSqrt@ -@latency 10@ -@LUT 271@ +@latency 17@ +@LUT 349@ @DSP 3@ @RAMBits 15872@ @RAMBlockUsage 3@ @@ -62,15 +62,15 @@ Generation context: HardFP is enabled enabling set to true Faithful rounding constraint detected Will not generate valid and channel signals - The new component name is acl_ftoi + The new component name is acl_s10_ftoi Frequency 250MHz - Deployment FPGA Arria10 -Estimated resources LUTs 327, DSPs 0, RAMBits 0, RAMBlocks 0 + Deployment FPGA Stratix10 +Estimated resources LUTs 344, DSPs 0, RAMBits 0, RAMBlocks 0 The pipeline depth of the block is 3 cycle(s) @@start @name FPToFXP@ @latency 3@ -@LUT 327@ +@LUT 344@ @DSP 0@ @RAMBits 0@ @RAMBlockUsage 0@ @@ -90,15 +90,15 @@ Generation context: HardFP is enabled enabling set to true Faithful rounding constraint detected Will not generate valid and channel signals - The new component name is acl_ftou + The new component name is acl_s10_ftou Frequency 250MHz - Deployment FPGA Arria10 -Estimated resources LUTs 287, DSPs 0, RAMBits 0, RAMBlocks 0 + Deployment FPGA Stratix10 +Estimated resources LUTs 272, DSPs 0, RAMBits 0, RAMBlocks 0 The pipeline depth of the block is 3 cycle(s) @@start @name FPToFXP@ @latency 3@ -@LUT 287@ +@LUT 272@ @DSP 0@ @RAMBits 0@ @RAMBlockUsage 0@ @@ -118,15 +118,15 @@ Generation context: HardFP is enabled enabling set to true Faithful rounding constraint detected Will not generate valid and channel signals - The new component name is acl_itof + The new component name is acl_s10_itof Frequency 250MHz - Deployment FPGA Arria10 -Estimated resources LUTs 397, DSPs 0, RAMBits 0, RAMBlocks 0 + Deployment FPGA Stratix10 +Estimated resources LUTs 362, DSPs 0, RAMBits 0, RAMBlocks 0 The pipeline depth of the block is 7 cycle(s) @@start @name FXPToFP@ @latency 7@ -@LUT 397@ +@LUT 362@ @DSP 0@ @RAMBits 0@ @RAMBlockUsage 0@ @@ -146,15 +146,15 @@ Generation context: HardFP is enabled enabling set to true Faithful rounding constraint detected Will not generate valid and channel signals - The new component name is acl_utof + The new component name is acl_s10_utof Frequency 300MHz - Deployment FPGA Arria10 -Estimated resources LUTs 363, DSPs 0, RAMBits 0, RAMBlocks 0 + Deployment FPGA Stratix10 +Estimated resources LUTs 310, DSPs 0, RAMBits 0, RAMBlocks 0 The pipeline depth of the block is 7 cycle(s) @@start @name FXPToFP@ @latency 7@ -@LUT 363@ +@LUT 310@ @DSP 0@ @RAMBits 0@ @RAMBlockUsage 0@ diff --git a/hw/rtl/fp_cores/altera/acl_gen.sh b/hw/rtl/fp_cores/altera/acl_gen.sh index f26058eb..463b1f31 100755 --- a/hw/rtl/fp_cores/altera/acl_gen.sh +++ b/hw/rtl/fp_cores/altera/acl_gen.sh @@ -2,7 +2,7 @@ CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64 -OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding -noChanValid -enable -speedgrade 2" +OPTIONS="-target Stratix10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding -noChanValid -enable -speedgrade 2" export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH:$LD_LIBRARY_PATH @@ -14,12 +14,12 @@ FBITS="f$(($EXP_BITS + $MAN_BITS + 1))" echo Generating IP cores for $FBITS { - $CMD -name acl_fdiv -frequency 250 FPDiv $EXP_BITS $MAN_BITS 0 - $CMD -name acl_fsqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS - $CMD -name acl_ftoi -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1 - $CMD -name acl_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0 - $CMD -name acl_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS - $CMD -name acl_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS + $CMD -name acl_s10_fdiv -frequency 250 FPDiv $EXP_BITS $MAN_BITS 0 + $CMD -name acl_s10_fsqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS + $CMD -name acl_s10_ftoi -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1 + $CMD -name acl_s10_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0 + $CMD -name acl_s10_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS + $CMD -name acl_s10_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS } > acl_gen.log 2>&1 #cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv . \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_cache_core_req_if.v b/hw/rtl/interfaces/VX_cache_core_req_if.v index cb9df392..8b01eef3 100644 --- a/hw/rtl/interfaces/VX_cache_core_req_if.v +++ b/hw/rtl/interfaces/VX_cache_core_req_if.v @@ -10,13 +10,13 @@ interface VX_cache_core_req_if #( parameter CORE_TAG_ID_BITS = 0 ) (); - wire [NUM_REQS-1:0] valid; - wire [`CORE_REQ_TAG_COUNT-1:0] rw; - wire [NUM_REQS-1:0][WORD_SIZE-1:0] byteen; - wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] addr; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] data; - wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] tag; - wire ready; + wire [NUM_REQS-1:0] valid; + wire [NUM_REQS-1:0] rw; + wire [NUM_REQS-1:0][WORD_SIZE-1:0] byteen; + wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] addr; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] data; + wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] tag; + wire [NUM_REQS-1:0] ready; endinterface diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_index_buffer.v similarity index 88% rename from hw/rtl/libs/VX_cam_buffer.v rename to hw/rtl/libs/VX_index_buffer.v index 7f44ca85..2a28d061 100644 --- a/hw/rtl/libs/VX_cam_buffer.v +++ b/hw/rtl/libs/VX_index_buffer.v @@ -1,6 +1,6 @@ `include "VX_platform.vh" -module VX_cam_buffer #( +module VX_index_buffer #( parameter DATAW = 1, parameter SIZE = 1, parameter FASTRAM = 0, @@ -48,16 +48,18 @@ module VX_cam_buffer #( always @(posedge clk) begin if (reset) begin - free_slots <= {SIZE{1'b1}}; - full_r <= 1'b0; write_addr_r <= ADDRW'(1'b0); + free_slots <= {SIZE{1'b1}}; + full_r <= 1'b0; end else begin if (release_slot) begin assert(0 == free_slots[release_addr]) else $error("%t: releasing invalid slot at port %d", $time, release_addr); end - free_slots <= free_slots_n; - write_addr_r <= free_index; - full_r <= ~free_valid; + if (acquire_slot || full_r) begin + write_addr_r <= free_index; + end + free_slots <= free_slots_n; + full_r <= ~free_valid; end end diff --git a/hw/syn/quartus/top8/Makefile b/hw/syn/quartus/top8/Makefile index 373e8b74..1adeec6a 100644 --- a/hw/syn/quartus/top8/Makefile +++ b/hw/syn/quartus/top8/Makefile @@ -6,10 +6,12 @@ RTL_INCLUDE = $(FPU_INCLUDE);../../../rtl;../../../rtl/libs;../../../rtl/interfa PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Part, Family -FAMILY = "Arria 10" -#DEVICE = 1SX280HN2F43E2VG -DEVICE = 10AX115N3F40E2SG +FAMILY = "Stratix 10" +DEVICE = 1SX280HN2F43E2VG + +#FAMILY = "Arria 10" +#DEVICE = 10AX115N3F40E2SG # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on --set=VERILOG_MACRO=NOPAE=1