From 2a27bfbfd5ec1f80972b7b7e720c22c630e68168 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 23 Aug 2021 01:59:22 -0700 Subject: [PATCH 01/16] LKG Build (reset network update -fmax=236 mhz 4c) --- hw/rtl/VX_cluster.v | 4 +++- hw/rtl/VX_issue.v | 10 +++++++--- hw/rtl/VX_mem_unit.v | 10 ++++++---- hw/rtl/VX_platform.vh | 1 + hw/rtl/VX_warp_sched.v | 2 +- hw/rtl/Vortex.v | 4 +++- hw/rtl/afu/vortex_afu.sv | 18 +++++++++++++----- hw/rtl/cache/VX_cache.v | 22 +++++++++++++++------- hw/rtl/libs/VX_onehot_encoder.v | 2 +- hw/rtl/libs/VX_reset_relay.v | 33 +++++++++++++++++++-------------- hw/syn/opae/vortex_afu.qsf | 12 ++++++------ 11 files changed, 75 insertions(+), 43 deletions(-) diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 56c70793..2414828e 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -143,6 +143,8 @@ module VX_cluster #( end else begin + `RESET_RELAY (mem_arb_reset); + VX_mem_arb #( .NUM_REQS (`NUM_CORES), .DATA_WIDTH (`L2MEM_DATA_WIDTH), @@ -153,7 +155,7 @@ module VX_cluster #( .BUFFERED_RSP (1) ) mem_arb ( .clk (clk), - .reset (reset), + .reset (mem_arb_reset), // Core request .req_valid_in (per_core_mem_req_valid), diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 5dc41aea..8b00ddd9 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -30,11 +30,15 @@ module VX_issue #( wire scoreboard_delay; + `RESET_RELAY (ibuf_reset); + `RESET_RELAY (gpr_reset); + `RESET_RELAY (demux_reset); + VX_ibuffer #( .CORE_ID(CORE_ID) ) ibuffer ( .clk (clk), - .reset (reset), + .reset (ibuf_reset), .decode_if (decode_if), .ibuffer_if (ibuffer_if) ); @@ -58,7 +62,7 @@ module VX_issue #( .CORE_ID(CORE_ID) ) gpr_stage ( .clk (clk), - .reset (reset), + .reset (gpr_reset), .writeback_if (writeback_if), .gpr_req_if (gpr_req_if), .gpr_rsp_if (gpr_rsp_if) @@ -80,7 +84,7 @@ module VX_issue #( VX_instr_demux instr_demux ( .clk (clk), - .reset (reset), + .reset (demux_reset), .ibuffer_if (execute_if), .gpr_rsp_if (gpr_rsp_if), .alu_req_if 
(alu_req_if), diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index c68e051a..12623e3d 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -65,6 +65,7 @@ module VX_mem_unit # ( `RESET_RELAY (icache_reset); `RESET_RELAY (dcache_reset); + `RESET_RELAY (mem_arb_reset); VX_cache #( .CACHE_ID (`ICACHE_ID), @@ -197,6 +198,9 @@ module VX_mem_unit # ( .TAG_WIDTH (`DCORE_TAG_WIDTH-`SM_ENABLE) ) smem_rsp_if(); + `RESET_RELAY (smem_arb_reset); + `RESET_RELAY (smem_reset); + VX_smem_arb #( .NUM_REQS (2), .LANES (`NUM_THREADS), @@ -207,7 +211,7 @@ module VX_mem_unit # ( .BUFFERED_RSP (1) ) smem_arb ( .clk (clk), - .reset (reset), + .reset (smem_arb_reset), // input request .req_valid_in (dcache_req_if.valid), @@ -242,8 +246,6 @@ module VX_mem_unit # ( .rsp_ready_out (dcache_rsp_if.ready) ); - `RESET_RELAY (smem_reset); - VX_shared_mem #( .CACHE_ID (`SCACHE_ID), .CACHE_SIZE (`SMEM_SIZE), @@ -312,7 +314,7 @@ module VX_mem_unit # ( .BUFFERED_RSP (2) ) mem_arb ( .clk (clk), - .reset (reset), + .reset (mem_arb_reset), // Source request .req_valid_in ({dcache_mem_req_if.valid, icache_mem_req_if.valid}), diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 76b57c6b..14f26054 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -74,6 +74,7 @@ `define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) `define DISABLE_BRAM (* ramstyle = "logic" *) +`define PRESERVE_REG (* preserve *) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 26529498..89673310 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -202,7 +202,7 @@ module VX_warp_sched #( VX_priority_encoder #( .N (`NUM_WARPS) - ) rr_arbiter ( + ) pri_enc ( .data_in (ready_warps), .index (schedule_wid), .valid_out (schedule_valid), diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v 
index d92c6cfd..4d871e0e 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -141,6 +141,8 @@ module Vortex ( end else begin + `RESET_RELAY (mem_arb_reset); + VX_mem_arb #( .NUM_REQS (`NUM_CLUSTERS), .DATA_WIDTH (`L3MEM_DATA_WIDTH), @@ -150,7 +152,7 @@ module Vortex ( .BUFFERED_RSP (1) ) mem_arb ( .clk (clk), - .reset (reset), + .reset (mem_arb_reset), // Core request .req_valid_in (per_cluster_mem_req_valid), diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index bfa276af..70208392 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -512,6 +512,8 @@ t_local_mem_data mem_rsp_data; wire [AVS_REQ_TAGW:0] mem_rsp_tag; wire mem_rsp_ready; +`RESET_RELAY (mem_arb_reset); + VX_mem_arb #( .NUM_REQS (2), .DATA_WIDTH (LMEM_DATA_WIDTH), @@ -522,7 +524,7 @@ VX_mem_arb #( .TYPE ("X") ) mem_arb ( .clk (clk), - .reset (reset), + .reset (mem_arb_reset), // Source request .req_valid_in ({vx_mem_req_arb_valid, cci_mem_req_arb_valid}), @@ -557,6 +559,8 @@ VX_mem_arb #( //-- +`RESET_RELAY (avs_wrapper_reset); + VX_avs_wrapper #( .AVS_DATA_WIDTH (LMEM_DATA_WIDTH), .AVS_ADDR_WIDTH (LMEM_ADDR_WIDTH), @@ -566,7 +570,7 @@ VX_avs_wrapper #( .RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE) ) avs_wrapper ( .clk (clk), - .reset (reset), + .reset (avs_wrapper_reset), // Memory request .mem_req_valid (mem_req_valid), @@ -724,13 +728,15 @@ always @(posedge clk) begin end end +`RESET_RELAY (cci_rdq_reset); + VX_fifo_queue #( .DATAW (CCI_RD_QUEUE_DATAW), .SIZE (CCI_RD_QUEUE_SIZE), .OUTPUT_REG (1) ) cci_rd_req_queue ( .clk (clk), - .reset (reset), + .reset (cci_rdq_reset), .push (cci_rdq_push), .pop (cci_rdq_pop), .data_in (cci_rdq_din), @@ -878,7 +884,7 @@ Vortex #() vortex ( `SCOPE_BIND_afu_vortex .clk (clk), - .reset (reset | vx_reset), + .reset (reset || vx_reset), // Memory request .mem_req_valid (vx_mem_req_valid), @@ -997,6 +1003,8 @@ VX_fifo_queue #( wire scope_changed = `SCOPE_TRIGGER; +`RESET_RELAY (scope_reset); + VX_scope #( .DATAW 
($bits({`SCOPE_DATA_LIST,`SCOPE_UPDATE_LIST})), .BUSW (64), @@ -1004,7 +1012,7 @@ VX_scope #( .UPDW ($bits({`SCOPE_UPDATE_LIST})) ) scope ( .clk (clk), - .reset (reset), + .reset (scope_reset), .start (1'b0), .stop (1'b0), .changed (scope_changed), diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index bbe1e10a..eab2004e 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -134,7 +134,7 @@ module VX_cache #( wire mem_rsp_valid_nc; wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc; wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_nc; - wire mem_rsp_ready_nc; + wire mem_rsp_ready_nc; if (NC_ENABLE) begin VX_nc_bypass #( @@ -151,8 +151,8 @@ module VX_cache #( .MEM_TAG_IN_WIDTH (MEM_TAG_IN_WIDTH), .MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH) ) nc_bypass ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), // Core request in .core_req_valid_in (core_req_valid), @@ -251,6 +251,8 @@ module VX_cache #( wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_qual; wire mrsq_out_valid, mrsq_out_ready; + + `RESET_RELAY (mrsq_reset); VX_elastic_buffer #( .DATAW (MEM_TAG_IN_WIDTH + `CACHE_LINE_WIDTH), @@ -258,7 +260,7 @@ module VX_cache #( .OUTPUT_REG (MRSQ_SIZE > 2) ) mem_rsp_queue ( .clk (clk), - .reset (reset), + .reset (mrsq_reset), .ready_in (mem_rsp_ready_nc), .valid_in (mem_rsp_valid_nc), .data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}), @@ -274,13 +276,15 @@ module VX_cache #( wire [`LINE_SELECT_BITS-1:0] flush_addr; wire flush_enable; + `RESET_RELAY (flush_reset); + VX_flush_ctrl #( .CACHE_SIZE (CACHE_SIZE), .CACHE_LINE_SIZE (CACHE_LINE_SIZE), .NUM_BANKS (NUM_BANKS) ) flush_ctrl ( .clk (clk), - .reset (reset), + .reset (flush_reset), .addr_out (flush_addr), .valid_out (flush_enable) ); @@ -435,6 +439,8 @@ module VX_cache #( assign curr_bank_mem_rsp_id = `MEM_TAG_TO_REQ_ID(mem_rsp_tag_qual); assign curr_bank_mem_rsp_data = mem_rsp_data_qual; assign per_bank_mem_rsp_ready[i] = curr_bank_mem_rsp_ready; + + `RESET_RELAY (bank_reset); VX_bank #( .BANK_ID (i), @@ -457,7 
+463,7 @@ module VX_cache #( `SCOPE_BIND_VX_cache_bank(i) .clk (clk), - .reset (reset), + .reset (bank_reset), `ifdef PERF_ENABLE .perf_read_misses (perf_read_miss_per_bank[i]), @@ -539,13 +545,15 @@ module VX_cache #( wire [MSHR_ADDR_WIDTH-1:0] mem_req_id; + `RESET_RELAY (mreq_reset); + VX_stream_arbiter #( .NUM_REQS (NUM_BANKS), .DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH), .BUFFERED (1) ) mem_req_arb ( .clk (clk), - .reset (reset), + .reset (mreq_reset), .valid_in (per_bank_mem_req_valid), .data_in (data_in), .ready_in (per_bank_mem_req_ready), diff --git a/hw/rtl/libs/VX_onehot_encoder.v b/hw/rtl/libs/VX_onehot_encoder.v index e1362078..8b85acb6 100644 --- a/hw/rtl/libs/VX_onehot_encoder.v +++ b/hw/rtl/libs/VX_onehot_encoder.v @@ -1,7 +1,7 @@ `include "VX_platform.vh" // Fast encoder using parallel prefix computation -// Adapter from BaseJump STL: http://bjump.org/data_out.html +// Adapted from BaseJump STL: http://bjump.org/data_out.html `TRACING_OFF module VX_onehot_encoder #( diff --git a/hw/rtl/libs/VX_reset_relay.v b/hw/rtl/libs/VX_reset_relay.v index facca82b..94b24250 100644 --- a/hw/rtl/libs/VX_reset_relay.v +++ b/hw/rtl/libs/VX_reset_relay.v @@ -1,26 +1,31 @@ `include "VX_platform.vh" -`TRACING_OFF module VX_reset_relay #( - parameter ASYNC = 0 + parameter N = 1, + parameter DEPTH = 1 ) ( input wire clk, input wire reset, - output wire reset_o + output wire [N-1:0] reset_o ); - (* preserve *) reg reset_r; - if (ASYNC) begin - always @(posedge clk or posedge reset) begin - reset_r <= reset; - end - end else begin + if (DEPTH > 1) begin + `PRESERVE_REG `DISABLE_BRAM reg [N-1:0] reset_r [DEPTH-1:0]; always @(posedge clk) begin - reset_r <= reset; + for (integer i = DEPTH-1; i > 0; --i) + reset_r[i] <= reset_r[i-1]; + reset_r[0] <= {N{reset}}; + end + assign reset_o = reset_r[DEPTH-1]; + end else if (DEPTH == 1) begin + `PRESERVE_REG reg [N-1:0] reset_r; + always @(posedge clk) begin + reset_r <= {N{reset}}; end + 
assign reset_o = reset_r; + end else begin + `UNUSED_VAR (clk) + assign reset_o = {N{reset}}; end - - assign reset_o = reset_r; -endmodule -`TRACING_ON \ No newline at end of file +endmodule \ No newline at end of file diff --git a/hw/syn/opae/vortex_afu.qsf b/hw/syn/opae/vortex_afu.qsf index 0c7e4cbd..1628f9d8 100644 --- a/hw/syn/opae/vortex_afu.qsf +++ b/hw/syn/opae/vortex_afu.qsf @@ -23,12 +23,12 @@ set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON -set_global_assignment -name USE_HIGH_SPEED_ADDER ON -set_global_assignment -name MUX_RESTRUCTURE ON -set_global_assignment -name ADV_NETLIST_OPT_SYNTH_WYSIWYG_REMAP ON -set_global_assignment -name PROGRAMMABLE_POWER_TECHNOLOGY_SETTING "FORCE ALL TILES WITH FAILING TIMING PATHS TO HIGH SPEED" -set_global_assignment -name PHYSICAL_SYNTHESIS_COMBO_LOGIC ON -set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_RETIMING ON +#set_global_assignment -name USE_HIGH_SPEED_ADDER ON +#set_global_assignment -name MUX_RESTRUCTURE ON +#set_global_assignment -name ADV_NETLIST_OPT_SYNTH_WYSIWYG_REMAP ON +#set_global_assignment -name PROGRAMMABLE_POWER_TECHNOLOGY_SETTING "FORCE ALL TILES WITH FAILING TIMING PATHS TO HIGH SPEED" +#set_global_assignment -name PHYSICAL_SYNTHESIS_COMBO_LOGIC ON +#set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_RETIMING ON set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 From e494860f3840ece97b9f0fd90a75831fe5d66eab Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 26 Aug 2021 07:29:47 -0700 Subject: [PATCH 02/16] using lzc instead of priority_encoder --- hw/rtl/VX_instr_demux.v | 11 +++++------ hw/rtl/VX_warp_sched.v | 18 ++++++++---------- hw/rtl/cache/VX_miss_resrv.v | 26 ++++++++++++-------------- hw/rtl/libs/VX_index_buffer.v | 11 +++++------ 4 files changed, 30 insertions(+), 36 
deletions(-) diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 1fd79f37..e497bfcb 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -26,13 +26,12 @@ module VX_instr_demux ( `endif wire gpu_req_ready; - VX_priority_encoder #( - .N (`NUM_THREADS) + VX_lzc #( + .WIDTH (`NUM_THREADS) ) tid_select ( - .data_in (ibuffer_if.tmask), - .index (tid), - `UNUSED_PIN (onehot), - `UNUSED_PIN (valid_out) + .in_i (ibuffer_if.tmask), + .cnt_o (tid), + `UNUSED_PIN (valid_o) ); wire [31:0] next_PC = ibuffer_if.PC + 4; diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 89673310..f6bff82f 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -30,7 +30,7 @@ module VX_warp_sched #( reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks; - reg [`NUM_WARPS-1:0][31:0] warp_pcs, warp_next_pcs; + reg [`NUM_WARPS-1:0][31:0] warp_pcs; // barriers reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks; // warps waiting on barrier @@ -121,12 +121,11 @@ module VX_warp_sched #( end if (ifetch_req_fire) begin - warp_next_pcs[ifetch_req_if.wid] <= ifetch_req_if.PC + 4; + warp_pcs[ifetch_req_if.wid] <= ifetch_req_if.PC + 4; end if (wstall_if.valid) begin stalled_warps[wstall_if.wid] <= wstall_if.stalled; - warp_pcs[wstall_if.wid] <= warp_next_pcs[wstall_if.wid]; end // join handling @@ -200,13 +199,12 @@ module VX_warp_sched #( wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls); - VX_priority_encoder #( - .N (`NUM_WARPS) - ) pri_enc ( - .data_in (ready_warps), - .index (schedule_wid), - .valid_out (schedule_valid), - `UNUSED_PIN (onehot) + VX_lzc #( + .WIDTH (`NUM_WARPS) + ) wid_select ( + .in_i (ready_warps), + .cnt_o (schedule_wid), + .valid_o (schedule_valid) ); wire [`NUM_WARPS-1:0][(`NUM_THREADS + 32)-1:0] schedule_data; diff --git a/hw/rtl/cache/VX_miss_resrv.v b/hw/rtl/cache/VX_miss_resrv.v index 
1fab4a63..46802a70 100644 --- a/hw/rtl/cache/VX_miss_resrv.v +++ b/hw/rtl/cache/VX_miss_resrv.v @@ -102,22 +102,20 @@ module VX_miss_resrv #( end end - VX_priority_encoder #( - .N (MSHR_SIZE) - ) dequeue_pe ( - .data_in (valid_table_x & ready_table_x), - .index (dequeue_id_x), - .valid_out (dequeue_val_x), - `UNUSED_PIN (onehot) + VX_lzc #( + .WIDTH (MSHR_SIZE) + ) dequeue_sel ( + .in_i (valid_table_x & ready_table_x), + .cnt_o (dequeue_id_x), + .valid_o (dequeue_val_x) ); - VX_priority_encoder #( - .N (MSHR_SIZE) - ) allocate_pe ( - .data_in (~valid_table_n), - .index (allocate_id_n), - .valid_out (allocate_rdy_n), - `UNUSED_PIN (onehot) + VX_lzc #( + .WIDTH (MSHR_SIZE) + ) allocate_sel ( + .in_i (~valid_table_n), + .cnt_o (allocate_id_n), + .valid_o (allocate_rdy_n) ); always @(*) begin diff --git a/hw/rtl/libs/VX_index_buffer.v b/hw/rtl/libs/VX_index_buffer.v index 8a9e7607..689485a5 100644 --- a/hw/rtl/libs/VX_index_buffer.v +++ b/hw/rtl/libs/VX_index_buffer.v @@ -29,13 +29,12 @@ module VX_index_buffer #( wire free_valid; wire [ADDRW-1:0] free_index; - VX_priority_encoder #( - .N (SIZE) + VX_lzc #( + .WIDTH (SIZE) ) free_slots_encoder ( - .data_in (free_slots_n), - .index (free_index), - `UNUSED_PIN (onehot), - .valid_out (free_valid) + .in_i (free_slots_n), + .cnt_o (free_index), + .valid_o (free_valid) ); always @(*) begin From 06a6857508a355eb22b7132964b5ed4b69a92113 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 26 Aug 2021 08:05:54 -0700 Subject: [PATCH 03/16] using lzc instead of priority encoder --- hw/rtl/cache/VX_nc_bypass.v | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/hw/rtl/cache/VX_nc_bypass.v b/hw/rtl/cache/VX_nc_bypass.v index a1cf3156..93ef5847 100644 --- a/hw/rtl/cache/VX_nc_bypass.v +++ b/hw/rtl/cache/VX_nc_bypass.v @@ -99,7 +99,6 @@ module VX_nc_bypass #( // core request handling wire [NUM_REQS-1:0] core_req_valid_in_nc; - wire [NUM_REQS-1:0] core_req_nc_sel; wire [NUM_REQS-1:0] 
core_req_nc_tids; wire [`UP(CORE_REQ_TIDW)-1:0] core_req_nc_tid; wire core_req_nc_valid; @@ -110,13 +109,12 @@ module VX_nc_bypass #( assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_tids; - VX_priority_encoder #( - .N (NUM_REQS) + VX_lzc #( + .WIDTH (NUM_REQS) ) core_req_sel ( - .data_in (core_req_valid_in_nc), - .index (core_req_nc_tid), - .onehot (core_req_nc_sel), - .valid_out (core_req_nc_valid) + .in_i (core_req_valid_in_nc), + .cnt_o (core_req_nc_tid), + .valid_o (core_req_nc_valid) ); assign core_req_valid_out = core_req_valid_in & ~core_req_nc_tids; @@ -139,10 +137,9 @@ module VX_nc_bypass #( if (NUM_REQS > 1) begin for (genvar i = 0; i < NUM_REQS; ++i) begin assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? - (~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i]) : core_req_ready_out[i]; + (~mem_req_valid_in && mem_req_ready_out && (core_req_nc_tid == i)) : core_req_ready_out[i]; end end else begin - `UNUSED_VAR (core_req_nc_sel) assign core_req_ready_in = core_req_valid_in_nc ? (~mem_req_valid_in && mem_req_ready_out) : core_req_ready_out; end @@ -176,14 +173,7 @@ module VX_nc_bypass #( assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]}; end - VX_onehot_mux #( - .DATAW (MUX_DATAW), - .N (NUM_REQS) - ) core_req_nc_mux ( - .data_in (core_req_nc_mux_in), - .sel_in (core_req_nc_sel), - .data_out ({core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel}) - ); + assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = core_req_nc_mux_in[core_req_nc_tid]; assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel; assign mem_req_addr_out = mem_req_valid_in ? 
mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH]; From d91d56d126637f80d53dbb5ccf8b82266b6e527a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 26 Aug 2021 08:19:44 -0700 Subject: [PATCH 04/16] block ram refactoring (multi-porting supporting and simulation support) --- hw/rtl/VX_gpr_ram_f.v | 37 ----- hw/rtl/VX_gpr_ram_i.v | 34 ---- hw/rtl/VX_gpr_stage.v | 62 ++++---- hw/rtl/VX_icache_stage.v | 19 +-- hw/rtl/VX_ipdom_stack.v | 20 +-- hw/rtl/cache/VX_data_access.v | 14 +- hw/rtl/cache/VX_miss_resrv.v | 20 +-- hw/rtl/cache/VX_shared_mem.v | 32 ++-- hw/rtl/cache/VX_tag_access.v | 15 +- hw/rtl/libs/VX_dp_ram.v | 274 ++++++++++++++++++++++---------- hw/rtl/libs/VX_elastic_buffer.v | 4 +- hw/rtl/libs/VX_fifo_queue.v | 32 ++-- hw/rtl/libs/VX_index_buffer.v | 28 ++-- hw/rtl/libs/VX_sp_ram.v | 242 ++++++++++++++++++---------- 14 files changed, 480 insertions(+), 353 deletions(-) delete mode 100644 hw/rtl/VX_gpr_ram_f.v delete mode 100644 hw/rtl/VX_gpr_ram_i.v diff --git a/hw/rtl/VX_gpr_ram_f.v b/hw/rtl/VX_gpr_ram_f.v deleted file mode 100644 index 68c2a69f..00000000 --- a/hw/rtl/VX_gpr_ram_f.v +++ /dev/null @@ -1,37 +0,0 @@ -`include "VX_define.vh" - -`TRACING_OFF - -module VX_gpr_ram_f #( - parameter DATAW = 1, - parameter DEPTH = 1, - parameter ADDRW = $clog2(DEPTH) -) ( - input wire clk, - input wire wren, - input wire [ADDRW-1:0] waddr, - input wire [DATAW-1:0] wdata, - input wire [ADDRW-1:0] raddr1, - input wire [ADDRW-1:0] raddr2, - input wire [ADDRW-1:0] raddr3, - output wire [DATAW-1:0] rdata1, - output wire [DATAW-1:0] rdata2, - output wire [DATAW-1:0] rdata3 -); - reg [DATAW-1:0] mem [DEPTH-1:0]; - - initial mem = '{default: 0}; - - always @(posedge clk) begin - if (wren) begin - mem [waddr] <= wdata; - end - end - - assign rdata1 = mem [raddr1]; - assign rdata2 = mem [raddr2]; - assign rdata3 = mem [raddr3]; - -endmodule - -`TRACING_ON \ No newline at end of file diff --git a/hw/rtl/VX_gpr_ram_i.v b/hw/rtl/VX_gpr_ram_i.v deleted file mode 100644 
index 6c96b871..00000000 --- a/hw/rtl/VX_gpr_ram_i.v +++ /dev/null @@ -1,34 +0,0 @@ -`include "VX_define.vh" - -`TRACING_OFF - -module VX_gpr_ram_i #( - parameter DATAW = 1, - parameter DEPTH = 1, - parameter ADDRW = $clog2(DEPTH) -) ( - input wire clk, - input wire wren, - input wire [ADDRW-1:0] waddr, - input wire [DATAW-1:0] wdata, - input wire [ADDRW-1:0] raddr1, - input wire [ADDRW-1:0] raddr2, - output wire [DATAW-1:0] rdata1, - output wire [DATAW-1:0] rdata2 -); - reg [DATAW-1:0] mem [DEPTH-1:0]; - - initial mem = '{default: 0}; - - always @(posedge clk) begin - if (wren) begin - mem [waddr] <= wdata; - end - end - - assign rdata1 = mem [raddr1]; - assign rdata2 = mem [raddr2]; - -endmodule - -`TRACING_ON \ No newline at end of file diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 9367dd75..56f484bc 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -21,9 +21,9 @@ module VX_gpr_stage #( wire write_enable = writeback_if.valid && (writeback_if.rd != 0); `ifdef EXT_F_ENABLE - localparam RAM_DEPTH = `NUM_WARPS * `NUM_REGS; + localparam RAM_SIZE = `NUM_WARPS * `NUM_REGS; wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2, rdata3; - wire [$clog2(RAM_DEPTH)-1:0] waddr, raddr1, raddr2, raddr3; + wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2, raddr3; assign waddr = {writeback_if.wid, writeback_if.rd}; assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1}; @@ -31,20 +31,20 @@ module VX_gpr_stage #( assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3}; for (genvar i = 0; i < `NUM_THREADS; i++) begin - VX_gpr_ram_f #( - .DATAW (32), - .DEPTH (RAM_DEPTH) - ) gpr_ram_f ( - .clk (clk), - .wren (write_enable && writeback_if.tmask[i]), - .waddr (waddr), - .wdata (writeback_if.data[i]), - .raddr1 (raddr1), - .raddr2 (raddr2), - .raddr3 (raddr3), - .rdata1 (rdata1[i]), - .rdata2 (rdata2[i]), - .rdata3 (rdata3[i]) + VX_dp_ram #( + .RD_PORTS (3), + .DATAW (32), + .SIZE (RAM_SIZE), + .INIT_ENABLE (1), + .INIT_VALUE (0) + ) dp_ram ( + .clk (clk), + .wren 
(write_enable && writeback_if.tmask[i]), + .waddr (waddr), + .wdata (writeback_if.data[i]), + .rden (3'b111), + .raddr ({raddr3, raddr2, raddr1}), + .rdata ({rdata3[i], rdata2[i], rdata1[i]}) ); end @@ -52,9 +52,9 @@ module VX_gpr_stage #( assign gpr_rsp_if.rs2_data = rdata2; assign gpr_rsp_if.rs3_data = rdata3; `else - localparam RAM_DEPTH = `NUM_WARPS * `NUM_REGS; + localparam RAM_SIZE = `NUM_WARPS * `NUM_REGS; wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2; - wire [$clog2(RAM_DEPTH)-1:0] waddr, raddr1, raddr2; + wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2; assign waddr = {writeback_if.wid, writeback_if.rd}; assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1}; @@ -62,18 +62,20 @@ module VX_gpr_stage #( `UNUSED_VAR (gpr_req_if.rs3) for (genvar i = 0; i < `NUM_THREADS; i++) begin - VX_gpr_ram_i #( - .DATAW (32), - .DEPTH (RAM_DEPTH) - ) gpr_ram_i ( - .clk (clk), - .wren (write_enable && writeback_if.tmask[i]), - .waddr (waddr), - .wdata (writeback_if.data[i]), - .raddr1 (raddr1), - .raddr2 (raddr2), - .rdata1 (rdata1[i]), - .rdata2 (rdata2[i]) + VX_dp_ram #( + .RD_PORTS (2), + .DATAW (32), + .SIZE (RAM_SIZE), + .INIT_ENABLE (1), + .INIT_VALUE (0) + ) dp_ram ( + .clk (clk), + .wren (write_enable && writeback_if.tmask[i]), + .waddr (waddr), + .wdata (writeback_if.data[i]), + .rden (2'b11), + .raddr ({raddr2, raddr1}), + .rdata ({rdata2[i], rdata1[i]}) ); end diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 1e198b11..1d48bf3e 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -33,16 +33,17 @@ module VX_icache_stage #( wire [`NUM_THREADS-1:0] rsp_tmask; VX_dp_ram #( - .DATAW(32 + `NUM_THREADS), - .SIZE(`NUM_WARPS), - .FASTRAM(1) + .DATAW (32 + `NUM_THREADS), + .SIZE (`NUM_WARPS), + .LUTRAM (1) ) req_metadata ( - .clk(clk), - .waddr(req_tag), - .raddr(rsp_tag), - .wren(icache_req_fire), - .din({ifetch_req_if.PC, ifetch_req_if.tmask}), - .dout({rsp_PC, rsp_tmask}) + .clk (clk), + .wren (icache_req_fire), + .waddr (req_tag), + 
.wdata ({ifetch_req_if.PC, ifetch_req_if.tmask}), + .rden (1'b1), + .raddr (rsp_tag), + .rdata ({rsp_PC, rsp_tmask}) ); `RUNTIME_ASSERT((!ifetch_req_if.valid || ifetch_req_if.PC >= `STARTUP_ADDR), diff --git a/hw/rtl/VX_ipdom_stack.v b/hw/rtl/VX_ipdom_stack.v index 6b026279..357f7c18 100644 --- a/hw/rtl/VX_ipdom_stack.v +++ b/hw/rtl/VX_ipdom_stack.v @@ -38,17 +38,17 @@ module VX_ipdom_stack #( end VX_dp_ram #( - .DATAW(WIDTH * 2), - .SIZE(DEPTH), - .RWCHECK(1), - .FASTRAM(1) + .DATAW (WIDTH * 2), + .SIZE (DEPTH), + .LUTRAM (1) ) store ( - .clk(clk), - .waddr(wr_ptr), - .raddr(rd_ptr), - .wren(push), - .din({q2, q1}), - .dout({d2, d1}) + .clk (clk), + .wren (push), + .waddr (wr_ptr), + .wdata ({q2, q1}), + .rden (1'b1), + .raddr (rd_ptr), + .rdata ({d2, d1}) ); always @(posedge clk) begin diff --git a/hw/rtl/cache/VX_data_access.v b/hw/rtl/cache/VX_data_access.v index e8942bdd..f3700e5f 100644 --- a/hw/rtl/cache/VX_data_access.v +++ b/hw/rtl/cache/VX_data_access.v @@ -65,14 +65,14 @@ module VX_data_access #( VX_sp_ram #( .DATAW (CACHE_LINE_SIZE * 8), .SIZE (`LINES_PER_BANK), - .BYTEENW (BYTEENW), - .RWCHECK (1) + .BYTEENW (BYTEENW) ) data_store ( - .clk(clk), - .addr(line_addr), - .wren({BYTEENW{writeen}} & byte_enable), - .din(wdata), - .dout(rdata) + .clk (clk), + .addr (line_addr), + .wren ({BYTEENW{writeen}} & byte_enable), + .wdata (wdata), + .rden (1'b1), + .rdata (rdata) ); `UNUSED_VAR (stall) diff --git a/hw/rtl/cache/VX_miss_resrv.v b/hw/rtl/cache/VX_miss_resrv.v index 46802a70..b7a60580 100644 --- a/hw/rtl/cache/VX_miss_resrv.v +++ b/hw/rtl/cache/VX_miss_resrv.v @@ -169,17 +169,17 @@ module VX_miss_resrv #( `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id)) VX_dp_ram #( - .DATAW (`MSHR_DATA_WIDTH), - .SIZE (MSHR_SIZE), - .RWCHECK (1), - .FASTRAM (1) + .DATAW (`MSHR_DATA_WIDTH), + .SIZE (MSHR_SIZE), + .LUTRAM (1) ) entries ( - .clk (clk), - .waddr (allocate_id_r), - .raddr (dequeue_id_r), - .wren (allocate_valid), - .din (allocate_data), - .dout 
(dequeue_data) + .clk (clk), + .waddr (allocate_id_r), + .raddr (dequeue_id_r), + .wren (allocate_valid), + .wdata (allocate_data), + .rden (1'b1), + .rdata (dequeue_data) ); assign allocate_ready = allocate_rdy_r; diff --git a/hw/rtl/cache/VX_shared_mem.v b/hw/rtl/cache/VX_shared_mem.v index 2f475595..71a49b3c 100644 --- a/hw/rtl/cache/VX_shared_mem.v +++ b/hw/rtl/cache/VX_shared_mem.v @@ -173,14 +173,14 @@ module VX_shared_mem #( VX_sp_ram #( .DATAW (`WORD_WIDTH), .SIZE (`LINES_PER_BANK), - .BYTEENW (WORD_SIZE), - .RWCHECK (1) + .BYTEENW (WORD_SIZE) ) data_store ( - .clk (clk), - .addr (per_bank_core_req_addr[i]), - .wren ({WORD_SIZE{wren}} & per_bank_core_req_byteen[i]), - .din (per_bank_core_req_data[i]), - .dout (per_bank_core_rsp_data[i]) + .clk (clk), + .addr (per_bank_core_req_addr[i]), + .wren ({WORD_SIZE{wren}} & per_bank_core_req_byteen[i]), + .wdata (per_bank_core_req_data[i]), + .rden (1'b1), + .rdata (per_bank_core_rsp_data[i]) ); end @@ -216,18 +216,19 @@ module VX_shared_mem #( reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in; reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_in; - always @(*) begin - core_rsp_valids_in = 0; - core_rsp_data_in = 'x; - core_rsp_tag_in = 'x; - bank_rsp_sel_cur = 0; - + always @(*) begin + core_rsp_tag_in = 'x; for (integer i = NUM_BANKS-1; i >= 0; --i) begin if (per_bank_req_reads[i] && ~bank_rsp_sel_prv[i]) begin core_rsp_tag_in = per_bank_core_req_tag[i]; end end + end + always @(*) begin + core_rsp_valids_in = 0; + core_rsp_data_in = 'x; + bank_rsp_sel_cur = 0; for (integer i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_req_valid[i] && (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin @@ -278,13 +279,16 @@ module VX_shared_mem #( reg [CORE_TAG_WIDTH-1:0] core_req_tag_sel; `IGNORE_UNUSED_END - always @(*) begin + always @(*) begin core_req_tag_sel ='x; for (integer i = NUM_BANKS-1; i >= 0; --i) begin if (per_bank_core_req_valid[i]) begin core_req_tag_sel = 
per_bank_core_req_tag[i]; end end + end + + always @(*) begin is_multi_tag_req = 0; for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid[i] diff --git a/hw/rtl/cache/VX_tag_access.v b/hw/rtl/cache/VX_tag_access.v index 627c6570..0c51de01 100644 --- a/hw/rtl/cache/VX_tag_access.v +++ b/hw/rtl/cache/VX_tag_access.v @@ -48,14 +48,15 @@ module VX_tag_access #( VX_sp_ram #( .DATAW(`TAG_SELECT_BITS + 1), .SIZE(`LINES_PER_BANK), - .INITZERO(1), - .RWCHECK(1) + .INIT_ENABLE(1), + .INIT_VALUE(0) ) tag_store ( - .clk(clk), - .addr(line_addr), - .wren(fill), - .din({!is_flush, line_tag}), - .dout({read_valid, read_tag}) + .clk( clk), + .addr (line_addr), + .wren (fill), + .wdata ({!is_flush, line_tag}), + .rden (1'b1), + .rdata ({read_valid, read_tag}) ); assign tag_match = read_valid && (line_tag == read_tag); diff --git a/hw/rtl/libs/VX_dp_ram.v b/hw/rtl/libs/VX_dp_ram.v index 5914a462..e1d6defa 100644 --- a/hw/rtl/libs/VX_dp_ram.v +++ b/hw/rtl/libs/VX_dp_ram.v @@ -2,178 +2,286 @@ `TRACING_OFF module VX_dp_ram #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter BYTEENW = 1, - parameter OUTPUT_REG = 0, - parameter RWCHECK = 1, - parameter ADDRW = $clog2(SIZE), - parameter FASTRAM = 0, - parameter INITZERO = 0 + parameter RD_PORTS = 1, + parameter DATAW = 1, + parameter SIZE = 1, + parameter BYTEENW = 1, + parameter OUTPUT_REG = 0, + parameter NO_RWCHECK = 0, + parameter ADDRW = $clog2(SIZE), + parameter LUTRAM = 0, + parameter INIT_ENABLE = 0, + parameter INIT_FILE = "", + parameter [DATAW-1:0] INIT_VALUE = 0 ) ( - input wire clk, - input wire [ADDRW-1:0] waddr, - input wire [ADDRW-1:0] raddr, - input wire [BYTEENW-1:0] wren, - input wire [DATAW-1:0] din, - output wire [DATAW-1:0] dout + input wire clk, + input wire [BYTEENW-1:0] wren, + input wire [ADDRW-1:0] waddr, + input wire [DATAW-1:0] wdata, + input wire [RD_PORTS-1:0] rden, + input wire [RD_PORTS-1:0][ADDRW-1:0] raddr, + output wire [RD_PORTS-1:0][DATAW-1:0] rdata ); `STATIC_ASSERT((1 == 
BYTEENW) || ((BYTEENW > 1) && 0 == (BYTEENW % 4)), ("invalid parameter")) + `STATIC_ASSERT(!LUTRAM || (RD_PORTS == 1), ("multi-porting not supported on LUTRAM")) - if (FASTRAM) begin + +`define RAM_INITIALIZATION \ + if (INIT_ENABLE) begin \ + if (INIT_FILE != "") begin \ + initial $readmemh(INIT_FILE, ram); \ + end else begin \ + initial ram = '{default: INIT_VALUE}; \ + end \ + end + +`ifdef SYNTHESIS + if (LUTRAM) begin if (OUTPUT_REG) begin - reg [DATAW-1:0] dout_r; - + reg [DATAW-1:0] rdata_r; if (BYTEENW > 1) begin - `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[waddr][i] <= din[i * 8 +: 8]; + ram[waddr][i] <= wdata[i * 8 +: 8]; end - dout_r <= mem[raddr]; + if (rden) + rdata_r <= ram[raddr]; end end else begin - `USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[waddr] <= din; - dout_r <= mem[raddr]; + ram[waddr] <= wdata; + if (rden) + rdata_r <= ram[raddr]; end end - assign dout = dout_r; + assign rdata = rdata_r; end else begin + `UNUSED_VAR (rden) if (BYTEENW > 1) begin - `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[waddr][i] <= din[i * 8 +: 8]; + ram[waddr][i] <= wdata[i * 8 +: 8]; end end - assign dout = mem[raddr]; + assign rdata = ram[raddr]; end else begin - `USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + 
`RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[waddr] <= din; + ram[waddr] <= wdata; end - assign dout = mem[raddr]; + assign rdata = ram[raddr]; end end end else begin if (OUTPUT_REG) begin - reg [DATAW-1:0] dout_r; + reg [RD_PORTS-1:0][DATAW-1:0] rdata_r; if (BYTEENW > 1) begin - reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[waddr][i] <= din[i * 8 +: 8]; + ram[waddr][i] <= wdata[i * 8 +: 8]; + end + for (integer i = 0; i < RD_PORTS; ++i) begin + if (rden[i]) + rdata_r[i] <= ram[raddr[i]]; end - dout_r <= mem[raddr]; end end else begin - reg [DATAW-1:0] mem [SIZE-1:0]; + reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[waddr] <= din; - dout_r <= mem[raddr]; + ram[waddr] <= wdata; + for (integer i = 0; i < RD_PORTS; ++i) begin + if (rden[i]) + rdata_r[i] <= ram[raddr[i]]; + end end end - assign dout = dout_r; + assign rdata = rdata_r; end else begin - if (RWCHECK) begin + `UNUSED_VAR (rden) + if (NO_RWCHECK) begin if (BYTEENW > 1) begin - reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + `NO_RW_RAM_CHECK reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[waddr][i] <= din[i * 8 +: 8]; + ram[waddr][i] <= wdata[i * 8 +: 8]; end end - assign dout = mem[raddr]; - end else begin - reg [DATAW-1:0] mem [SIZE-1:0]; - - if (INITZERO) begin - initial mem = '{default: 0}; + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = ram[raddr[i]]; end + end else begin + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [SIZE-1:0]; + + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[waddr] <= din; 
+ ram[waddr] <= wdata; + end + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = ram[raddr[i]]; end - assign dout = mem[raddr]; end end else begin if (BYTEENW > 1) begin - `NO_RW_RAM_CHECK reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[waddr][i] <= din[i * 8 +: 8]; + ram[waddr][i] <= wdata[i * 8 +: 8]; end end - assign dout = mem[raddr]; + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = ram[raddr[i]]; + end end else begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] mem [SIZE-1:0]; + reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[waddr] <= din; + ram[waddr] <= wdata; + end + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = ram[raddr[i]]; end - assign dout = mem[raddr]; end end end end - +`else + if (OUTPUT_REG) begin + reg [RD_PORTS-1:0][DATAW-1:0] rdata_r; + if (BYTEENW > 1) begin + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + ram[waddr][i] <= wdata[i * 8 +: 8]; + end + for (integer i = 0; i < RD_PORTS; ++i) begin + if (rden[i]) + rdata_r[i] <= ram[raddr[i]]; + end + end + end else begin + reg [DATAW-1:0] ram [SIZE-1:0]; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + if (wren) + ram[waddr] <= wdata; + for (integer i = 0; i < RD_PORTS; ++i) begin + if (rden[i]) + rdata_r[i] <= ram[raddr[i]]; + end + end + end + assign rdata = rdata_r; + end else begin + `UNUSED_VAR (rden) + if (BYTEENW > 1) begin + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + for (integer i = 0; i < 
BYTEENW; i++) begin + if (wren[i]) + ram[waddr][i] <= wdata[i * 8 +: 8]; + end + prev_write <= (| wren); + prev_data <= ram[waddr]; + prev_waddr <= waddr; + end + + if (LUTRAM || !NO_RWCHECK) begin + `UNUSED_VAR (prev_write) + `UNUSED_VAR (prev_data) + `UNUSED_VAR (prev_waddr) + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = ram[raddr[i]]; + end + end else begin + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = (prev_write && (prev_waddr == raddr[i])) ? prev_data : ram[raddr[i]]; + end + end + end else begin + reg [DATAW-1:0] ram [SIZE-1:0]; + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + if (wren) + ram[waddr] <= wdata; + prev_write <= wren; + prev_data <= ram[waddr]; + prev_waddr <= waddr; + end + if (LUTRAM || !NO_RWCHECK) begin + `UNUSED_VAR (prev_write) + `UNUSED_VAR (prev_data) + `UNUSED_VAR (prev_waddr) + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = ram[raddr[i]]; + end + end else begin + for (genvar i = 0; i < RD_PORTS; ++i) begin + assign rdata[i] = (prev_write && (prev_waddr == raddr[i])) ? 
prev_data : ram[raddr[i]]; + end + end + end + end +`endif + endmodule `TRACING_ON \ No newline at end of file diff --git a/hw/rtl/libs/VX_elastic_buffer.v b/hw/rtl/libs/VX_elastic_buffer.v index 07d32159..66e8f7ef 100644 --- a/hw/rtl/libs/VX_elastic_buffer.v +++ b/hw/rtl/libs/VX_elastic_buffer.v @@ -5,7 +5,7 @@ module VX_elastic_buffer #( parameter DATAW = 1, parameter SIZE = 2, parameter OUTPUT_REG = 0, - parameter FASTRAM = 0 + parameter LUTRAM = 0 ) ( input wire clk, input wire reset, @@ -56,7 +56,7 @@ module VX_elastic_buffer #( .DATAW (DATAW), .SIZE (SIZE), .OUTPUT_REG (OUTPUT_REG), - .FASTRAM (FASTRAM) + .LUTRAM (LUTRAM) ) queue ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_fifo_queue.v b/hw/rtl/libs/VX_fifo_queue.v index aa4cc539..5a6e63ae 100644 --- a/hw/rtl/libs/VX_fifo_queue.v +++ b/hw/rtl/libs/VX_fifo_queue.v @@ -9,7 +9,7 @@ module VX_fifo_queue #( parameter ADDRW = $clog2(SIZE), parameter SIZEW = $clog2(SIZE+1), parameter OUTPUT_REG = 0, - parameter FASTRAM = 1 + parameter LUTRAM = 1 ) ( input wire clk, input wire reset, @@ -157,15 +157,15 @@ module VX_fifo_queue #( .DATAW (DATAW), .SIZE (SIZE), .OUTPUT_REG (0), - .RWCHECK (1), - .FASTRAM (FASTRAM) + .LUTRAM (LUTRAM) ) dp_ram ( .clk(clk), - .waddr(wr_ptr_r), - .raddr(rd_ptr_r), - .wren(push), - .din(data_in), - .dout(data_out) + .wren (push), + .waddr (wr_ptr_r), + .wdata (data_in), + .rden (1'b1), + .raddr (rd_ptr_r), + .rdata (data_out) ); end else begin @@ -200,15 +200,15 @@ module VX_fifo_queue #( .DATAW (DATAW), .SIZE (SIZE), .OUTPUT_REG (0), - .RWCHECK (1), - .FASTRAM (FASTRAM) + .LUTRAM (LUTRAM) ) dp_ram ( - .clk(clk), - .waddr(wr_ptr_r), - .raddr(rd_ptr_n_r), - .wren(push), - .din(data_in), - .dout(dout) + .clk (clk), + .wren (push), + .waddr (wr_ptr_r), + .wdata (data_in), + .rden (1'b1), + .raddr (rd_ptr_n_r), + .rdata (dout) ); always @(posedge clk) begin diff --git a/hw/rtl/libs/VX_index_buffer.v b/hw/rtl/libs/VX_index_buffer.v index 689485a5..5bf6e514 100644 --- 
a/hw/rtl/libs/VX_index_buffer.v +++ b/hw/rtl/libs/VX_index_buffer.v @@ -2,10 +2,10 @@ `TRACING_OFF module VX_index_buffer #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter FASTRAM = 1, - parameter ADDRW = `LOG2UP(SIZE) + parameter DATAW = 1, + parameter SIZE = 1, + parameter LUTRAM = 1, + parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, input wire reset, @@ -68,17 +68,17 @@ module VX_index_buffer #( end VX_dp_ram #( - .DATAW(DATAW), - .SIZE(SIZE), - .RWCHECK(1), - .FASTRAM(FASTRAM) + .DATAW (DATAW), + .SIZE (SIZE), + .LUTRAM (LUTRAM) ) data_table ( - .clk(clk), - .waddr(write_addr), - .raddr(read_addr), - .wren(acquire_slot), - .din(write_data), - .dout(read_data) + .clk (clk), + .wren (acquire_slot), + .waddr (write_addr), + .wdata (write_data), + .rden (1'b1), + .raddr (read_addr), + .rdata (read_data) ); assign write_addr = write_addr_r; diff --git a/hw/rtl/libs/VX_sp_ram.v b/hw/rtl/libs/VX_sp_ram.v index 28736f13..65ec0837 100644 --- a/hw/rtl/libs/VX_sp_ram.v +++ b/hw/rtl/libs/VX_sp_ram.v @@ -2,177 +2,259 @@ `TRACING_OFF module VX_sp_ram #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter BYTEENW = 1, - parameter OUTPUT_REG = 0, - parameter RWCHECK = 1, - parameter ADDRW = $clog2(SIZE), - parameter FASTRAM = 0, - parameter INITZERO = 0 -) ( - input wire clk, - input wire [ADDRW-1:0] addr, + parameter DATAW = 1, + parameter SIZE = 1, + parameter BYTEENW = 1, + parameter OUTPUT_REG = 0, + parameter NO_RWCHECK = 0, + parameter ADDRW = $clog2(SIZE), + parameter LUTRAM = 0, + parameter INIT_ENABLE = 0, + parameter INIT_FILE = "", + parameter [DATAW-1:0] INIT_VALUE = 0 +) ( + input wire clk, + input wire [ADDRW-1:0] addr, input wire [BYTEENW-1:0] wren, - input wire [DATAW-1:0] din, - output wire [DATAW-1:0] dout + input wire [DATAW-1:0] wdata, + input wire rden, + output wire [DATAW-1:0] rdata ); `STATIC_ASSERT((1 == BYTEENW) || ((BYTEENW > 1) && 0 == (BYTEENW % 4)), ("invalid parameter")) - if (FASTRAM) begin +`define RAM_INITIALIZATION \ + if 
(INIT_ENABLE) begin \ + if (INIT_FILE != "") begin \ + initial $readmemh(INIT_FILE, ram); \ + end else begin \ + initial ram = '{default: INIT_VALUE}; \ + end \ + end + +`ifdef SYNTHESIS + if (LUTRAM) begin if (OUTPUT_REG) begin - reg [DATAW-1:0] dout_r; + reg [DATAW-1:0] rdata_r; if (BYTEENW > 1) begin - `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[addr][i] <= din[i * 8 +: 8]; + ram[addr][i] <= wdata[i * 8 +: 8]; end - dout_r <= mem[addr]; + if (rden) + rdata_r <= ram[addr]; end end else begin - `USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[addr] <= din; - dout_r <= mem[addr]; + ram[addr] <= wdata; + if (rden) + rdata_r <= ram[addr]; end end - assign dout = dout_r; + assign rdata = rdata_r; end else begin + `UNUSED_VAR (rden) if (BYTEENW > 1) begin - `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[addr][i] <= din[i * 8 +: 8]; + ram[addr][i] <= wdata[i * 8 +: 8]; end end - assign dout = mem[addr]; + assign rdata = ram[addr]; end else begin - `USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0]; + `USE_FAST_BRAM reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[addr] <= din; + ram[addr] <= wdata; end - assign dout = mem[addr]; + assign rdata = ram[addr]; end end end else begin if (OUTPUT_REG) begin - reg [DATAW-1:0] dout_r; + reg 
[DATAW-1:0] rdata_r; if (BYTEENW > 1) begin - reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[addr][i] <= din[i * 8 +: 8]; + ram[addr][i] <= wdata[i * 8 +: 8]; end - dout_r <= mem[addr]; + if (rden) + rdata_r <= ram[addr]; end end else begin - reg [DATAW-1:0] mem [SIZE-1:0]; + reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[addr] <= din; - dout_r <= mem[addr]; + ram[addr] <= wdata; + if (rden) + rdata_r <= ram[addr]; end end - assign dout = dout_r; + assign rdata = rdata_r; end else begin - if (RWCHECK) begin + `UNUSED_VAR (rden) + if (NO_RWCHECK) begin if (BYTEENW > 1) begin - reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + `NO_RW_RAM_CHECK reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[addr][i] <= din[i * 8 +: 8]; + ram[addr][i] <= wdata[i * 8 +: 8]; end end - assign dout = mem[addr]; + assign rdata = ram[addr]; end else begin - reg [DATAW-1:0] mem [SIZE-1:0]; + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[addr] <= din; + ram[addr] <= wdata; end - assign dout = mem[addr]; + assign rdata = ram[addr]; end end else begin if (BYTEENW > 1) begin - `NO_RW_RAM_CHECK reg [BYTEENW-1:0][7:0] mem [SIZE-1:0]; + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin for (integer i = 0; i < BYTEENW; i++) begin if (wren[i]) - mem[addr][i] <= din[i * 8 +: 8]; + ram[addr][i] <= wdata[i * 8 +: 8]; end 
end - assign dout = mem[addr]; + assign rdata = ram[addr]; end else begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] mem [SIZE-1:0]; + reg [DATAW-1:0] ram [SIZE-1:0]; - if (INITZERO) begin - initial mem = '{default: 0}; - end + `RAM_INITIALIZATION always @(posedge clk) begin if (wren) - mem[addr] <= din; + ram[addr] <= wdata; end - assign dout = mem[addr]; + assign rdata = ram[addr]; end end end end - +`else + if (OUTPUT_REG) begin + reg [DATAW-1:0] rdata_r; + if (BYTEENW > 1) begin + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + ram[addr][i] <= wdata[i * 8 +: 8]; + end + if (rden) + rdata_r <= ram[addr]; + end + end else begin + reg [DATAW-1:0] ram [SIZE-1:0]; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + if (wren) + ram[addr] <= wdata; + if (rden) + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin + `UNUSED_VAR (rden) + if (BYTEENW > 1) begin + reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_addr; + reg prev_write; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + ram[addr][i] <= wdata[i * 8 +: 8]; + end + prev_write <= (| wren); + prev_data <= ram[addr]; + prev_addr <= addr; + end + + if (LUTRAM || !NO_RWCHECK) begin + `UNUSED_VAR (prev_write) + `UNUSED_VAR (prev_data) + `UNUSED_VAR (prev_addr) + assign rdata = ram[addr]; + end else begin + assign rdata = (prev_write && (prev_addr == addr)) ? 
prev_data : ram[addr]; + end + end else begin + reg [DATAW-1:0] ram [SIZE-1:0]; + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_addr; + reg prev_write; + + `RAM_INITIALIZATION + + always @(posedge clk) begin + if (wren) + ram[addr] <= wdata; + prev_write <= wren; + prev_data <= ram[addr]; + prev_addr <= addr; + end + if (LUTRAM || !NO_RWCHECK) begin + `UNUSED_VAR (prev_write) + `UNUSED_VAR (prev_data) + `UNUSED_VAR (prev_addr) + assign rdata = ram[addr]; + end else begin + assign rdata = (prev_write && (prev_addr == addr)) ? prev_data : ram[addr]; + end + end + end +`endif + endmodule `TRACING_ON \ No newline at end of file From d3d82de29ea92c4f59a48e416394fe8fab4ce966 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 26 Aug 2021 09:49:57 -0700 Subject: [PATCH 05/16] minor update --- hw/rtl/cache/VX_core_req_bank_sel.v | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/hw/rtl/cache/VX_core_req_bank_sel.v b/hw/rtl/cache/VX_core_req_bank_sel.v index 06824c33..2ff9616e 100644 --- a/hw/rtl/cache/VX_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_core_req_bank_sel.v @@ -101,7 +101,7 @@ module VX_core_req_bank_sel #( end end - for (genvar i = NUM_REQS-1; i >= 0; --i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]]); end @@ -186,22 +186,16 @@ module VX_core_req_bank_sel #( if (SHARED_BANK_READY == 0) begin always @(*) begin - core_req_ready_r = 'x; - for (integer i = NUM_REQS-1; i >= 0; --i) begin - if (core_req_valid[i]) begin - core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]] - && core_req_line_match[i]; - end + for (integer i = 0; i < NUM_REQS; ++i) begin + core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]] + && core_req_line_match[i]; end end end else begin always @(*) begin - core_req_ready_r = 'x; - for (integer i = NUM_REQS-1; i >= 0; --i) begin - if (core_req_valid[i]) begin - core_req_ready_r[i] = 
per_bank_core_req_ready - && core_req_line_match[i]; - end + for (integer i = 0; i < NUM_REQS; ++i) begin + core_req_ready_r[i] = per_bank_core_req_ready + && core_req_line_match[i]; end end end From 74a45e27722a9511dacaec8bf66ef8ccd47a6e85 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 26 Aug 2021 09:52:13 -0700 Subject: [PATCH 06/16] stream arbiter optimization (using indexing instead of onehot mux) --- hw/rtl/libs/VX_stream_arbiter.v | 45 +++++++++++---------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/hw/rtl/libs/VX_stream_arbiter.v b/hw/rtl/libs/VX_stream_arbiter.v index 7b527f55..b5edca24 100644 --- a/hw/rtl/libs/VX_stream_arbiter.v +++ b/hw/rtl/libs/VX_stream_arbiter.v @@ -19,11 +19,12 @@ module VX_stream_arbiter #( output wire [LANES-1:0][DATAW-1:0] data_out, input wire [LANES-1:0] ready_out ); + localparam LOG_NUM_REQS = `CLOG2(NUM_REQS); if (NUM_REQS > 1) begin - wire sel_valid; - wire sel_ready; - wire [NUM_REQS-1:0] sel_1hot; + wire sel_valid; + wire sel_ready; + wire [LOG_NUM_REQS-1:0] sel_index; wire [NUM_REQS-1:0] valid_in_any; wire [LANES-1:0] ready_in_sel; @@ -50,8 +51,8 @@ module VX_stream_arbiter #( .requests (valid_in_any), .enable (sel_ready), .grant_valid (sel_valid), - .grant_onehot (sel_1hot), - `UNUSED_PIN (grant_index) + .grant_index (sel_index), + `UNUSED_PIN (grant_onehot) ); end else if (TYPE == "R") begin VX_rr_arbiter #( @@ -63,8 +64,8 @@ module VX_stream_arbiter #( .requests (valid_in_any), .enable (sel_ready), .grant_valid (sel_valid), - .grant_onehot (sel_1hot), - `UNUSED_PIN (grant_index) + .grant_index (sel_index), + `UNUSED_PIN (grant_onehot) ); end else if (TYPE == "F") begin VX_fair_arbiter #( @@ -76,8 +77,8 @@ module VX_stream_arbiter #( .requests (valid_in_any), .enable (sel_ready), .grant_valid (sel_valid), - .grant_onehot (sel_1hot), - `UNUSED_PIN (grant_index) + .grant_index (sel_index), + `UNUSED_PIN (grant_onehot) ); end else if (TYPE == "M") begin VX_matrix_arbiter #( @@ -89,8 
+90,8 @@ module VX_stream_arbiter #( .requests (valid_in_any), .enable (sel_ready), .grant_valid (sel_valid), - .grant_onehot (sel_1hot), - `UNUSED_PIN (grant_index) + .grant_index (sel_index), + `UNUSED_PIN (grant_onehot) ); end else begin $error ("invalid parameter"); @@ -105,32 +106,16 @@ module VX_stream_arbiter #( for (genvar i = 0; i < NUM_REQS; i++) begin assign valid_data_in[i] = {valid_in[i], data_in[i]}; end - - VX_onehot_mux #( - .DATAW (LANES * (1 + DATAW)), - .N (NUM_REQS) - ) data_in_mux ( - .data_in (valid_data_in), - .sel_in (sel_1hot), - .data_out ({valid_in_sel, data_in_sel}) - ); + assign {valid_in_sel, data_in_sel} = valid_data_in[sel_index]; `UNUSED_VAR (sel_valid) end else begin - VX_onehot_mux #( - .DATAW (DATAW), - .N (NUM_REQS) - ) data_in_mux ( - .data_in (data_in), - .sel_in (sel_1hot), - .data_out (data_in_sel) - ); - + assign data_in_sel = data_in[sel_index]; assign valid_in_sel = sel_valid; end for (genvar i = 0; i < NUM_REQS; i++) begin - assign ready_in[i] = ready_in_sel & {LANES{sel_1hot[i]}}; + assign ready_in[i] = ready_in_sel & {LANES{(sel_index == LOG_NUM_REQS'(i))}}; end for (genvar i = 0; i < LANES; ++i) begin From 26e94dde44119c678bf5ca1becae682993d89f3a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 26 Aug 2021 12:27:38 -0700 Subject: [PATCH 07/16] cache area optimization by disabling BRAM read-during-write bypassing for tag/data stores --- hw/rtl/cache/VX_bank.v | 3 +++ hw/rtl/cache/VX_data_access.v | 7 ++++--- hw/rtl/cache/VX_shared_mem.v | 7 ++++--- hw/rtl/cache/VX_tag_access.v | 9 +++++---- hw/syn/opae/fpga_prog.sh | 7 +++++++ 5 files changed, 23 insertions(+), 10 deletions(-) create mode 100755 hw/syn/opae/fpga_prog.sh diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index e1b3a270..6dda9a93 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -161,6 +161,8 @@ module VX_bank #( wire mreq_alm_full; wire creq_fire = creq_valid && creq_ready; + + wire fill_in_st0 = valid_st0 && 
is_fill_st0; // determine which queue to pop next in priority order wire mshr_grant = 1; @@ -172,6 +174,7 @@ module VX_bank #( wire creq_grant = !mshr_enable && !mrsq_enable && !flush_enable; wire mshr_ready = mshr_grant + && !fill_in_st0 // prevent tag read-during-write with fill && !crsq_stall; // ensure core response ready assign mem_rsp_ready = mrsq_grant diff --git a/hw/rtl/cache/VX_data_access.v b/hw/rtl/cache/VX_data_access.v index f3700e5f..64541f7b 100644 --- a/hw/rtl/cache/VX_data_access.v +++ b/hw/rtl/cache/VX_data_access.v @@ -63,9 +63,10 @@ module VX_data_access #( end VX_sp_ram #( - .DATAW (CACHE_LINE_SIZE * 8), - .SIZE (`LINES_PER_BANK), - .BYTEENW (BYTEENW) + .DATAW (CACHE_LINE_SIZE * 8), + .SIZE (`LINES_PER_BANK), + .BYTEENW (BYTEENW), + .NO_RWCHECK (1) ) data_store ( .clk (clk), .addr (line_addr), diff --git a/hw/rtl/cache/VX_shared_mem.v b/hw/rtl/cache/VX_shared_mem.v index 71a49b3c..00edeab0 100644 --- a/hw/rtl/cache/VX_shared_mem.v +++ b/hw/rtl/cache/VX_shared_mem.v @@ -171,9 +171,10 @@ module VX_shared_mem #( && creq_out_fire; VX_sp_ram #( - .DATAW (`WORD_WIDTH), - .SIZE (`LINES_PER_BANK), - .BYTEENW (WORD_SIZE) + .DATAW (`WORD_WIDTH), + .SIZE (`LINES_PER_BANK), + .BYTEENW (WORD_SIZE), + .NO_RWCHECK (1) ) data_store ( .clk (clk), .addr (per_bank_core_req_addr[i]), diff --git a/hw/rtl/cache/VX_tag_access.v b/hw/rtl/cache/VX_tag_access.v index 0c51de01..708220ae 100644 --- a/hw/rtl/cache/VX_tag_access.v +++ b/hw/rtl/cache/VX_tag_access.v @@ -46,10 +46,11 @@ module VX_tag_access #( wire [`LINE_SELECT_BITS-1:0] line_addr = addr [`LINE_SELECT_BITS-1:0]; VX_sp_ram #( - .DATAW(`TAG_SELECT_BITS + 1), - .SIZE(`LINES_PER_BANK), - .INIT_ENABLE(1), - .INIT_VALUE(0) + .DATAW (`TAG_SELECT_BITS + 1), + .SIZE (`LINES_PER_BANK), + .INIT_ENABLE (1), + .INIT_VALUE (0), + .NO_RWCHECK (1) ) tag_store ( .clk( clk), .addr (line_addr), diff --git a/hw/syn/opae/fpga_prog.sh b/hw/syn/opae/fpga_prog.sh new file mode 100755 index 00000000..4fc9db22 --- /dev/null +++ 
b/hw/syn/opae/fpga_prog.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# FPGA programming +# first argument is the bitstream + +echo "fpgaconf --bus 0xaf $1" +fpgaconf --bus 0xaf $1 \ No newline at end of file From 28eb3cfdb2273804603ca4ae84e2279724a500b9 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 26 Aug 2021 14:49:57 -0700 Subject: [PATCH 08/16] minor update --- hw/rtl/libs/VX_priority_encoder.v | 88 +++++++++++++------------------ 1 file changed, 37 insertions(+), 51 deletions(-) diff --git a/hw/rtl/libs/VX_priority_encoder.v b/hw/rtl/libs/VX_priority_encoder.v index e68fc474..15968a7c 100644 --- a/hw/rtl/libs/VX_priority_encoder.v +++ b/hw/rtl/libs/VX_priority_encoder.v @@ -12,46 +12,48 @@ module VX_priority_encoder #( output wire [LN-1:0] index, output wire valid_out ); + wire [N-1:0] reversed; + + if (REVERSE) begin + for (genvar i = 0; i < N; ++i) begin + assign reversed[N-i-1] = data_in[i]; + end + end else begin + assign reversed = data_in; + end if (N == 1) begin - assign onehot = data_in; + assign onehot = reversed; assign index = 0; - assign valid_out = data_in; + assign valid_out = reversed; end else if (N == 2) begin - assign onehot = {~data_in[REVERSE], data_in[REVERSE]}; - assign index = ~data_in[REVERSE]; - assign valid_out = (| data_in); + assign onehot = {~reversed[0], reversed[0]}; + assign index = ~reversed[0]; + assign valid_out = (| reversed); end else if (MODEL == 1) begin wire [N-1:0] scan_lo; VX_scan #( - .N (N), - .OP (2), - .REVERSE (REVERSE) + .N (N), + .OP (2) ) scan ( - .data_in (data_in), + .data_in (reversed), .data_out (scan_lo) ); - if (REVERSE) begin - assign onehot = scan_lo & {1'b1, (~scan_lo[N-1:1])}; - assign valid_out = scan_lo[0]; - end else begin - assign onehot = scan_lo & {(~scan_lo[N-2:0]), 1'b1}; - assign valid_out = scan_lo[N-1]; - end + assign onehot = scan_lo & {(~scan_lo[N-2:0]), 1'b1}; + assign valid_out = scan_lo[N-1]; VX_onehot_encoder #( - .N (N), - .REVERSE (REVERSE) + .N (N) ) onehot_encoder ( .data_in 
(onehot), - .data_out (index), + .data_out (index), `UNUSED_PIN (valid_out) ); @@ -60,70 +62,54 @@ module VX_priority_encoder #( `IGNORE_WARNINGS_BEGIN wire [N-1:0] higher_pri_regs; `IGNORE_WARNINGS_END - assign higher_pri_regs[N-1:1] = higher_pri_regs[N-2:0] | data_in[N-2:0]; + assign higher_pri_regs[N-1:1] = higher_pri_regs[N-2:0] | reversed[N-2:0]; assign higher_pri_regs[0] = 1'b0; - assign onehot[N-1:0] = data_in[N-1:0] & ~higher_pri_regs[N-1:0]; + assign onehot[N-1:0] = reversed[N-1:0] & ~higher_pri_regs[N-1:0]; VX_onehot_encoder #( - .N (N), - .REVERSE (REVERSE) + .N (N) ) onehot_encoder ( .data_in (onehot), .data_out (index), `UNUSED_PIN (valid_out) ); - assign valid_out = (| data_in); + assign valid_out = (| reversed); end else if (MODEL == 3) begin - assign onehot = data_in & ~(data_in-1); + assign onehot = reversed & ~(reversed-1); VX_onehot_encoder #( - .N (N), - .REVERSE (REVERSE) + .N (N) ) onehot_encoder ( .data_in (onehot), .data_out (index), `UNUSED_PIN (valid_out) ); - assign valid_out = (| data_in); + assign valid_out = (| reversed); end else begin reg [LN-1:0] index_r; reg [N-1:0] onehot_r; - if (REVERSE) begin - always @(*) begin - index_r = 'x; - onehot_r = 'x; - for (integer i = 0; i < N; ++i) begin - if (data_in[i]) begin - index_r = LN'(i); - onehot_r = 0; - onehot_r[i] = 1'b1; - end - end - end - end else begin - always @(*) begin - index_r = 'x; - onehot_r = 'x; - for (integer i = N-1; i >= 0; --i) begin - if (data_in[i]) begin - index_r = LN'(i); - onehot_r = 0; - onehot_r[i] = 1'b1; - end + always @(*) begin + index_r = 'x; + onehot_r = 'x; + for (integer i = N-1; i >= 0; --i) begin + if (reversed[i]) begin + index_r = LN'(i); + onehot_r = 0; + onehot_r[i] = 1'b1; end end end assign index = index_r; assign onehot = onehot_r; - assign valid_out = (| data_in); + assign valid_out = (| reversed); end From 12b8b4af240b7de5cbc42b5b6eccc6f216b69712 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 28 Aug 2021 15:21:40 -0700 Subject: [PATCH 
09/16] minor updates --- ci/regression.sh | 5 +++-- hw/rtl/VX_cluster.v | 1 + hw/rtl/VX_decode.v | 3 ++- hw/rtl/VX_define.vh | 4 ++++ hw/rtl/VX_print_instr.vh | 30 +++++++++++++++++++----------- tests/regression/dogfood/Makefile | 2 +- tests/regression/dogfood/main.cpp | 1 + tests/riscv/isa/Makefile | 3 ++- 8 files changed, 33 insertions(+), 16 deletions(-) diff --git a/ci/regression.sh b/ci/regression.sh index 72e076ec..11f5d21a 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -77,8 +77,9 @@ CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo # test cache multi-porting -CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo -CONFIGS="-DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo +CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr +CONFIGS="-DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr +CONFIGS="-DL2NUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr # test 128-bit MEM block CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 2414828e..64f74ae3 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -87,6 +87,7 @@ module VX_cluster #( .CACHE_SIZE (`L2CACHE_SIZE), .CACHE_LINE_SIZE (`L2CACHE_LINE_SIZE), .NUM_BANKS (`L2NUM_BANKS), + .NUM_PORTS (`L2NUM_PORTS), .WORD_SIZE (`L2WORD_SIZE), .NUM_REQS (`L2NUM_REQS), .CREQ_SIZE (`L2CREQ_SIZE), diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 0c90e2b5..eac4e60e 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -192,7 +192,8 @@ module VX_decode #( end `INST_F: begin ex_type = `EX_LSU; - op_mod = `MOD_BITS'(!func3[0]); // data fence + op_type = `OP_BITS'(func3[0]); + op_mod = `MOD_BITS'(1); end `INST_SYS : begin if (func3[1:0] != 0) begin diff --git a/hw/rtl/VX_define.vh 
b/hw/rtl/VX_define.vh index c0e9e530..94287911 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -154,6 +154,10 @@ `define LSU_OP(x) x[`LSU_BITS-1:0] `define LSU_IS_FENCE(x) x[0] +`define FENCE_BITS 1 +`define FENCE_D 1'h0 +`define FENCE_I 1'h1 + `define CSR_RW 2'h1 `define CSR_RS 2'h2 `define CSR_RC 2'h3 diff --git a/hw/rtl/VX_print_instr.vh b/hw/rtl/VX_print_instr.vh index 66e19c08..2931bc9f 100644 --- a/hw/rtl/VX_print_instr.vh +++ b/hw/rtl/VX_print_instr.vh @@ -71,17 +71,25 @@ task print_ex_op ( end end `EX_LSU: begin - case (`LSU_BITS'(op_type)) - `LSU_LB: dpi_trace("LB"); - `LSU_LH: dpi_trace("LH"); - `LSU_LW: dpi_trace("LW"); - `LSU_LBU:dpi_trace("LBU"); - `LSU_LHU:dpi_trace("LHU"); - `LSU_SB: dpi_trace("SB"); - `LSU_SH: dpi_trace("SH"); - `LSU_SW: dpi_trace("SW"); - default: dpi_trace("?"); - endcase + if (op_mod == 0) begin + case (`LSU_BITS'(op_type)) + `LSU_LB: dpi_trace("LB"); + `LSU_LH: dpi_trace("LH"); + `LSU_LW: dpi_trace("LW"); + `LSU_LBU:dpi_trace("LBU"); + `LSU_LHU:dpi_trace("LHU"); + `LSU_SB: dpi_trace("SB"); + `LSU_SH: dpi_trace("SH"); + `LSU_SW: dpi_trace("SW"); + default: dpi_trace("?"); + endcase + end else if (op_mod == 1) begin + case (`FENCE_BITS'(op_type)) + `FENCE_D: dpi_trace("DFENCE"); + `FENCE_I: dpi_trace("IFENCE"); + default: dpi_trace("?"); + endcase + end end `EX_CSR: begin case (`CSR_BITS'(op_type)) diff --git a/tests/regression/dogfood/Makefile b/tests/regression/dogfood/Makefile index 2f89afc6..dce752d4 100644 --- a/tests/regression/dogfood/Makefile +++ b/tests/regression/dogfood/Makefile @@ -21,7 +21,7 @@ VX_SRCS = kernel.c #CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors -CXXFLAGS += -I$(VORTEX_DRV_PATH)/include +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex diff --git a/tests/regression/dogfood/main.cpp b/tests/regression/dogfood/main.cpp index 
aab15522..804609ae 100644 --- a/tests/regression/dogfood/main.cpp +++ b/tests/regression/dogfood/main.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "testcases.h" #include "common.h" diff --git a/tests/riscv/isa/Makefile b/tests/riscv/isa/Makefile index f7488899..4684911d 100644 --- a/tests/riscv/isa/Makefile +++ b/tests/riscv/isa/Makefile @@ -1,8 +1,9 @@ ALL_TESTS := $(wildcard *.hex) +D_TESTS := $(wildcard *ud-p-*.hex) V_TESTS := $(wildcard *-v-*.hex) -EXCLUDED_TESTS := $(V_TESTS) rv32si-p-scall.hex rv32si-p-sbreak.hex rv32mi-p-breakpoint.hex rv32ud-p-fclass.hex rv32ua-p-amomax_w.hex rv32ua-p-amoxor_w.hex rv32ud-p-ldst.hex rv32ua-p-amoor_w.hex rv32mi-p-ma_addr.hex rv32ud-p-fdiv.hex rv32ud-p-fcmp.hex rv32mi-p-mcsr.hex rv32ua-p-amoswap_w.hex rv32mi-p-ma_fetch.hex rv32mi-p-csr.hex rv32ua-p-amoadd_w.hex rv32si-p-dirty.hex rv32ud-p-fcvt.hex rv32ui-p-fence_i.hex rv32si-p-csr.hex rv32mi-p-shamt.hex rv32ua-p-amomin_w.hex rv32ua-p-lrsc.hex rv32ud-p-fmadd.hex rv32ud-p-fadd.hex rv32si-p-wfi.hex rv32ua-p-amomaxu_w.hex rv32si-p-ma_fetch.hex rv32ud-p-fmin.hex rv32mi-p-illegal.hex rv32uc-p-rvc.hex rv32mi-p-sbreak.hex rv32ua-p-amominu_w.hex rv32ua-p-amoand_w.hex +EXCLUDED_TESTS := $(V_TESTS) $(D_TESTS) rv32si-p-scall.hex rv32si-p-sbreak.hex rv32mi-p-breakpoint.hex rv32ua-p-amomax_w.hex rv32ua-p-amoxor_w.hex rv32ua-p-amoor_w.hex rv32mi-p-ma_addr.hex rv32mi-p-mcsr.hex rv32ua-p-amoswap_w.hex rv32mi-p-ma_fetch.hex rv32mi-p-csr.hex rv32ua-p-amoadd_w.hex rv32si-p-dirty.hex rv32ui-p-fence_i.hex rv32si-p-csr.hex rv32mi-p-shamt.hex rv32ua-p-amomin_w.hex rv32ua-p-lrsc.hex rv32si-p-wfi.hex rv32ua-p-amomaxu_w.hex rv32si-p-ma_fetch.hex rv32mi-p-illegal.hex rv32uc-p-rvc.hex rv32mi-p-sbreak.hex rv32ua-p-amominu_w.hex rv32ua-p-amoand_w.hex TESTS := $(filter-out $(EXCLUDED_TESTS), $(ALL_TESTS)) From f3ba27b138a154fb68525bfa3339007f9bd46f0b Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 28 Aug 2021 15:34:36 -0700 Subject: [PATCH 10/16] GPRs optimization - disabling 
BRAM's read-during-write bypass block. --- hw/rtl/VX_gpr_stage.v | 135 ++++++++++++++++++++++++---------------- hw/rtl/libs/VX_dp_ram.v | 79 ++++++++--------------- 2 files changed, 108 insertions(+), 106 deletions(-) diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 56f484bc..17bc317c 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -17,71 +17,100 @@ module VX_gpr_stage #( `UNUSED_PARAM (CORE_ID) `UNUSED_VAR (reset) + localparam RAM_SIZE = `NUM_WARPS * `NUM_REGS; + // ensure r0 never gets written, which can happen before the reset wire write_enable = writeback_if.valid && (writeback_if.rd != 0); -`ifdef EXT_F_ENABLE - localparam RAM_SIZE = `NUM_WARPS * `NUM_REGS; - wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2, rdata3; - wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2, raddr3; - - assign waddr = {writeback_if.wid, writeback_if.rd}; - assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1}; - assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2}; - assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3}; + wire [(`NUM_THREADS * 4)-1:0] wren; + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + assign wren [i * 4 +: 4] = {4{write_enable && writeback_if.tmask[i]}}; + end - for (genvar i = 0; i < `NUM_THREADS; i++) begin - VX_dp_ram #( - .RD_PORTS (3), - .DATAW (32), - .SIZE (RAM_SIZE), - .INIT_ENABLE (1), - .INIT_VALUE (0) - ) dp_ram ( - .clk (clk), - .wren (write_enable && writeback_if.tmask[i]), - .waddr (waddr), - .wdata (writeback_if.data[i]), - .rden (3'b111), - .raddr ({raddr3, raddr2, raddr1}), - .rdata ({rdata3[i], rdata2[i], rdata1[i]}) - ); + reg [`NUM_THREADS-1:0][31:0] last_wdata; + reg [$clog2(RAM_SIZE)-1:0] last_waddr; + reg [`NUM_THREADS-1:0] last_wmask; + + always @(posedge clk) begin + last_wdata <= writeback_if.data; + last_wmask <= {`NUM_THREADS{write_enable}} & writeback_if.tmask; + last_waddr <= waddr; end - assign gpr_rsp_if.rs1_data = rdata1; - assign gpr_rsp_if.rs2_data = rdata2; - assign gpr_rsp_if.rs3_data = rdata3; -`else - 
localparam RAM_SIZE = `NUM_WARPS * `NUM_REGS; wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2; wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2; - + assign waddr = {writeback_if.wid, writeback_if.rd}; - assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1}; - assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2}; - `UNUSED_VAR (gpr_req_if.rs3) + assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1}; + assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2}; - for (genvar i = 0; i < `NUM_THREADS; i++) begin - VX_dp_ram #( - .RD_PORTS (2), - .DATAW (32), - .SIZE (RAM_SIZE), - .INIT_ENABLE (1), - .INIT_VALUE (0) - ) dp_ram ( - .clk (clk), - .wren (write_enable && writeback_if.tmask[i]), - .waddr (waddr), - .wdata (writeback_if.data[i]), - .rden (2'b11), - .raddr ({raddr2, raddr1}), - .rdata ({rdata2[i], rdata1[i]}) - ); + VX_dp_ram #( + .DATAW (32 * `NUM_THREADS), + .SIZE (RAM_SIZE), + .BYTEENW (`NUM_THREADS * 4), + .INIT_ENABLE (1), + .INIT_VALUE (0), + .NO_RWCHECK (1) + ) dp_ram1 ( + .clk (clk), + .wren (wren), + .waddr (waddr), + .wdata (writeback_if.data), + .rden (1'b1), + .raddr (raddr1), + .rdata (rdata1) + ); + + VX_dp_ram #( + .DATAW (32 * `NUM_THREADS), + .SIZE (RAM_SIZE), + .BYTEENW (`NUM_THREADS * 4), + .INIT_ENABLE (1), + .INIT_VALUE (0), + .NO_RWCHECK (1) + ) dp_ram2 ( + .clk (clk), + .wren (wren), + .waddr (waddr), + .wdata (writeback_if.data), + .rden (1'b1), + .raddr (raddr2), + .rdata (rdata2) + ); + + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + assign gpr_rsp_if.rs1_data[i] = (last_wmask[i] && (raddr1 == last_waddr)) ? last_wdata[i] : rdata1[i]; + assign gpr_rsp_if.rs2_data[i] = (last_wmask[i] && (raddr2 == last_waddr)) ? 
last_wdata[i] : rdata2[i]; end + +`ifdef EXT_F_ENABLE + wire [`NUM_THREADS-1:0][31:0] rdata3; + wire [$clog2(RAM_SIZE)-1:0] raddr3; + assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3}; - assign gpr_rsp_if.rs1_data = rdata1; - assign gpr_rsp_if.rs2_data = rdata2; - assign gpr_rsp_if.rs3_data = 0; + VX_dp_ram #( + .DATAW (32 * `NUM_THREADS), + .SIZE (RAM_SIZE), + .BYTEENW (`NUM_THREADS * 4), + .INIT_ENABLE (1), + .INIT_VALUE (0), + .NO_RWCHECK (1) + ) dp_ram3 ( + .clk (clk), + .wren (wren), + .waddr (waddr), + .wdata (writeback_if.data), + .rden (1'b1), + .raddr (raddr3), + .rdata (rdata3) + ); + + for (genvar i = 0; i < `NUM_THREADS; i++) begin + assign gpr_rsp_if.rs3_data[i] = (last_wmask[i] && (raddr3 == last_waddr)) ? last_wdata[i] : rdata3[i]; + end +`else + `UNUSED_VAR (gpr_req_if.rs3) + assign gpr_rsp_if.rs3_data = 'x; `endif assign writeback_if.ready = 1'b1; diff --git a/hw/rtl/libs/VX_dp_ram.v b/hw/rtl/libs/VX_dp_ram.v index e1d6defa..db8e99b8 100644 --- a/hw/rtl/libs/VX_dp_ram.v +++ b/hw/rtl/libs/VX_dp_ram.v @@ -2,7 +2,6 @@ `TRACING_OFF module VX_dp_ram #( - parameter RD_PORTS = 1, parameter DATAW = 1, parameter SIZE = 1, parameter BYTEENW = 1, @@ -14,18 +13,16 @@ module VX_dp_ram #( parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0 ) ( - input wire clk, - input wire [BYTEENW-1:0] wren, - input wire [ADDRW-1:0] waddr, - input wire [DATAW-1:0] wdata, - input wire [RD_PORTS-1:0] rden, - input wire [RD_PORTS-1:0][ADDRW-1:0] raddr, - output wire [RD_PORTS-1:0][DATAW-1:0] rdata + input wire clk, + input wire [BYTEENW-1:0] wren, + input wire [ADDRW-1:0] waddr, + input wire [DATAW-1:0] wdata, + input wire rden, + input wire [ADDRW-1:0] raddr, + output wire [DATAW-1:0] rdata ); `STATIC_ASSERT((1 == BYTEENW) || ((BYTEENW > 1) && 0 == (BYTEENW % 4)), ("invalid parameter")) - `STATIC_ASSERT(!LUTRAM || (RD_PORTS == 1), ("multi-porting not supported on LUTRAM")) - `define RAM_INITIALIZATION \ if (INIT_ENABLE) begin \ @@ -94,7 +91,7 @@ module VX_dp_ram #( end 
end else begin if (OUTPUT_REG) begin - reg [RD_PORTS-1:0][DATAW-1:0] rdata_r; + reg [DATAW-1:0] rdata_r; if (BYTEENW > 1) begin reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; @@ -106,10 +103,8 @@ module VX_dp_ram #( if (wren[i]) ram[waddr][i] <= wdata[i * 8 +: 8]; end - for (integer i = 0; i < RD_PORTS; ++i) begin - if (rden[i]) - rdata_r[i] <= ram[raddr[i]]; - end + if (rden) + rdata_r <= ram[raddr]; end end else begin reg [DATAW-1:0] ram [SIZE-1:0]; @@ -119,10 +114,8 @@ module VX_dp_ram #( always @(posedge clk) begin if (wren) ram[waddr] <= wdata; - for (integer i = 0; i < RD_PORTS; ++i) begin - if (rden[i]) - rdata_r[i] <= ram[raddr[i]]; - end + if (rden) + rdata_r <= ram[raddr]; end end assign rdata = rdata_r; @@ -140,9 +133,7 @@ module VX_dp_ram #( ram[waddr][i] <= wdata[i * 8 +: 8]; end end - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = ram[raddr[i]]; - end + assign rdata = ram[raddr]; end else begin `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [SIZE-1:0]; @@ -152,9 +143,7 @@ module VX_dp_ram #( if (wren) ram[waddr] <= wdata; end - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = ram[raddr[i]]; - end + assign rdata = ram[raddr]; end end else begin if (BYTEENW > 1) begin @@ -168,9 +157,7 @@ module VX_dp_ram #( ram[waddr][i] <= wdata[i * 8 +: 8]; end end - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = ram[raddr[i]]; - end + assign rdata = ram[raddr]; end else begin reg [DATAW-1:0] ram [SIZE-1:0]; @@ -180,16 +167,14 @@ module VX_dp_ram #( if (wren) ram[waddr] <= wdata; end - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = ram[raddr[i]]; - end + assign rdata = ram[raddr]; end end end end `else if (OUTPUT_REG) begin - reg [RD_PORTS-1:0][DATAW-1:0] rdata_r; + reg [DATAW-1:0] rdata_r; if (BYTEENW > 1) begin reg [BYTEENW-1:0][7:0] ram [SIZE-1:0]; @@ -200,10 +185,8 @@ module VX_dp_ram #( if (wren[i]) ram[waddr][i] <= wdata[i * 8 +: 8]; end - for (integer i = 0; i < RD_PORTS; ++i) begin - if (rden[i]) - rdata_r[i] <= 
ram[raddr[i]]; - end + if (rden) + rdata_r <= ram[raddr]; end end else begin reg [DATAW-1:0] ram [SIZE-1:0]; @@ -213,10 +196,8 @@ module VX_dp_ram #( always @(posedge clk) begin if (wren) ram[waddr] <= wdata; - for (integer i = 0; i < RD_PORTS; ++i) begin - if (rden[i]) - rdata_r[i] <= ram[raddr[i]]; - end + if (rden) + rdata_r <= ram[raddr]; end end assign rdata = rdata_r; @@ -244,13 +225,9 @@ module VX_dp_ram #( `UNUSED_VAR (prev_write) `UNUSED_VAR (prev_data) `UNUSED_VAR (prev_waddr) - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = ram[raddr[i]]; - end + assign rdata = ram[raddr]; end else begin - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = (prev_write && (prev_waddr == raddr[i])) ? prev_data : ram[raddr[i]]; - end + assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; end end else begin reg [DATAW-1:0] ram [SIZE-1:0]; @@ -271,17 +248,13 @@ module VX_dp_ram #( `UNUSED_VAR (prev_write) `UNUSED_VAR (prev_data) `UNUSED_VAR (prev_waddr) - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = ram[raddr[i]]; - end + assign rdata = ram[raddr]; end else begin - for (genvar i = 0; i < RD_PORTS; ++i) begin - assign rdata[i] = (prev_write && (prev_waddr == raddr[i])) ? prev_data : ram[raddr[i]]; - end + assign rdata = (prev_write && (prev_waddr == raddr)) ? 
prev_data : ram[raddr]; end end end -`endif +`endif endmodule `TRACING_ON \ No newline at end of file From 6674e8c44a74dd314c8ce580a68bc4acf0faa38b Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 28 Aug 2021 21:34:06 -0700 Subject: [PATCH 11/16] cache bank area optimization + multi-porting fix for l2/l3 caches --- hw/rtl/VX_config.vh | 12 +- hw/rtl/Vortex.v | 1 + hw/rtl/cache/VX_bank.v | 146 +++++++++-------- hw/rtl/cache/VX_cache.v | 90 +++++++---- hw/rtl/cache/VX_cache_define.vh | 6 +- hw/rtl/cache/VX_core_req_bank_sel.v | 11 +- hw/rtl/cache/VX_core_rsp_merge.v | 234 ++++++++++++++++++++-------- hw/rtl/cache/VX_nc_bypass.v | 122 +++++++-------- hw/rtl/cache/VX_tag_access.v | 2 - 9 files changed, 388 insertions(+), 236 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index edc3e37e..5124116e 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -291,7 +291,7 @@ `define DNUM_BANKS `NUM_THREADS `endif -// Number of bank ports +// Number of ports per bank `ifndef DNUM_PORTS `define DNUM_PORTS 1 `endif @@ -361,6 +361,11 @@ `define L2NUM_BANKS `MIN(`NUM_CORES, 4) `endif +// Number of ports per bank +`ifndef L2NUM_PORTS +`define L2NUM_PORTS 1 +`endif + // Core Request Queue Size `ifndef L2CREQ_SIZE `define L2CREQ_SIZE 0 @@ -398,6 +403,11 @@ `define L3NUM_BANKS `MIN(`NUM_CLUSTERS, 4) `endif +// Number of ports per bank +`ifndef L3NUM_PORTS +`define L3NUM_PORTS 1 +`endif + // Core Request Queue Size `ifndef L3CREQ_SIZE `define L3CREQ_SIZE 0 diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 4d871e0e..f1be995d 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -85,6 +85,7 @@ module Vortex ( .CACHE_SIZE (`L3CACHE_SIZE), .CACHE_LINE_SIZE (`L3CACHE_LINE_SIZE), .NUM_BANKS (`L3NUM_BANKS), + .NUM_PORTS (`L3NUM_PORTS), .WORD_SIZE (`L3WORD_SIZE), .NUM_REQS (`L3NUM_REQS), .CREQ_SIZE (`L3CREQ_SIZE), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 6dda9a93..933f189e 100644 --- a/hw/rtl/cache/VX_bank.v +++ 
b/hw/rtl/cache/VX_bank.v @@ -39,7 +39,8 @@ module VX_bank #( // bank offset from beginning of index range parameter BANK_ADDR_OFFSET = 0, - localparam MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE) + localparam MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE), + localparam WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS) ) ( `SCOPE_IO_VX_bank @@ -56,13 +57,13 @@ module VX_bank #( // Core Request input wire core_req_valid, input wire [NUM_PORTS-1:0] core_req_pmask, - input wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] core_req_wsel, + input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] core_req_wsel, input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] core_req_byteen, input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_req_data, input wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_req_tid, + input wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, input wire core_req_rw, input wire [`LINE_ADDR_WIDTH-1:0] core_req_addr, - input wire [CORE_TAG_WIDTH-1:0] core_req_tag, output wire core_req_ready, // Core Response @@ -70,16 +71,17 @@ module VX_bank #( output wire [NUM_PORTS-1:0] core_rsp_pmask, output wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_rsp_tid, output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_rsp_data, - output wire [CORE_TAG_WIDTH-1:0] core_rsp_tag, + output wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire core_rsp_ready, // Memory request output wire mem_req_valid, output wire mem_req_rw, - output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen, + output wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen, + output wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel, output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr, output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, - output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data, + output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data, input wire mem_req_ready, // Memory response @@ -104,18 +106,18 @@ module VX_bank #( `endif wire [NUM_PORTS-1:0] creq_pmask; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] creq_wsel; + wire 
[NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] creq_byteen; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] creq_tid; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] creq_tag; wire creq_rw; wire [`LINE_ADDR_WIDTH-1:0] creq_addr; - wire [CORE_TAG_WIDTH-1:0] creq_tag; - + wire creq_valid, creq_ready; VX_elastic_buffer #( - .DATAW (CORE_TAG_WIDTH + 1 + `LINE_ADDR_WIDTH + (1 + `UP(`WORD_SELECT_BITS) + WORD_SIZE + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), + .DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)), .SIZE (CREQ_SIZE), .OUTPUT_REG (CREQ_SIZE > 2) ) core_req_queue ( @@ -123,8 +125,8 @@ module VX_bank #( .reset (reset), .ready_in (core_req_ready), .valid_in (core_req_valid), - .data_in ({core_req_tag, core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid}), - .data_out ({creq_tag, creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid}), + .data_in ({core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid, core_req_tag}), + .data_out ({creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid, creq_tag}), .ready_out (creq_ready), .valid_out (creq_valid) ); @@ -134,35 +136,33 @@ module VX_bank #( wire mshr_valid; wire [MSHR_ADDR_WIDTH-1:0] mshr_dequeue_id; wire [`LINE_ADDR_WIDTH-1:0] mshr_addr; - wire [CORE_TAG_WIDTH-1:0] mshr_tag; - wire [NUM_PORTS-1:0] mshr_pmask; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] mshr_wsel; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] mshr_tag; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mshr_wsel; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] mshr_tid; + wire [NUM_PORTS-1:0] mshr_pmask; wire [`LINE_ADDR_WIDTH-1:0] addr_st0, addr_st1; - wire mem_rw_st0, mem_rw_st1; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] wsel_st0, wsel_st1; + wire write_st0, write_st1; + 
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel_st0, wsel_st1; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen_st0, byteen_st1; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] req_tid_st0, req_tid_st1; wire [NUM_PORTS-1:0] pmask_st0, pmask_st1; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] tag_st0, tag_st1; wire [`CACHE_LINE_WIDTH-1:0] rdata_st1; wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1; wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1; - wire [CORE_TAG_WIDTH-1:0] tag_st0, tag_st1; wire valid_st0, valid_st1; wire is_fill_st0, is_fill_st1; wire is_mshr_st0, is_mshr_st1; wire miss_st0, miss_st1; - wire writeen_unqual_st1; wire is_flush_st0; wire mshr_pending_st0, mshr_pending_st1; wire crsq_valid, crsq_ready, crsq_stall; wire mreq_alm_full; - - wire creq_fire = creq_valid && creq_ready; - - wire fill_in_st0 = valid_st0 && is_fill_st0; + + wire rdw_fill_hazard = valid_st0 && is_fill_st0; + wire rdw_write_hazard = valid_st0 && write_st0 && ~creq_rw; // determine which queue to pop next in priority order wire mshr_grant = 1; @@ -174,24 +174,25 @@ module VX_bank #( wire creq_grant = !mshr_enable && !mrsq_enable && !flush_enable; wire mshr_ready = mshr_grant - && !fill_in_st0 // prevent tag read-during-write with fill - && !crsq_stall; // ensure core response ready + && !rdw_fill_hazard // prevent read-during-write + && !crsq_stall; // ensure core response ready assign mem_rsp_ready = mrsq_grant - && !crsq_stall; // ensure core response ready + && !crsq_stall; // ensure core response ready - assign creq_ready = creq_grant - && !mreq_alm_full // ensure memory request ready - && !mshr_alm_full // ensure mshr enqueue ready - && !crsq_stall; // ensure core response ready - - wire mshr_fire = mshr_valid && mshr_ready; + assign creq_ready = creq_grant + && !rdw_write_hazard // prevent read-during-write + && !mreq_alm_full // ensure memory request ready + && !mshr_alm_full // ensure mshr enqueue ready + && !crsq_stall; // ensure core response ready + wire mshr_fire = mshr_valid && 
mshr_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; + wire creq_fire = creq_valid && creq_ready; `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[`CACHE_REQ_INFO_RNG] : creq_tag[`CACHE_REQ_INFO_RNG]; + assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG]; end else begin assign {debug_wid_sel, debug_pc_sel} = 0; end @@ -219,7 +220,7 @@ module VX_bank #( end VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + MSHR_ADDR_WIDTH), + .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH), .RESETW (1) ) pipe_reg0 ( .clk (clk), @@ -230,7 +231,7 @@ module VX_bank #( flush_enable, mrsq_enable || flush_enable, mshr_enable, - mshr_enable ? 1'b0 : creq_rw, + creq_fire && creq_rw, mshr_enable ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)), (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data : creq_line_data, mshr_enable ? mshr_wsel : creq_wsel, @@ -240,12 +241,12 @@ module VX_bank #( mshr_enable ? mshr_tag : creq_tag, mshr_enable ? mshr_dequeue_id : (mem_rsp_valid ? 
mem_rsp_id : mshr_alloc_id) }), - .data_out ({valid_st0, is_flush_st0, is_fill_st0, is_mshr_st0, mem_rw_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) + .data_out ({valid_st0, is_flush_st0, is_fill_st0, is_mshr_st0, write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) ); `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st0, debug_pc_st0} = tag_st0[`CACHE_REQ_INFO_RNG]; + assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG]; end else begin assign {debug_wid_st0, debug_pc_st0} = 0; end @@ -286,35 +287,33 @@ module VX_bank #( assign miss_st0 = !is_fill_st0 && !tag_match_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + MSHR_ADDR_WIDTH + 1), + .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (!crsq_stall), - .data_in ({valid_st0, is_fill_st0, is_mshr_st0, is_fill_st0, miss_st0, mem_rw_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0, mshr_pending_st0}), - .data_out ({valid_st1, is_fill_st1, is_mshr_st1, writeen_unqual_st1, miss_st1, mem_rw_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) + .data_in ({valid_st0, is_fill_st0, is_mshr_st0, miss_st0, write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0, mshr_pending_st0}), + .data_out ({valid_st1, is_fill_st1, is_mshr_st1, miss_st1, write_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) ); `ifdef DBG_CACHE_REQ_INFO if 
(CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st1, debug_pc_st1} = tag_st1[`CACHE_REQ_INFO_RNG]; + assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG]; end else begin assign {debug_wid_st1, debug_pc_st1} = 0; end `endif - wire writeen_st1 = (WRITE_ENABLE && !is_fill_st1 && mem_rw_st1 && !miss_st1) - || writeen_unqual_st1; + wire read_st1 = !is_fill_st1 && !write_st1; - wire readen_st1 = !is_fill_st1 && !mem_rw_st1; + wire writeen_st1 = (WRITE_ENABLE && write_st1 && !miss_st1) + || is_fill_st1; - wire crsq_push_st1 = readen_st1 && !miss_st1; - - wire do_writeback_st1 = !is_fill_st1 && mem_rw_st1; + wire crsq_push_st1 = read_st1 && !miss_st1; - wire mreq_push_st1 = (readen_st1 && miss_st1 && !mshr_pending_st1) - || do_writeback_st1; + wire mreq_push_st1 = (read_st1 && miss_st1 && !mshr_pending_st1) + || write_st1; wire [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] line_byteen_st1; @@ -356,7 +355,7 @@ module VX_bank #( .addr (addr_st1), // reading - .readen (valid_st1 && readen_st1), + .readen (valid_st1 && read_st1), .rdata (rdata_st1), // writing @@ -368,8 +367,8 @@ module VX_bank #( wire mshr_allocate = creq_fire && ~creq_rw; wire mshr_replay = do_fill_st0 && ~crsq_stall; - wire mshr_lookup = valid_st0 && !is_fill_st0 && ~is_mshr_st0 && ~mem_rw_st0 && ~crsq_stall; - wire mshr_release = valid_st1 && readen_st1 && ~is_mshr_st1 && ~miss_st1 && ~crsq_stall; + wire mshr_lookup = valid_st0 && ~write_st0 && ~is_mshr_st0 && ~crsq_stall; + wire mshr_release = valid_st1 && read_st1 && ~is_mshr_st1 && ~miss_st1 && ~crsq_stall; wire mshr_not_full; @@ -433,7 +432,7 @@ module VX_bank #( wire [NUM_PORTS-1:0] crsq_pmask; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] crsq_data; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] crsq_tid; - wire [CORE_TAG_WIDTH-1:0] crsq_tag; + wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] crsq_tag; assign crsq_valid = valid_st1 && crsq_push_st1; assign crsq_stall = crsq_valid && !crsq_ready; @@ -451,7 +450,7 @@ module 
VX_bank #( end VX_elastic_buffer #( - .DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), + .DATAW ((CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), .SIZE (CRSQ_SIZE), .OUTPUT_REG (1 == NUM_BANKS) ) core_rsp_req ( @@ -467,24 +466,37 @@ module VX_bank #( // Enqueue memory request - wire [CACHE_LINE_SIZE-1:0] mreq_byteen; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mreq_data; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mreq_byteen; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mreq_wsel; wire [`LINE_ADDR_WIDTH-1:0] mreq_addr; wire [MSHR_ADDR_WIDTH-1:0] mreq_id; - wire [`CACHE_LINE_WIDTH-1:0] mreq_data; + wire mreq_push, mreq_pop, mreq_empty, mreq_rw; assign mreq_push = valid_st1 && mreq_push_st1; assign mreq_pop = mem_req_valid && mem_req_ready; - assign mreq_rw = WRITE_ENABLE && do_writeback_st1; - assign mreq_byteen = mreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}}; - assign mreq_addr = addr_st1; - assign mreq_id = mshr_id_st1; - assign mreq_data = wdata_st1; + assign mreq_rw = WRITE_ENABLE && write_st1; + assign mreq_addr = addr_st1; + assign mreq_id = mshr_id_st1; + assign mreq_wsel = wsel_st1; + + if (NUM_PORTS > 1) begin + for (genvar p = 0; p < NUM_PORTS; ++p) begin + assign mreq_byteen[p] = pmask_st1[p] ? 
byteen_st1[p] : WORD_SIZE'(0); + end + end else begin + assign mreq_byteen[0] = byteen_st1[0]; + end + + for (genvar p = 0; p < NUM_PORTS; ++p) begin + assign mreq_data[p] = wdata_st1[wsel_st1[p] * `WORD_WIDTH +: `WORD_WIDTH]; + end VX_fifo_queue #( - .DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + `CACHE_LINE_WIDTH), + .DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), .SIZE (MREQ_SIZE), .ALM_FULL (MREQ_SIZE-2) ) mem_req_queue ( @@ -492,8 +504,8 @@ module VX_bank #( .reset (reset), .push (mreq_push), .pop (mreq_pop), - .data_in ({mreq_rw, mreq_byteen, mreq_addr, mreq_id, mreq_data}), - .data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_id, mem_req_data}), + .data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}), + .data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}), .empty (mreq_empty), .alm_full (mreq_alm_full), `UNUSED_PIN (full), @@ -515,8 +527,8 @@ module VX_bank #( `SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); `ifdef PERF_ENABLE - assign perf_read_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && !mem_rw_st1; - assign perf_write_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && mem_rw_st1; + assign perf_read_misses = valid_st1 && read_st1 && !is_mshr_st1 && miss_st1; + assign perf_write_misses = valid_st1 && write_st1 && !is_mshr_st1 && miss_st1; assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full; `endif @@ -550,7 +562,7 @@ module VX_bank #( dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); end if (mreq_push) begin - if (do_writeback_st1) + if (write_st1) dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, 
data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1); else dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1); diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index eab2004e..2429d5a6 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -44,7 +44,9 @@ module VX_cache #( parameter BANK_ADDR_OFFSET = 0, // enable bypass for non-cacheable addresses - parameter NC_ENABLE = 0 + parameter NC_ENABLE = 0, + + localparam WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS) ) ( `SCOPE_IO_VX_cache @@ -105,6 +107,29 @@ module VX_cache #( /////////////////////////////////////////////////////////////////////////// + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_p; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_p; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_p; + + reg [CACHE_LINE_SIZE-1:0] mem_req_byteen_r; + reg [`CACHE_LINE_WIDTH-1:0] mem_req_data_r; + + always @(*) begin + mem_req_byteen_r = 0; + mem_req_data_r = 'x; + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (mem_req_byteen_p[p] != 0) begin + mem_req_byteen_r[mem_req_wsel_p[p] * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p[p]; + mem_req_data_r[mem_req_wsel_p[p] * `WORD_WIDTH +: `WORD_WIDTH] = mem_req_data_p[p]; + end + end + end + + assign mem_req_byteen = mem_req_byteen_r; + assign mem_req_data = mem_req_data_r; + + /////////////////////////////////////////////////////////////////////////// + // Core request wire [NUM_REQS-1:0] core_req_valid_nc; wire [NUM_REQS-1:0] core_req_rw_nc; @@ -124,9 +149,10 @@ module VX_cache #( // Memory request wire mem_req_valid_nc; wire mem_req_rw_nc; - wire [CACHE_LINE_SIZE-1:0] mem_req_byteen_nc; wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc; - wire [`CACHE_LINE_WIDTH-1:0] mem_req_data_nc; + wire 
[NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc; wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc; wire mem_req_ready_nc; @@ -138,6 +164,7 @@ module VX_cache #( if (NC_ENABLE) begin VX_nc_bypass #( + .NUM_PORTS (NUM_PORTS), .NUM_REQS (NUM_REQS), .NUM_RSP_TAGS (`CORE_RSP_TAGS), .NC_TAG_BIT (0), @@ -147,7 +174,7 @@ module VX_cache #( .CORE_TAG_IN_WIDTH (CORE_TAG_WIDTH), .MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH), - .MEM_DATA_SIZE (CACHE_LINE_SIZE), + .MEM_DATA_SIZE (CACHE_LINE_SIZE), .MEM_TAG_IN_WIDTH (MEM_TAG_IN_WIDTH), .MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH) ) nc_bypass ( @@ -188,19 +215,21 @@ module VX_cache #( // Memory request in .mem_req_valid_in (mem_req_valid_nc), - .mem_req_rw_in (mem_req_rw_nc), - .mem_req_byteen_in (mem_req_byteen_nc), + .mem_req_rw_in (mem_req_rw_nc), .mem_req_addr_in (mem_req_addr_nc), + .mem_req_byteen_in (mem_req_byteen_nc), + .mem_req_wsel_in (mem_req_wsel_nc), .mem_req_data_in (mem_req_data_nc), .mem_req_tag_in (mem_req_tag_nc), .mem_req_ready_in (mem_req_ready_nc), // Memory request out .mem_req_valid_out (mem_req_valid), - .mem_req_rw_out (mem_req_rw), - .mem_req_byteen_out (mem_req_byteen), + .mem_req_rw_out (mem_req_rw), .mem_req_addr_out (mem_req_addr), - .mem_req_data_out (mem_req_data), + .mem_req_byteen_out (mem_req_byteen_p), + .mem_req_wsel_out (mem_req_wsel_p), + .mem_req_data_out (mem_req_data_p), .mem_req_tag_out (mem_req_tag), .mem_req_ready_out (mem_req_ready), @@ -234,8 +263,9 @@ module VX_cache #( assign mem_req_valid = mem_req_valid_nc; assign mem_req_rw = mem_req_rw_nc; assign mem_req_addr = mem_req_addr_nc; - assign mem_req_byteen = mem_req_byteen_nc; - assign mem_req_data = mem_req_data_nc; + assign mem_req_byteen_p = mem_req_byteen_nc; + assign mem_req_wsel_p = mem_req_wsel_nc; + assign mem_req_data_p = mem_req_data_nc; assign mem_req_tag = mem_req_tag_nc; assign mem_req_ready_nc = mem_req_ready; @@ -293,28 
+323,29 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_core_req_valid; wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_pmask; - wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] per_bank_core_req_wsel; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_req_tag; wire [NUM_BANKS-1:0] per_bank_core_req_rw; - wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr; - wire [NUM_BANKS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_req_tag; + wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr; wire [NUM_BANKS-1:0] per_bank_core_req_ready; wire [NUM_BANKS-1:0] per_bank_core_rsp_valid; wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid; - wire [NUM_BANKS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_rsp_tag; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_rsp_tag; wire [NUM_BANKS-1:0] per_bank_core_rsp_ready; wire [NUM_BANKS-1:0] per_bank_mem_req_valid; wire [NUM_BANKS-1:0] per_bank_mem_req_rw; - wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_mem_req_byteen; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] per_bank_mem_req_wsel; wire [NUM_BANKS-1:0][`MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr; wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id; - wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_mem_req_data; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_mem_req_data; wire [NUM_BANKS-1:0] 
per_bank_mem_req_ready; wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready; @@ -365,28 +396,29 @@ module VX_cache #( for (genvar i = 0; i < NUM_BANKS; i++) begin wire curr_bank_core_req_valid; wire [NUM_PORTS-1:0] curr_bank_core_req_pmask; - wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] curr_bank_core_req_wsel; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] curr_bank_core_req_wsel; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] curr_bank_core_req_byteen; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_core_req_data; - wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_req_tid; + wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_req_tid; + wire [NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] curr_bank_core_req_tag; wire curr_bank_core_req_rw; - wire [`LINE_ADDR_WIDTH-1:0] curr_bank_core_req_addr; - wire [CORE_TAG_X_WIDTH-1:0] curr_bank_core_req_tag; + wire [`LINE_ADDR_WIDTH-1:0] curr_bank_core_req_addr; wire curr_bank_core_req_ready; wire curr_bank_core_rsp_valid; wire [NUM_PORTS-1:0] curr_bank_core_rsp_pmask; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_core_rsp_data; wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_rsp_tid; - wire [CORE_TAG_X_WIDTH-1:0] curr_bank_core_rsp_tag; + wire [NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] curr_bank_core_rsp_tag; wire curr_bank_core_rsp_ready; wire curr_bank_mem_req_valid; wire curr_bank_mem_req_rw; - wire [CACHE_LINE_SIZE-1:0] curr_bank_mem_req_byteen; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] curr_bank_mem_req_byteen; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] curr_bank_mem_req_wsel; wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr; wire [MSHR_ADDR_WIDTH-1:0] curr_bank_mem_req_id; - wire[`CACHE_LINE_WIDTH-1:0] curr_bank_mem_req_data; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_mem_req_data; wire curr_bank_mem_req_ready; wire curr_bank_mem_rsp_valid; @@ -419,6 +451,7 @@ module VX_cache #( assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid; assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw; assign per_bank_mem_req_byteen[i] = 
curr_bank_mem_req_byteen; + assign per_bank_mem_req_wsel[i] = curr_bank_mem_req_wsel; if (NUM_BANKS == 1) begin assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr; end else begin @@ -496,6 +529,7 @@ module VX_cache #( .mem_req_valid (curr_bank_mem_req_valid), .mem_req_rw (curr_bank_mem_req_rw), .mem_req_byteen (curr_bank_mem_req_byteen), + .mem_req_wsel (curr_bank_mem_req_wsel), .mem_req_addr (curr_bank_mem_req_addr), .mem_req_id (curr_bank_mem_req_id), .mem_req_data (curr_bank_mem_req_data), @@ -538,9 +572,9 @@ module VX_cache #( .core_rsp_ready (core_rsp_ready_nc) ); - wire [NUM_BANKS-1:0][(MEM_TAG_IN_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in; + wire [NUM_BANKS-1:0][(MEM_TAG_IN_WIDTH + 1 + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in; for (genvar i = 0; i < NUM_BANKS; i++) begin - assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_id[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_data[i]}; + assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_id[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_wsel[i], per_bank_mem_req_data[i]}; end wire [MSHR_ADDR_WIDTH-1:0] mem_req_id; @@ -549,7 +583,7 @@ module VX_cache #( VX_stream_arbiter #( .NUM_REQS (NUM_BANKS), - .DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH), + .DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), .BUFFERED (1) ) mem_req_arb ( .clk (clk), @@ -558,7 +592,7 @@ module VX_cache #( .data_in (data_in), .ready_in (per_bank_mem_req_ready), .valid_out (mem_req_valid_nc), - .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_byteen_nc, mem_req_data_nc}), + .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}), .ready_out (mem_req_ready_nc) ); diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 
52f4f06a..4679c642 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -9,8 +9,10 @@ `define REQS_BITS `LOG2UP(NUM_REQS) -// tag valid tid word_sel -`define MSHR_DATA_WIDTH (CORE_TAG_WIDTH + (1 + `REQS_BITS + `UP(`WORD_SELECT_BITS)) * NUM_PORTS) +`define PORTS_BITS `LOG2UP(NUM_PORTS) + +// tag valid tid word_sel +`define MSHR_DATA_WIDTH ((CORE_TAG_WIDTH + 1 + `REQS_BITS + `UP(`WORD_SELECT_BITS)) * NUM_PORTS) `define WORD_WIDTH (8 * WORD_SIZE) diff --git a/hw/rtl/cache/VX_core_req_bank_sel.v b/hw/rtl/cache/VX_core_req_bank_sel.v index 2ff9616e..f09aaea2 100644 --- a/hw/rtl/cache/VX_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_core_req_bank_sel.v @@ -43,7 +43,7 @@ module VX_core_req_bank_sel #( output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen, output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data, output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid, - output wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag, + output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag, input wire [`BANK_READY_COUNT-1:0] per_bank_core_req_ready ); `UNUSED_PARAM (CACHE_ID) @@ -80,9 +80,9 @@ module VX_core_req_bank_sel #( reg [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen_r; reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data_r; reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid_r; + reg [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r; reg [NUM_BANKS-1:0] per_bank_core_req_rw_r; reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr_r; - reg [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r; reg [NUM_REQS-1:0] core_req_ready_r; if (NUM_REQS > 1) begin @@ -129,10 +129,9 @@ module VX_core_req_bank_sel #( per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i]; 
per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i]; per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i); + per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i]; per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i]; per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i]; - per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i]; - req_select_table_r[core_req_bid[i]][i % NUM_PORTS] = (1 << i); end end @@ -177,9 +176,9 @@ module VX_core_req_bank_sel #( per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i]; per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i]; per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i); + per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i]; per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i]; - per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i]; - per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i]; + per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i]; end end end diff --git a/hw/rtl/cache/VX_core_rsp_merge.v b/hw/rtl/cache/VX_core_rsp_merge.v index 14823b0d..6fe84690 100644 --- a/hw/rtl/cache/VX_core_rsp_merge.v +++ b/hw/rtl/cache/VX_core_rsp_merge.v @@ -24,7 +24,7 @@ module VX_core_rsp_merge #( input wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask, input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data, input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid, - input wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag, + input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag, output wire [NUM_BANKS-1:0] per_bank_core_rsp_ready, // Core Response @@ -40,7 +40,7 @@ module VX_core_rsp_merge #( reg [NUM_REQS-1:0] core_rsp_valid_unqual; reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual; - reg [NUM_BANKS-1:0] 
core_rsp_bank_select; + reg [NUM_BANKS-1:0] per_bank_core_rsp_ready_r; if (CORE_TAG_ID_BITS != 0) begin @@ -51,61 +51,101 @@ module VX_core_rsp_merge #( reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; wire core_rsp_ready_unqual; - always @(*) begin - core_rsp_tag_unqual = 'x; - for (integer i = NUM_BANKS-1; i >= 0; --i) begin - if (per_bank_core_rsp_valid[i]) begin - core_rsp_tag_unqual = per_bank_core_rsp_tag[i]; - end - end - end - if (NUM_PORTS > 1) begin - always @(*) begin - core_rsp_valid_unqual = 0; - core_rsp_data_unqual = 'x; - core_rsp_bank_select = 0; - - for (integer i = 0; i < NUM_BANKS; i++) begin - for (integer p = 0; p < NUM_PORTS; p++) begin - if (per_bank_core_rsp_valid[i] - && per_bank_core_rsp_pmask[i][p] - && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin - core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1; - core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p]; - core_rsp_bank_select[i] = core_rsp_ready_unqual; + reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n; + + for (genvar i = 0; i < NUM_BANKS; ++i) begin + assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i]; + end + + always @(posedge clk) begin + if (reset) begin + per_bank_core_rsp_sent_r <= '0; + end else begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin + per_bank_core_rsp_sent_r[i] <= '0; + end else begin + per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i]; end end end end + always @(*) begin + core_rsp_tag_unqual = 'x; + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (per_bank_core_rsp_valid[i] + && per_bank_core_rsp_pmask[i][p] + && !per_bank_core_rsp_sent_r[i][p]) begin + core_rsp_tag_unqual = per_bank_core_rsp_tag[i][p]; 
+ end + end + end + end + + always @(*) begin + core_rsp_valid_unqual = 0; + core_rsp_data_unqual = 'x; + per_bank_core_rsp_sent = 0; + + for (integer i = 0; i < NUM_BANKS; ++i) begin + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (per_bank_core_rsp_valid[i] + && per_bank_core_rsp_pmask[i][p] + && !per_bank_core_rsp_sent_r[i][p] + && (per_bank_core_rsp_tag[i][p][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1; + core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p]; + per_bank_core_rsp_sent[i][p] = core_rsp_ready_unqual; + end + end + end + end + + always @(*) begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]); + end + end + end else begin `UNUSED_VAR (per_bank_core_rsp_pmask) - - always @(*) begin - core_rsp_valid_unqual = 0; - core_rsp_data_unqual = 'x; - core_rsp_bank_select = 0; - - for (integer i = 0; i < NUM_BANKS; i++) begin - if (per_bank_core_rsp_valid[i] - && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin - core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; - core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - core_rsp_bank_select[i] = core_rsp_ready_unqual; + + always @(*) begin + core_rsp_tag_unqual = 'x; + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + if (per_bank_core_rsp_valid[i]) begin + core_rsp_tag_unqual = per_bank_core_rsp_tag[i]; end end end - - end + + always @(*) begin + core_rsp_valid_unqual = 0; + core_rsp_data_unqual = 'x; + per_bank_core_rsp_ready_r = 0; + + for (integer i = 0; i < NUM_BANKS; i++) begin + if (per_bank_core_rsp_valid[i] + && (per_bank_core_rsp_tag[i][0][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; + core_rsp_data_unqual[per_bank_core_rsp_tid[i]] 
= per_bank_core_rsp_data[i]; + per_bank_core_rsp_ready_r[i] = core_rsp_ready_unqual; + end + end + end + end wire core_rsp_valid_any = (| per_bank_core_rsp_valid); VX_skid_buffer #( .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)) - ) pipe_reg ( + ) skid_buf ( .clk (clk), .reset (reset), .valid_in (core_rsp_valid_any), @@ -118,40 +158,102 @@ module VX_core_rsp_merge #( end else begin - `UNUSED_VAR (per_bank_core_rsp_pmask) - reg [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; - reg [NUM_REQS-1:0][NUM_BANKS-1:0] bank_select_table; - wire [NUM_REQS-1:0] core_rsp_ready_unqual; - always @(*) begin - core_rsp_valid_unqual = 0; - core_rsp_tag_unqual = 'x; - core_rsp_data_unqual = 'x; - bank_select_table = 'x; - - for (integer i = NUM_BANKS-1; i >= 0; --i) begin - if (per_bank_core_rsp_valid[i]) begin - core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; - core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; - core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - bank_select_table[per_bank_core_rsp_tid[i]] = (1 << i); - end - end - end + if (NUM_PORTS > 1) begin - always @(*) begin - for (integer i = 0; i < NUM_BANKS; i++) begin - core_rsp_bank_select[i] = core_rsp_ready_unqual[per_bank_core_rsp_tid[i]] - && bank_select_table[per_bank_core_rsp_tid[i]][i]; - end + reg [NUM_REQS-1:0][(`PORTS_BITS + `BANK_SELECT_BITS)-1:0] bank_select_table; + + reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n; + + for (genvar i = 0; i < NUM_BANKS; ++i) begin + assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i]; + end + + always @(posedge clk) begin + if (reset) begin + per_bank_core_rsp_sent_r <= '0; + end else begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin + per_bank_core_rsp_sent_r[i] <= '0; + end else begin 
+ per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i]; + end + end + end + end + + always @(*) begin + core_rsp_valid_unqual = '0; + core_rsp_tag_unqual = 'x; + core_rsp_data_unqual = 'x; + bank_select_table = 'x; + + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (per_bank_core_rsp_valid[i] + && per_bank_core_rsp_pmask[i][p] + && !per_bank_core_rsp_sent_r[i][p]) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1; + core_rsp_tag_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_tag[i][p]; + core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p]; + bank_select_table[per_bank_core_rsp_tid[i][p]] = {`PORTS_BITS'(p), `BANK_SELECT_BITS'(i)}; + end + end + end + end + + always @(*) begin + per_bank_core_rsp_sent = '0; + for (integer i = 0; i < NUM_REQS; i++) begin + if (core_rsp_valid_unqual[i]) begin + per_bank_core_rsp_sent[bank_select_table[i][0 +: `BANK_SELECT_BITS]][bank_select_table[i][`BANK_SELECT_BITS +: `PORTS_BITS]] = core_rsp_ready_unqual[i]; + end + end + end + + always @(*) begin + for (integer i = 0; i < NUM_BANKS; i++) begin + per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]); + end + end + + end else begin + + `UNUSED_VAR (per_bank_core_rsp_pmask) + reg [NUM_REQS-1:0][NUM_BANKS-1:0] bank_select_table; + + always @(*) begin + core_rsp_valid_unqual = 0; + core_rsp_tag_unqual = 'x; + core_rsp_data_unqual = 'x; + bank_select_table = 'x; + + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + if (per_bank_core_rsp_valid[i]) begin + core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; + core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; + core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; + bank_select_table[per_bank_core_rsp_tid[i][i]] = (1 << i); + end + end + end + + always @(*) begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + per_bank_core_rsp_ready_r[i] = 
core_rsp_ready_unqual[per_bank_core_rsp_tid[i]] + && bank_select_table[per_bank_core_rsp_tid[i]][i]; + end + end end for (genvar i = 0; i < NUM_REQS; i++) begin VX_skid_buffer #( .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH) - ) pipe_reg ( + ) skid_buf ( .clk (clk), .reset (reset), .valid_in (core_rsp_valid_unqual[i]), @@ -167,9 +269,7 @@ module VX_core_rsp_merge #( end - for (genvar i = 0; i < NUM_BANKS; i++) begin - assign per_bank_core_rsp_ready[i] = core_rsp_bank_select[i]; - end + assign per_bank_core_rsp_ready = per_bank_core_rsp_ready_r; end else begin diff --git a/hw/rtl/cache/VX_nc_bypass.v b/hw/rtl/cache/VX_nc_bypass.v index 93ef5847..f1e19df7 100644 --- a/hw/rtl/cache/VX_nc_bypass.v +++ b/hw/rtl/cache/VX_nc_bypass.v @@ -1,6 +1,7 @@ `include "VX_cache_define.vh" module VX_nc_bypass #( + parameter NUM_PORTS = 1, parameter NUM_REQS = 1, parameter NUM_RSP_TAGS = 0, parameter NC_TAG_BIT = 0, @@ -10,13 +11,14 @@ module VX_nc_bypass #( parameter CORE_TAG_IN_WIDTH = 1, parameter MEM_ADDR_WIDTH = 1, - parameter MEM_DATA_SIZE = 1, + parameter MEM_DATA_SIZE = 1, parameter MEM_TAG_IN_WIDTH = 1, parameter MEM_TAG_OUT_WIDTH = 1, - localparam CORE_DATA_WIDTH = CORE_DATA_SIZE * 8, - localparam MEM_DATA_WIDTH = MEM_DATA_SIZE * 8, - localparam CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - 1 + localparam CORE_DATA_WIDTH = CORE_DATA_SIZE * 8, + localparam MEM_DATA_WIDTH = MEM_DATA_SIZE * 8, + localparam CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - 1, + localparam MEM_SELECT_BITS = `UP(`CLOG2(MEM_DATA_SIZE / CORE_DATA_SIZE)) ) ( input wire clk, input wire reset, @@ -57,8 +59,9 @@ module VX_nc_bypass #( input wire mem_req_valid_in, input wire mem_req_rw_in, input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in, - input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in, - input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in, + input wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in, + input wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in, + input wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] 
mem_req_data_in, input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in, output wire mem_req_ready_in, @@ -66,8 +69,9 @@ module VX_nc_bypass #( output wire mem_req_valid_out, output wire mem_req_rw_out, output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out, - output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out, - output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out, + output wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_out, + output wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_out, + output wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_out, output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out, input wire mem_req_ready_out, @@ -148,7 +152,7 @@ module VX_nc_bypass #( assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid; assign mem_req_ready_in = mem_req_ready_out; - wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_nc; + wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_c; VX_bits_insert #( .N (MEM_TAG_IN_WIDTH), @@ -157,74 +161,66 @@ module VX_nc_bypass #( ) mem_req_tag_insert ( .data_in (mem_req_tag_in), .sel_in ('0), - .data_out (mem_req_tag_in_nc) + .data_out (mem_req_tag_in_c) ); + wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel; + wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel; + wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel; + wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel; + wire core_req_rw_in_sel; + if (NUM_REQS > 1) begin - - wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel; - wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel; - wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel; - wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel; - wire core_req_rw_in_sel; - wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in; for (genvar i = 0; i < NUM_REQS; ++i) begin assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]}; end assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = 
core_req_nc_mux_in[core_req_nc_tid]; - - assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel; - assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH]; - - for (genvar i = 0; i < P; ++i) begin - assign mem_req_data_out[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] = mem_req_valid_in ? - mem_req_data_in[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] : core_req_data_in_sel; - end - - if (D != 0) begin - wire [D-1:0] req_addr_idx = core_req_addr_in_sel[D-1:0]; - reg [MEM_DATA_SIZE-1:0] mem_req_byteen_in_r; - always @(*) begin - mem_req_byteen_in_r = 0; - mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in_sel; - end - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel}); - end else begin - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel}); - end end else begin - `UNUSED_VAR (core_req_nc_tid) + assign core_req_tag_in_sel = core_req_tag_in; + assign core_req_data_in_sel = core_req_data_in; + assign core_req_byteen_in_sel = core_req_byteen_in; + assign core_req_addr_in_sel = core_req_addr_in; + assign core_req_rw_in_sel = core_req_rw_in; + end + + assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel; + assign mem_req_addr_out = mem_req_valid_in ? 
mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH]; + + if (D != 0) begin + reg [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r; + reg [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in_r; + reg [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r; + + wire [D-1:0] req_addr_idx = core_req_addr_in_sel[D-1:0]; - assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in; - assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in[0][D +: MEM_ADDR_WIDTH]; + always @(*) begin + mem_req_byteen_in_r = 0; + mem_req_byteen_in_r[0] = core_req_byteen_in_sel; - for (genvar i = 0; i < P; ++i) begin - assign mem_req_data_out[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] = mem_req_valid_in ? - mem_req_data_in[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] : core_req_data_in; + mem_req_wsel_in_r = 'x; + mem_req_wsel_in_r[0] = req_addr_idx; + + mem_req_data_in_r = 'x; + mem_req_data_in_r[0] = core_req_data_in_sel; end - if (D != 0) begin - wire [D-1:0] req_addr_idx = core_req_addr_in[0][D-1:0]; - reg [MEM_DATA_SIZE-1:0] mem_req_byteen_in_r; - always @(*) begin - mem_req_byteen_in_r = 0; - mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in; - end - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({req_addr_idx, core_req_tag_in}); - end else begin - assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in; - assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'(core_req_tag_in); - end + assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; + assign mem_req_wsel_out = mem_req_valid_in ? mem_req_wsel_in : mem_req_wsel_in_r; + assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r; + assign mem_req_tag_out = mem_req_valid_in ? 
MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel}); + end else begin + `UNUSED_VAR (mem_req_wsel_in) + assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel; + assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel; + assign mem_req_wsel_out = 0; + assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel}); end // core response handling - wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out_unqual; + wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out_c; wire is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT]; @@ -236,7 +232,7 @@ module VX_nc_bypass #( ) core_rsp_tag_insert ( .data_in (core_rsp_tag_in[i]), .sel_in ('0), - .data_out (core_rsp_tag_out_unqual[i]) + .data_out (core_rsp_tag_out_c[i]) ); end @@ -262,14 +258,14 @@ module VX_nc_bypass #( for (genvar i = 0; i < NUM_REQS; ++i) begin assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in; end - end + end for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_out_unqual[i] : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; + assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_out_c[i] : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; end end else begin assign core_rsp_valid_out = core_rsp_valid_in || is_mem_rsp_nc; - assign core_rsp_tag_out = core_rsp_valid_in ? core_rsp_tag_out_unqual : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; + assign core_rsp_tag_out = core_rsp_valid_in ? 
core_rsp_tag_out_c : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0]; assign core_rsp_ready_in = core_rsp_ready_out; if (NUM_REQS > 1) begin diff --git a/hw/rtl/cache/VX_tag_access.v b/hw/rtl/cache/VX_tag_access.v index 708220ae..b0b4226a 100644 --- a/hw/rtl/cache/VX_tag_access.v +++ b/hw/rtl/cache/VX_tag_access.v @@ -48,8 +48,6 @@ module VX_tag_access #( VX_sp_ram #( .DATAW (`TAG_SELECT_BITS + 1), .SIZE (`LINES_PER_BANK), - .INIT_ENABLE (1), - .INIT_VALUE (0), .NO_RWCHECK (1) ) tag_store ( .clk( clk), From 5392395fbade31eaa20b18d47fa1d3d3aa9932d1 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 28 Aug 2021 23:13:50 -0700 Subject: [PATCH 12/16] minor update --- hw/rtl/cache/VX_cache.v | 45 +++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 2429d5a6..eca019bc 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -110,23 +110,42 @@ module VX_cache #( wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_p; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_p; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_p; + wire mem_req_rw_p; - reg [CACHE_LINE_SIZE-1:0] mem_req_byteen_r; - reg [`CACHE_LINE_WIDTH-1:0] mem_req_data_r; + if (WRITE_ENABLE) begin - always @(*) begin - mem_req_byteen_r = 0; - mem_req_data_r = 'x; - for (integer p = 0; p < NUM_PORTS; ++p) begin - if (mem_req_byteen_p[p] != 0) begin - mem_req_byteen_r[mem_req_wsel_p[p] * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p[p]; - mem_req_data_r[mem_req_wsel_p[p] * `WORD_WIDTH +: `WORD_WIDTH] = mem_req_data_p[p]; + reg [CACHE_LINE_SIZE-1:0] mem_req_byteen_r; + reg [`CACHE_LINE_WIDTH-1:0] mem_req_data_r; + + always @(*) begin + mem_req_byteen_r = 0; + mem_req_data_r = 'x; + for (integer p = 0; p < NUM_PORTS; ++p) begin + if (mem_req_byteen_p[p] != 0) begin + mem_req_byteen_r[mem_req_wsel_p[p] * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p[p]; + mem_req_data_r[mem_req_wsel_p[p] * `WORD_WIDTH +: 
`WORD_WIDTH] = mem_req_data_p[p]; + end end end + + assign mem_req_rw = mem_req_rw_p; + assign mem_req_byteen = mem_req_byteen_r; + assign mem_req_data = mem_req_data_r; + + end else begin + + `UNUSED_VAR (mem_req_byteen_p) + `UNUSED_VAR (mem_req_wsel_p) + `UNUSED_VAR (mem_req_data_p) + `UNUSED_VAR (mem_req_rw_p) + + assign mem_req_rw = 0; + assign mem_req_byteen = 'x; + assign mem_req_data = 'x; + end - assign mem_req_byteen = mem_req_byteen_r; - assign mem_req_data = mem_req_data_r; + /////////////////////////////////////////////////////////////////////////// @@ -225,8 +244,8 @@ module VX_cache #( // Memory request out .mem_req_valid_out (mem_req_valid), - .mem_req_rw_out (mem_req_rw), .mem_req_addr_out (mem_req_addr), + .mem_req_rw_out (mem_req_rw_p), .mem_req_byteen_out (mem_req_byteen_p), .mem_req_wsel_out (mem_req_wsel_p), .mem_req_data_out (mem_req_data_p), @@ -261,8 +280,8 @@ module VX_cache #( assign core_rsp_ready_nc = core_rsp_ready; assign mem_req_valid = mem_req_valid_nc; - assign mem_req_rw = mem_req_rw_nc; assign mem_req_addr = mem_req_addr_nc; + assign mem_req_rw_p = mem_req_rw_nc; assign mem_req_byteen_p = mem_req_byteen_nc; assign mem_req_wsel_p = mem_req_wsel_nc; assign mem_req_data_p = mem_req_data_nc; From e26cfab04dd85ceccabaf6a242fe94a28c16867a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 29 Aug 2021 02:25:55 -0700 Subject: [PATCH 13/16] bank area optimization --- hw/rtl/cache/VX_bank.v | 64 +++++++++++++++-------------------- hw/rtl/cache/VX_data_access.v | 27 +++++++++------ 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 933f189e..848a4278 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -149,7 +149,7 @@ module VX_bank #( wire [NUM_PORTS-1:0] pmask_st0, pmask_st1; wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] tag_st0, tag_st1; wire [`CACHE_LINE_WIDTH-1:0] rdata_st1; - wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1; + wire [`CACHE_LINE_WIDTH-1:0] 
wdata_st0, wdata_st1; wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1; wire valid_st0, valid_st1; wire is_fill_st0, is_fill_st1; @@ -198,27 +198,6 @@ module VX_bank #( end `endif - wire [`CACHE_LINE_WIDTH-1:0] creq_line_data; - - if (`WORDS_PER_LINE > 1) begin - if (NUM_PORTS > 1) begin - reg [`CACHE_LINE_WIDTH-1:0] creq_line_data_r; - always @(*) begin - creq_line_data_r = 'x; - for (integer p = 0; p < NUM_PORTS; p++) begin - if (creq_pmask[p]) begin - creq_line_data_r[creq_wsel[p] * `WORD_WIDTH +: `WORD_WIDTH] = creq_data[p]; - end - end - end - assign creq_line_data = creq_line_data_r; - end else begin - assign creq_line_data = {`WORDS_PER_LINE{creq_data}}; - end - end else begin - assign creq_line_data = creq_data; - end - VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH), .RESETW (1) @@ -233,7 +212,7 @@ module VX_bank #( mshr_enable, creq_fire && creq_rw, mshr_enable ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)), - (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data : creq_line_data, + (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data : `CACHE_LINE_WIDTH'(creq_data), mshr_enable ? mshr_wsel : creq_wsel, creq_byteen, mshr_enable ? 
mshr_tid : creq_tid, @@ -315,23 +294,36 @@ module VX_bank #( wire mreq_push_st1 = (read_st1 && miss_st1 && !mshr_pending_st1) || write_st1; - wire [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] line_byteen_st1; + wire [`CACHE_LINE_WIDTH-1:0] line_wdata_st1; + wire [CACHE_LINE_SIZE-1:0] line_byteen_st1; + + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data_st1 = wdata_st1[0 +: NUM_PORTS * `WORD_WIDTH]; if (`WORDS_PER_LINE > 1) begin + reg [`CACHE_LINE_WIDTH-1:0] line_wdata_r; reg [CACHE_LINE_SIZE-1:0] line_byteen_r; always @(*) begin - line_byteen_r = 0; - for (integer p = 0; p < NUM_PORTS; p++) begin - if ((NUM_PORTS == 1) || pmask_st1[p]) begin - line_byteen_r[wsel_st1[p] * WORD_SIZE +: WORD_SIZE] = byteen_st1[p]; + line_wdata_r = 'x; + line_byteen_r = 0; + if (NUM_PORTS > 1) begin + for (integer p = 0; p < NUM_PORTS; p++) begin + if (creq_pmask[p]) begin + line_wdata_r[creq_wsel[p] * `WORD_WIDTH +: `WORD_WIDTH] = creq_data_st1[p]; + line_byteen_r[wsel_st1[p] * WORD_SIZE +: WORD_SIZE] = byteen_st1[p]; + end end + end else begin + line_wdata_r = {`WORDS_PER_LINE{creq_data_st1}}; + line_byteen_r[wsel_st1[0] * WORD_SIZE +: WORD_SIZE] = byteen_st1[0]; end end + assign line_wdata_st1 = line_wdata_r; assign line_byteen_st1 = line_byteen_r; end else begin - assign line_byteen_st1 = byteen_st1; `UNUSED_VAR (wsel_st1) - end + assign line_wdata_st1 = creq_data_st1; + assign line_byteen_st1 = byteen_st1; + end VX_data_access #( .BANK_ID (BANK_ID), @@ -356,13 +348,14 @@ module VX_bank #( // reading .readen (valid_st1 && read_st1), - .rdata (rdata_st1), + .read_data (rdata_st1), // writing .writeen (valid_st1 && writeen_st1), .is_fill (is_fill_st1), - .byteen (line_byteen_st1), - .wdata (wdata_st1) + .byteen (line_byteen_st1), + .write_data (line_wdata_st1), + .fill_data (wdata_st1) ); wire mshr_allocate = creq_fire && ~creq_rw; @@ -482,6 +475,7 @@ module VX_bank #( assign mreq_addr = addr_st1; assign mreq_id = mshr_id_st1; assign mreq_wsel = wsel_st1; + assign mreq_data = creq_data_st1; 
if (NUM_PORTS > 1) begin for (genvar p = 0; p < NUM_PORTS; ++p) begin @@ -490,10 +484,6 @@ module VX_bank #( end else begin assign mreq_byteen[0] = byteen_st1[0]; end - - for (genvar p = 0; p < NUM_PORTS; ++p) begin - assign mreq_data[p] = wdata_st1[wsel_st1[p] * `WORD_WIDTH +: `WORD_WIDTH]; - end VX_fifo_queue #( .DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), diff --git a/hw/rtl/cache/VX_data_access.v b/hw/rtl/cache/VX_data_access.v index 64541f7b..249d2eaf 100644 --- a/hw/rtl/cache/VX_data_access.v +++ b/hw/rtl/cache/VX_data_access.v @@ -32,13 +32,14 @@ module VX_data_access #( // reading input wire readen, - output wire [`CACHE_LINE_WIDTH-1:0] rdata, + output wire [`CACHE_LINE_WIDTH-1:0] read_data, // writing input wire writeen, input wire is_fill, input wire [CACHE_LINE_SIZE-1:0] byteen, - input wire [`CACHE_LINE_WIDTH-1:0] wdata + input wire [`CACHE_LINE_WIDTH-1:0] write_data, + input wire [`CACHE_LINE_WIDTH-1:0] fill_data ); `UNUSED_PARAM (CACHE_ID) @@ -50,16 +51,20 @@ module VX_data_access #( localparam BYTEENW = WRITE_ENABLE ? CACHE_LINE_SIZE : 1; wire [`LINE_SELECT_BITS-1:0] line_addr; - wire [BYTEENW-1:0] byte_enable; + wire [`CACHE_LINE_WIDTH-1:0] wdata; + wire [BYTEENW-1:0] wren; assign line_addr = addr[`LINE_SELECT_BITS-1:0]; if (WRITE_ENABLE) begin - assign byte_enable = is_fill ? {BYTEENW{1'b1}} : byteen; + assign wren = is_fill ? {BYTEENW{writeen}} : (byteen & {BYTEENW{writeen}}); + assign wdata = is_fill ? 
fill_data : write_data; end else begin - `UNUSED_VAR (byteen) `UNUSED_VAR (is_fill) - assign byte_enable = 1'b1; + `UNUSED_VAR (byteen) + `UNUSED_VAR (write_data) + assign wren = writeen; + assign wdata = fill_data; end VX_sp_ram #( @@ -70,10 +75,10 @@ module VX_data_access #( ) data_store ( .clk (clk), .addr (line_addr), - .wren ({BYTEENW{writeen}} & byte_enable), + .wren (wren), .wdata (wdata), .rden (1'b1), - .rdata (rdata) + .rdata (read_data) ); `UNUSED_VAR (stall) @@ -82,13 +87,13 @@ module VX_data_access #( always @(posedge clk) begin if (writeen && ~stall) begin if (is_fill) begin - dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, wdata); + dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); end else begin - dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byte_enable, line_addr, wdata); + dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byte_enable, line_addr, write_data); end end if (readen && ~stall) begin - dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, rdata); + dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, read_data); end end `endif From 90b50277d07ed6aabd1725d774a2d80cb9e6e9e9 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 29 Aug 2021 18:33:49 -0700 Subject: [PATCH 14/16] cache multi-porting 
fixes + optimization --- ci/regression.sh | 16 +++++--- hw/rtl/VX_config.vh | 2 +- hw/rtl/VX_define.vh | 4 +- hw/rtl/cache/VX_bank.v | 66 ++++++++++++++++++-------------- hw/rtl/cache/VX_cache.v | 66 +++++++++++++++++++------------- hw/rtl/cache/VX_core_rsp_merge.v | 2 +- hw/rtl/cache/VX_data_access.v | 4 +- hw/rtl/cache/VX_nc_bypass.v | 9 ++++- 8 files changed, 100 insertions(+), 69 deletions(-) diff --git a/ci/regression.sh b/ci/regression.sh index 11f5d21a..7821d626 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -46,8 +46,8 @@ debug() { echo "begin debugging tests..." -./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1" -./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1" +./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1" +./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1" ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1" echo "debugging tests done!" 
@@ -72,14 +72,18 @@ FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood # using FPNEW FPU core FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood +# adjust l1 block size to match l2 +CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1" + # test cache banking -CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo -CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo +CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr +CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr # test cache multi-porting CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr -CONFIGS="-DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr -CONFIGS="-DL2NUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr +CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1" +CONFIGS="-DL2NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr +CONFIGS="-DL2NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr # test 128-bit MEM block CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 5124116e..1d16c7f7 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -38,7 +38,7 @@ `endif `ifndef L1_BLOCK_SIZE -`define L1_BLOCK_SIZE (`NUM_THREADS * 4) +`define L1_BLOCK_SIZE ((`L2_ENABLE || `L3_ENABLE) ? 
(`NUM_THREADS * 4) : `MEM_BLOCK_SIZE) `endif `ifndef STARTUP_ADDR diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 94287911..87f0001d 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -251,7 +251,7 @@ `define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0) // Block size in bytes -`define ICACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE) +`define ICACHE_LINE_SIZE `L1_BLOCK_SIZE // Word size in bytes `define IWORD_SIZE 4 @@ -289,7 +289,7 @@ `define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1) // Block size in bytes -`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE) +`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE // Word size in bytes `define DWORD_SIZE 4 diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 848a4278..f5497e0c 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -77,6 +77,7 @@ module VX_bank #( // Memory request output wire mem_req_valid, output wire mem_req_rw, + output wire [NUM_PORTS-1:0] mem_req_pmask, output wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen, output wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel, output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr, @@ -161,6 +162,7 @@ module VX_bank #( wire crsq_valid, crsq_ready, crsq_stall; wire mreq_alm_full; + // prevent read-during-write hazard when accessing tags/data block RAMs wire rdw_fill_hazard = valid_st0 && is_fill_st0; wire rdw_write_hazard = valid_st0 && write_st0 && ~creq_rw; @@ -174,14 +176,14 @@ module VX_bank #( wire creq_grant = !mshr_enable && !mrsq_enable && !flush_enable; wire mshr_ready = mshr_grant - && !rdw_fill_hazard // prevent read-during-write + && !rdw_fill_hazard // prevent read-during-write hazard && !crsq_stall; // ensure core response ready assign mem_rsp_ready = mrsq_grant && !crsq_stall; // ensure core response ready assign creq_ready = creq_grant - && !rdw_write_hazard // prevent read-during-write + && 
!rdw_write_hazard // prevent read-during-write hazard && !mreq_alm_full // ensure memory request ready && !mshr_alm_full // ensure mshr enqueue ready && !crsq_stall; // ensure core response ready @@ -198,6 +200,12 @@ module VX_bank #( end `endif + wire [`CACHE_LINE_WIDTH-1:0] wdata_sel; + assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data; + for (genvar i = NUM_PORTS * `WORD_WIDTH; i < `CACHE_LINE_WIDTH; ++i) begin + assign wdata_sel[i] = mem_rsp_data[i]; + end + VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH), .RESETW (1) @@ -212,7 +220,7 @@ module VX_bank #( mshr_enable, creq_fire && creq_rw, mshr_enable ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)), - (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data : `CACHE_LINE_WIDTH'(creq_data), + wdata_sel, mshr_enable ? mshr_wsel : creq_wsel, creq_byteen, mshr_enable ? 
mshr_tid : creq_tid, @@ -265,6 +273,8 @@ module VX_bank #( // we have a core request hit assign miss_st0 = !is_fill_st0 && !tag_match_st0; + wire read_st0 = !is_fill_st0 && !write_st0; + VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH + 1), .RESETW (1) @@ -302,19 +312,22 @@ module VX_bank #( if (`WORDS_PER_LINE > 1) begin reg [`CACHE_LINE_WIDTH-1:0] line_wdata_r; reg [CACHE_LINE_SIZE-1:0] line_byteen_r; - always @(*) begin - line_wdata_r = 'x; - line_byteen_r = 0; - if (NUM_PORTS > 1) begin - for (integer p = 0; p < NUM_PORTS; p++) begin - if (creq_pmask[p]) begin - line_wdata_r[creq_wsel[p] * `WORD_WIDTH +: `WORD_WIDTH] = creq_data_st1[p]; - line_byteen_r[wsel_st1[p] * WORD_SIZE +: WORD_SIZE] = byteen_st1[p]; + if (NUM_PORTS > 1) begin + always @(*) begin + line_wdata_r = 'x; + line_byteen_r = 0; + for (integer i = 0; i < NUM_PORTS; ++i) begin + if (pmask_st1[i]) begin + line_wdata_r[wsel_st1[i] * `WORD_WIDTH +: `WORD_WIDTH] = creq_data_st1[i]; + line_byteen_r[wsel_st1[i] * WORD_SIZE +: WORD_SIZE] = byteen_st1[i]; end end - end else begin + end + end else begin + always @(*) begin line_wdata_r = {`WORDS_PER_LINE{creq_data_st1}}; - line_byteen_r[wsel_st1[0] * WORD_SIZE +: WORD_SIZE] = byteen_st1[0]; + line_byteen_r = 0; + line_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1; end end assign line_wdata_st1 = line_wdata_r; @@ -360,8 +373,8 @@ module VX_bank #( wire mshr_allocate = creq_fire && ~creq_rw; wire mshr_replay = do_fill_st0 && ~crsq_stall; - wire mshr_lookup = valid_st0 && ~write_st0 && ~is_mshr_st0 && ~crsq_stall; - wire mshr_release = valid_st1 && read_st1 && ~is_mshr_st1 && ~miss_st1 && ~crsq_stall; + wire mshr_lookup = valid_st0 && read_st0 && !is_mshr_st0 && !crsq_stall; + wire mshr_release = valid_st1 && read_st1 && !is_mshr_st1 && !miss_st1 && !crsq_stall; wire mshr_not_full; @@ -435,15 +448,15 @@ module 
VX_bank #( assign crsq_tag = tag_st1; if (`WORDS_PER_LINE > 1) begin - for (genvar p = 0; p < NUM_PORTS; ++p) begin - assign crsq_data[p] = rdata_st1[wsel_st1[p] * `WORD_WIDTH +: `WORD_WIDTH]; + for (genvar i = 0; i < NUM_PORTS; ++i) begin + assign crsq_data[i] = rdata_st1[wsel_st1[i] * `WORD_WIDTH +: `WORD_WIDTH]; end end else begin assign crsq_data = rdata_st1; end VX_elastic_buffer #( - .DATAW ((CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), + .DATAW (NUM_PORTS * (CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS)), .SIZE (CRSQ_SIZE), .OUTPUT_REG (1 == NUM_BANKS) ) core_rsp_req ( @@ -462,6 +475,7 @@ module VX_bank #( wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mreq_data; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mreq_byteen; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mreq_wsel; + wire [NUM_PORTS-1:0] mreq_pmask; wire [`LINE_ADDR_WIDTH-1:0] mreq_addr; wire [MSHR_ADDR_WIDTH-1:0] mreq_id; @@ -474,19 +488,13 @@ module VX_bank #( assign mreq_rw = WRITE_ENABLE && write_st1; assign mreq_addr = addr_st1; assign mreq_id = mshr_id_st1; + assign mreq_pmask= pmask_st1; assign mreq_wsel = wsel_st1; + assign mreq_byteen = byteen_st1; assign mreq_data = creq_data_st1; - if (NUM_PORTS > 1) begin - for (genvar p = 0; p < NUM_PORTS; ++p) begin - assign mreq_byteen[p] = pmask_st1[p] ? 
byteen_st1[p] : WORD_SIZE'(0); - end - end else begin - assign mreq_byteen[0] = byteen_st1[0]; - end - VX_fifo_queue #( - .DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), + .DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), .SIZE (MREQ_SIZE), .ALM_FULL (MREQ_SIZE-2) ) mem_req_queue ( @@ -494,8 +502,8 @@ module VX_bank #( .reset (reset), .push (mreq_push), .pop (mreq_pop), - .data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}), - .data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}), + .data_in ({mreq_rw, mreq_addr, mreq_id, mreq_pmask, mreq_byteen, mreq_wsel, mreq_data}), + .data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_pmask, mem_req_byteen, mem_req_wsel, mem_req_data}), .empty (mreq_empty), .alm_full (mreq_alm_full), `UNUSED_PIN (full), diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index eca019bc..e36155eb 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -107,34 +107,41 @@ module VX_cache #( /////////////////////////////////////////////////////////////////////////// - wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_p; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_p; + wire [NUM_PORTS-1:0] mem_req_pmask_p; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_p; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_p; - wire mem_req_rw_p; + wire mem_req_rw_p; if (WRITE_ENABLE) begin + if (`WORDS_PER_LINE > 1) begin + reg [CACHE_LINE_SIZE-1:0] mem_req_byteen_r; + reg [`CACHE_LINE_WIDTH-1:0] mem_req_data_r; - reg [CACHE_LINE_SIZE-1:0] mem_req_byteen_r; - reg [`CACHE_LINE_WIDTH-1:0] mem_req_data_r; - - always @(*) begin - mem_req_byteen_r = 0; - mem_req_data_r = 'x; - for (integer p = 0; p < NUM_PORTS; ++p) begin - if (mem_req_byteen_p[p] != 0) begin - mem_req_byteen_r[mem_req_wsel_p[p] * WORD_SIZE +: WORD_SIZE] = 
mem_req_byteen_p[p]; - mem_req_data_r[mem_req_wsel_p[p] * `WORD_WIDTH +: `WORD_WIDTH] = mem_req_data_p[p]; + always @(*) begin + mem_req_byteen_r = 0; + mem_req_data_r = 'x; + for (integer i = 0; i < NUM_PORTS; ++i) begin + if ((1 == NUM_PORTS) || mem_req_pmask_p[i]) begin + mem_req_byteen_r[mem_req_wsel_p[i] * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p[i]; + mem_req_data_r[mem_req_wsel_p[i] * `WORD_WIDTH +: `WORD_WIDTH] = mem_req_data_p[i]; + end end end + + assign mem_req_rw = mem_req_rw_p; + assign mem_req_byteen = mem_req_byteen_r; + assign mem_req_data = mem_req_data_r; + end else begin + `UNUSED_VAR (mem_req_pmask_p) + `UNUSED_VAR (mem_req_wsel_p) + assign mem_req_rw = mem_req_rw_p; + assign mem_req_byteen = mem_req_byteen_p; + assign mem_req_data = mem_req_data_p; end - - assign mem_req_rw = mem_req_rw_p; - assign mem_req_byteen = mem_req_byteen_r; - assign mem_req_data = mem_req_data_r; - end else begin - `UNUSED_VAR (mem_req_byteen_p) + `UNUSED_VAR (mem_req_pmask_p) `UNUSED_VAR (mem_req_wsel_p) `UNUSED_VAR (mem_req_data_p) `UNUSED_VAR (mem_req_rw_p) @@ -142,7 +149,6 @@ module VX_cache #( assign mem_req_rw = 0; assign mem_req_byteen = 'x; assign mem_req_data = 'x; - end @@ -169,7 +175,8 @@ module VX_cache #( wire mem_req_valid_nc; wire mem_req_rw_nc; wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc; - wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc; + wire [NUM_PORTS-1:0] mem_req_pmask_nc; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc; wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc; wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc; @@ -236,6 +243,7 @@ module VX_cache #( .mem_req_valid_in (mem_req_valid_nc), .mem_req_rw_in (mem_req_rw_nc), .mem_req_addr_in (mem_req_addr_nc), + .mem_req_pmask_in (mem_req_pmask_nc), .mem_req_byteen_in (mem_req_byteen_nc), .mem_req_wsel_in (mem_req_wsel_nc), .mem_req_data_in (mem_req_data_nc), @@ -246,6 +254,7 @@ module VX_cache #( .mem_req_valid_out 
(mem_req_valid), .mem_req_addr_out (mem_req_addr), .mem_req_rw_out (mem_req_rw_p), + .mem_req_pmask_out (mem_req_pmask_p), .mem_req_byteen_out (mem_req_byteen_p), .mem_req_wsel_out (mem_req_wsel_p), .mem_req_data_out (mem_req_data_p), @@ -282,6 +291,7 @@ module VX_cache #( assign mem_req_valid = mem_req_valid_nc; assign mem_req_addr = mem_req_addr_nc; assign mem_req_rw_p = mem_req_rw_nc; + assign mem_req_pmask_p = mem_req_pmask_nc; assign mem_req_byteen_p = mem_req_byteen_nc; assign mem_req_wsel_p = mem_req_wsel_nc; assign mem_req_data_p = mem_req_data_nc; @@ -360,7 +370,8 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_mem_req_valid; wire [NUM_BANKS-1:0] per_bank_mem_req_rw; - wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_mem_req_pmask; + wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] per_bank_mem_req_wsel; wire [NUM_BANKS-1:0][`MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr; wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id; @@ -433,6 +444,7 @@ module VX_cache #( wire curr_bank_mem_req_valid; wire curr_bank_mem_req_rw; + wire [NUM_PORTS-1:0] curr_bank_mem_req_pmask; wire [NUM_PORTS-1:0][WORD_SIZE-1:0] curr_bank_mem_req_byteen; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] curr_bank_mem_req_wsel; wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr; @@ -469,6 +481,7 @@ module VX_cache #( // Memory request assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid; assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw; + assign per_bank_mem_req_pmask[i] = curr_bank_mem_req_pmask; assign per_bank_mem_req_byteen[i] = curr_bank_mem_req_byteen; assign per_bank_mem_req_wsel[i] = curr_bank_mem_req_wsel; if (NUM_BANKS == 1) begin @@ -547,6 +560,7 @@ module VX_cache #( // Memory request .mem_req_valid (curr_bank_mem_req_valid), .mem_req_rw (curr_bank_mem_req_rw), + .mem_req_pmask (curr_bank_mem_req_pmask), 
.mem_req_byteen (curr_bank_mem_req_byteen), .mem_req_wsel (curr_bank_mem_req_wsel), .mem_req_addr (curr_bank_mem_req_addr), @@ -591,9 +605,9 @@ module VX_cache #( .core_rsp_ready (core_rsp_ready_nc) ); - wire [NUM_BANKS-1:0][(MEM_TAG_IN_WIDTH + 1 + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in; - for (genvar i = 0; i < NUM_BANKS; i++) begin - assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_id[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_wsel[i], per_bank_mem_req_data[i]}; + wire [NUM_BANKS-1:0][(MEM_TAG_IN_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in; + for (genvar i = 0; i < NUM_BANKS; ++i) begin + assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_id[i], per_bank_mem_req_rw[i], per_bank_mem_req_pmask[i], per_bank_mem_req_byteen[i], per_bank_mem_req_wsel[i], per_bank_mem_req_data[i]}; end wire [MSHR_ADDR_WIDTH-1:0] mem_req_id; @@ -602,7 +616,7 @@ module VX_cache #( VX_stream_arbiter #( .NUM_REQS (NUM_BANKS), - .DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), + .DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)), .BUFFERED (1) ) mem_req_arb ( .clk (clk), @@ -611,7 +625,7 @@ module VX_cache #( .data_in (data_in), .ready_in (per_bank_mem_req_ready), .valid_out (mem_req_valid_nc), - .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}), + .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_pmask_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}), .ready_out (mem_req_ready_nc) ); diff --git a/hw/rtl/cache/VX_core_rsp_merge.v b/hw/rtl/cache/VX_core_rsp_merge.v index 6fe84690..826cbb86 100644 --- a/hw/rtl/cache/VX_core_rsp_merge.v +++ b/hw/rtl/cache/VX_core_rsp_merge.v @@ -237,7 +237,7 @@ module VX_core_rsp_merge #( 
core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - bank_select_table[per_bank_core_rsp_tid[i][i]] = (1 << i); + bank_select_table[per_bank_core_rsp_tid[i]] = (1 << i); end end end diff --git a/hw/rtl/cache/VX_data_access.v b/hw/rtl/cache/VX_data_access.v index 249d2eaf..36f33938 100644 --- a/hw/rtl/cache/VX_data_access.v +++ b/hw/rtl/cache/VX_data_access.v @@ -73,7 +73,7 @@ module VX_data_access #( .BYTEENW (BYTEENW), .NO_RWCHECK (1) ) data_store ( - .clk (clk), + .clk (clk), .addr (line_addr), .wren (wren), .wdata (wdata), @@ -89,7 +89,7 @@ module VX_data_access #( if (is_fill) begin dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); end else begin - dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byte_enable, line_addr, write_data); + dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, wren, line_addr, write_data); end end if (readen && ~stall) begin diff --git a/hw/rtl/cache/VX_nc_bypass.v b/hw/rtl/cache/VX_nc_bypass.v index f1e19df7..9ab575e2 100644 --- a/hw/rtl/cache/VX_nc_bypass.v +++ b/hw/rtl/cache/VX_nc_bypass.v @@ -59,7 +59,8 @@ module VX_nc_bypass #( input wire mem_req_valid_in, input wire mem_req_rw_in, input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in, - input wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in, + input wire [NUM_PORTS-1:0] mem_req_pmask_in, + input wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in, input wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in, input wire 
[NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in, input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in, @@ -69,6 +70,7 @@ module VX_nc_bypass #( output wire mem_req_valid_out, output wire mem_req_rw_out, output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out, + output wire [NUM_PORTS-1:0] mem_req_pmask_out, output wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_out, output wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_out, output wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_out, @@ -188,7 +190,7 @@ module VX_nc_bypass #( assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel; assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH]; - if (D != 0) begin + if (D != 0) begin reg [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r; reg [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in_r; reg [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r; @@ -206,12 +208,15 @@ module VX_nc_bypass #( mem_req_data_in_r[0] = core_req_data_in_sel; end + assign mem_req_pmask_out = mem_req_valid_in ? mem_req_pmask_in : NUM_PORTS'(1'b1); assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; assign mem_req_wsel_out = mem_req_valid_in ? mem_req_wsel_in : mem_req_wsel_in_r; assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r; assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel}); end else begin `UNUSED_VAR (mem_req_wsel_in) + `UNUSED_VAR (mem_req_pmask_in) + assign mem_req_pmask_out = 0; assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel; assign mem_req_data_out = mem_req_valid_in ? 
mem_req_data_in : core_req_data_in_sel; assign mem_req_wsel_out = 0; From a801a16062eb1fbc45896949e4fb1f0a0f63ccd6 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 29 Aug 2021 20:07:34 -0700 Subject: [PATCH 15/16] instruction decode refactoring fixing naming collision --- hw/rtl/VX_alu_unit.v | 32 ++-- hw/rtl/VX_csr_data.v | 18 +- hw/rtl/VX_csr_unit.v | 6 +- hw/rtl/VX_decode.v | 124 +++++++------- hw/rtl/VX_define.vh | 245 +++++++++++++-------------- hw/rtl/VX_execute.v | 6 +- hw/rtl/VX_fpu_unit.v | 4 +- hw/rtl/VX_gpu_unit.v | 8 +- hw/rtl/VX_ibuffer.v | 2 +- hw/rtl/VX_instr_demux.v | 32 ++-- hw/rtl/VX_lsu_unit.v | 22 +-- hw/rtl/VX_muldiv.v | 14 +- hw/rtl/VX_print_instr.vh | 156 ++++++++--------- hw/rtl/VX_types.vh | 2 +- hw/rtl/fp_cores/VX_fp_cvt.v | 10 +- hw/rtl/fp_cores/VX_fp_div.v | 2 +- hw/rtl/fp_cores/VX_fp_fma.v | 2 +- hw/rtl/fp_cores/VX_fp_ncomp.v | 30 ++-- hw/rtl/fp_cores/VX_fp_rounding.v | 12 +- hw/rtl/fp_cores/VX_fp_sqrt.v | 2 +- hw/rtl/fp_cores/VX_fpu_dpi.v | 34 ++-- hw/rtl/fp_cores/VX_fpu_fpga.v | 30 ++-- hw/rtl/fp_cores/VX_fpu_fpnew.v | 50 +++--- hw/rtl/interfaces/VX_alu_req_if.v | 4 +- hw/rtl/interfaces/VX_csr_req_if.v | 2 +- hw/rtl/interfaces/VX_decode_if.v | 4 +- hw/rtl/interfaces/VX_fpu_req_if.v | 4 +- hw/rtl/interfaces/VX_fpu_to_csr_if.v | 2 +- hw/rtl/interfaces/VX_gpu_req_if.v | 2 +- hw/rtl/interfaces/VX_ibuffer_if.v | 4 +- hw/rtl/interfaces/VX_lsu_req_if.v | 2 +- hw/scripts/scope.json | 4 +- 32 files changed, 434 insertions(+), 437 deletions(-) diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 8f7f5e90..7934a4b3 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -25,12 +25,12 @@ module VX_alu_unit #( wire stall_in, stall_out; `UNUSED_VAR (alu_req_if.op_mod) - wire is_br_op = `ALU_IS_BR(alu_req_if.op_mod); - wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op_type); - wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.op_type); - wire alu_signed = `ALU_SIGNED(alu_op); - wire [1:0] alu_op_class = `ALU_OP_CLASS(alu_op); - 
wire is_sub = (alu_op == `ALU_SUB); + wire is_br_op = `INST_ALU_IS_BR(alu_req_if.op_mod); + wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_OP(alu_req_if.op_type); + wire [`INST_BR_BITS-1:0] br_op = `INST_BR_OP(alu_req_if.op_type); + wire alu_signed = `INST_ALU_SIGNED(alu_op); + wire [1:0] alu_op_class = `INST_ALU_OP_CLASS(alu_op); + wire is_sub = (alu_op == `INST_ALU_SUB); wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data; wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data; @@ -57,10 +57,10 @@ module VX_alu_unit #( for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin case (alu_op) - `ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; - `ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; - `ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; - //`ALU_SLL, + `INST_ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; + `INST_ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; + `INST_ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; + //`INST_ALU_SLL, default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0]; endcase end @@ -81,7 +81,7 @@ module VX_alu_unit #( // branch - wire is_jal = is_br_op && (br_op == `BR_JAL || br_op == `BR_JALR); + wire is_jal = is_br_op && (br_op == `INST_BR_JAL || br_op == `INST_BR_JALR); wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result; wire [31:0] br_dest = add_result[alu_req_if.tid]; @@ -90,9 +90,9 @@ module VX_alu_unit #( wire is_less = cmp_result[32]; wire is_equal = ~(| cmp_result[31:0]); - wire br_neg = `BR_NEG(br_op); - wire br_less = `BR_LESS(br_op); - wire br_static = `BR_STATIC(br_op); + wire br_neg = `INST_BR_NEG(br_op); + wire br_less = `INST_BR_LESS(br_op); + wire br_static = `INST_BR_STATIC(br_op); wire br_taken = ((br_less ? 
is_less : is_equal) ^ br_neg) | br_static; // output @@ -118,14 +118,14 @@ module VX_alu_unit #( wire mul_wb; wire [`NUM_THREADS-1:0][31:0] mul_data; - wire is_mul_op = `ALU_IS_MUL(alu_req_if.op_mod); + wire is_mul_op = `INST_ALU_IS_MUL(alu_req_if.op_mod); VX_muldiv muldiv ( .clk (clk), .reset (reset), // Inputs - .alu_op (`MUL_OP(alu_req_if.op_type)), + .alu_op (`INST_MUL_OP(alu_req_if.op_type)), .wid_in (alu_req_if.wid), .tmask_in (alu_req_if.tmask), .PC_in (alu_req_if.PC), diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index 3aa3a17e..e77f28b0 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -42,7 +42,7 @@ module VX_csr_data #( reg [63:0] csr_cycle; reg [63:0] csr_instret; - reg [`NUM_WARPS-1:0][`FRM_BITS+`FFG_BITS-1:0] fcsr; + reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr; always @(posedge clk) begin @@ -52,16 +52,16 @@ module VX_csr_data #( end if (fpu_to_csr_if.write_enable) begin - fcsr[fpu_to_csr_if.write_wid][`FFG_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFG_BITS-1:0] - | fpu_to_csr_if.write_fflags; + fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] + | fpu_to_csr_if.write_fflags; end `endif if (write_enable) begin case (write_addr) - `CSR_FFLAGS: fcsr[write_wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; - `CSR_FRM: fcsr[write_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; - `CSR_FCSR: fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; + `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; + `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; + `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; `CSR_SATP: csr_satp <= write_data; @@ -104,8 +104,8 @@ module VX_csr_data #( read_data_r = 'x; read_addr_valid_r = 1; case (read_addr) - `CSR_FFLAGS : read_data_r = 32'(fcsr[read_wid][`FFG_BITS-1:0]); - `CSR_FRM : read_data_r = 
32'(fcsr[read_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS]); + `CSR_FFLAGS : read_data_r = 32'(fcsr[read_wid][`FFLAGS_BITS-1:0]); + `CSR_FRM : read_data_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS]); `CSR_FCSR : read_data_r = 32'(fcsr[read_wid]); `CSR_WTID , @@ -222,7 +222,7 @@ module VX_csr_data #( assign read_data = read_data_r; `ifdef EXT_F_ENABLE - assign fpu_to_csr_if.read_frm = fcsr[fpu_to_csr_if.read_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS]; + assign fpu_to_csr_if.read_frm = fcsr[fpu_to_csr_if.read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS]; `endif endmodule \ No newline at end of file diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 54cad7e7..a41c1875 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -70,14 +70,14 @@ module VX_csr_unit #( always @(*) begin csr_we_s0_unqual = (csr_req_data != 0); case (csr_req_if.op_type) - `CSR_RW: begin + `INST_CSR_RW: begin csr_updated_data = csr_req_data; csr_we_s0_unqual = 1; end - `CSR_RS: begin + `INST_CSR_RS: begin csr_updated_data = csr_read_data_qual | csr_req_data; end - //`CSR_RC + //`INST_CSR_RC default: begin csr_updated_data = csr_read_data_qual & ~csr_req_data; end diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index eac4e60e..832a1537 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -32,8 +32,8 @@ module VX_decode #( `UNUSED_VAR (reset) reg [`EX_BITS-1:0] ex_type; - reg [`OP_BITS-1:0] op_type; - reg [`MOD_BITS-1:0] op_mod; + reg [`INST_OP_BITS-1:0] op_type; + reg [`INST_MOD_BITS-1:0] op_mod; reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r; reg [31:0] imm; reg use_rd, use_PC, use_imm; @@ -79,14 +79,14 @@ module VX_decode #( `INST_I: begin ex_type = `EX_ALU; case (func3) - 3'h0: op_type = `OP_BITS'(`ALU_ADD); - 3'h1: op_type = `OP_BITS'(`ALU_SLL); - 3'h2: op_type = `OP_BITS'(`ALU_SLT); - 3'h3: op_type = `OP_BITS'(`ALU_SLTU); - 3'h4: op_type = `OP_BITS'(`ALU_XOR); - 3'h5: op_type = (func7[5]) ? 
`OP_BITS'(`ALU_SRA) : `OP_BITS'(`ALU_SRL); - 3'h6: op_type = `OP_BITS'(`ALU_OR); - 3'h7: op_type = `OP_BITS'(`ALU_AND); + 3'h0: op_type = `INST_OP_BITS'(`INST_ALU_ADD); + 3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL); + 3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT); + 3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU); + 3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR); + 3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL); + 3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR); + 3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND); default:; endcase use_rd = 1; @@ -100,14 +100,14 @@ module VX_decode #( `ifdef EXT_F_ENABLE if (func7[0]) begin case (func3) - 3'h0: op_type = `OP_BITS'(`MUL_MUL); - 3'h1: op_type = `OP_BITS'(`MUL_MULH); - 3'h2: op_type = `OP_BITS'(`MUL_MULHSU); - 3'h3: op_type = `OP_BITS'(`MUL_MULHU); - 3'h4: op_type = `OP_BITS'(`MUL_DIV); - 3'h5: op_type = `OP_BITS'(`MUL_DIVU); - 3'h6: op_type = `OP_BITS'(`MUL_REM); - 3'h7: op_type = `OP_BITS'(`MUL_REMU); + 3'h0: op_type = `INST_OP_BITS'(`INST_MUL_MUL); + 3'h1: op_type = `INST_OP_BITS'(`INST_MUL_MULH); + 3'h2: op_type = `INST_OP_BITS'(`INST_MUL_MULHSU); + 3'h3: op_type = `INST_OP_BITS'(`INST_MUL_MULHU); + 3'h4: op_type = `INST_OP_BITS'(`INST_MUL_DIV); + 3'h5: op_type = `INST_OP_BITS'(`INST_MUL_DIVU); + 3'h6: op_type = `INST_OP_BITS'(`INST_MUL_REM); + 3'h7: op_type = `INST_OP_BITS'(`INST_MUL_REMU); default:; endcase op_mod = 2; @@ -115,14 +115,14 @@ module VX_decode #( `endif begin case (func3) - 3'h0: op_type = (func7[5]) ? `OP_BITS'(`ALU_SUB) : `OP_BITS'(`ALU_ADD); - 3'h1: op_type = `OP_BITS'(`ALU_SLL); - 3'h2: op_type = `OP_BITS'(`ALU_SLT); - 3'h3: op_type = `OP_BITS'(`ALU_SLTU); - 3'h4: op_type = `OP_BITS'(`ALU_XOR); - 3'h5: op_type = (func7[5]) ? `OP_BITS'(`ALU_SRA) : `OP_BITS'(`ALU_SRL); - 3'h6: op_type = `OP_BITS'(`ALU_OR); - 3'h7: op_type = `OP_BITS'(`ALU_AND); + 3'h0: op_type = (func7[5]) ? 
`INST_OP_BITS'(`INST_ALU_SUB) : `INST_OP_BITS'(`INST_ALU_ADD); + 3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL); + 3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT); + 3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU); + 3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR); + 3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL); + 3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR); + 3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND); default:; endcase end @@ -133,7 +133,7 @@ module VX_decode #( end `INST_LUI: begin ex_type = `EX_ALU; - op_type = `OP_BITS'(`ALU_LUI); + op_type = `INST_OP_BITS'(`INST_ALU_LUI); use_rd = 1; use_imm = 1; imm = {upper_imm, 12'(0)}; @@ -142,7 +142,7 @@ module VX_decode #( end `INST_AUIPC: begin ex_type = `EX_ALU; - op_type = `OP_BITS'(`ALU_AUIPC); + op_type = `INST_OP_BITS'(`INST_ALU_AUIPC); use_rd = 1; use_imm = 1; use_PC = 1; @@ -151,7 +151,7 @@ module VX_decode #( end `INST_JAL: begin ex_type = `EX_ALU; - op_type = `OP_BITS'(`BR_JAL); + op_type = `INST_OP_BITS'(`INST_BR_JAL); op_mod = 1; use_rd = 1; use_imm = 1; @@ -162,7 +162,7 @@ module VX_decode #( end `INST_JALR: begin ex_type = `EX_ALU; - op_type = `OP_BITS'(`BR_JALR); + op_type = `INST_OP_BITS'(`INST_BR_JALR); op_mod = 1; use_rd = 1; use_imm = 1; @@ -174,12 +174,12 @@ module VX_decode #( `INST_B: begin ex_type = `EX_ALU; case (func3) - 3'h0: op_type = `OP_BITS'(`BR_EQ); - 3'h1: op_type = `OP_BITS'(`BR_NE); - 3'h4: op_type = `OP_BITS'(`BR_LT); - 3'h5: op_type = `OP_BITS'(`BR_GE); - 3'h6: op_type = `OP_BITS'(`BR_LTU); - 3'h7: op_type = `OP_BITS'(`BR_GEU); + 3'h0: op_type = `INST_OP_BITS'(`INST_BR_EQ); + 3'h1: op_type = `INST_OP_BITS'(`INST_BR_NE); + 3'h4: op_type = `INST_OP_BITS'(`INST_BR_LT); + 3'h5: op_type = `INST_OP_BITS'(`INST_BR_GE); + 3'h6: op_type = `INST_OP_BITS'(`INST_BR_LTU); + 3'h7: op_type = `INST_OP_BITS'(`INST_BR_GEU); default:; endcase op_mod = 1; @@ -192,13 +192,13 @@ module VX_decode #( end `INST_F: begin ex_type = `EX_LSU; - op_type = 
`OP_BITS'(func3[0]); - op_mod = `MOD_BITS'(1); + op_type = `INST_OP_BITS'(func3[0]); + op_mod = `INST_MOD_BITS'(1); end `INST_SYS : begin if (func3[1:0] != 0) begin ex_type = `EX_CSR; - op_type = `OP_BITS'(func3[1:0]); + op_type = `INST_OP_BITS'(func3[1:0]); use_rd = 1; use_imm = func3[2]; imm = 32'(u_12); // addr @@ -211,11 +211,11 @@ module VX_decode #( end else begin ex_type = `EX_ALU; case (u_12) - 12'h000: op_type = `OP_BITS'(`BR_ECALL); - 12'h001: op_type = `OP_BITS'(`BR_EBREAK); - 12'h302: op_type = `OP_BITS'(`BR_MRET); - 12'h102: op_type = `OP_BITS'(`BR_SRET); - 12'h7B2: op_type = `OP_BITS'(`BR_DRET); + 12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL); + 12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK); + 12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET); + 12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET); + 12'h7B2: op_type = `INST_OP_BITS'(`INST_BR_DRET); default:; endcase op_mod = 1; @@ -232,7 +232,7 @@ module VX_decode #( `endif `INST_L: begin ex_type = `EX_LSU; - op_type = `OP_BITS'({1'b0, func3}); + op_type = `INST_OP_BITS'({1'b0, func3}); use_rd = 1; imm = {{20{u_12[11]}}, u_12}; `ifdef EXT_F_ENABLE @@ -248,7 +248,7 @@ module VX_decode #( `endif `INST_S: begin ex_type = `EX_LSU; - op_type = `OP_BITS'({1'b1, func3}); + op_type = `INST_OP_BITS'({1'b1, func3}); imm = {{20{s_imm[11]}}, s_imm}; `USED_IREG (rs1); `ifdef EXT_F_ENABLE @@ -264,7 +264,7 @@ module VX_decode #( `INST_FNMSUB, `INST_FNMADD: begin ex_type = `EX_FPU; - op_type = `OP_BITS'(opcode[3:0]); + op_type = `INST_OP_BITS'(opcode[3:0]); op_mod = func3; use_rd = 1; `USED_FREG (rd); @@ -281,35 +281,35 @@ module VX_decode #( 7'h04, // FSUB 7'h08, // FMUL 7'h0C: begin // FDIV - op_type = `OP_BITS'(func7[3:0]); + op_type = `INST_OP_BITS'(func7[3:0]); `USED_FREG (rd); `USED_FREG (rs1); `USED_FREG (rs2); end 7'h2C: begin - op_type = `OP_BITS'(`FPU_SQRT); + op_type = `INST_OP_BITS'(`INST_FPU_SQRT); `USED_FREG (rd); `USED_FREG (rs1); end 7'h50: begin - op_type = `OP_BITS'(`FPU_CMP); + op_type = 
`INST_OP_BITS'(`INST_FPU_CMP); `USED_IREG (rd); `USED_FREG (rs1); `USED_FREG (rs2); end 7'h60: begin - op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTWUS) : `OP_BITS'(`FPU_CVTWS); + op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTWUS) : `INST_OP_BITS'(`INST_FPU_CVTWS); `USED_IREG (rd); `USED_FREG (rs1); end 7'h68: begin - op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTSWU) : `OP_BITS'(`FPU_CVTSW); + op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTSWU) : `INST_OP_BITS'(`INST_FPU_CVTSW); `USED_FREG (rd); `USED_IREG (rs1); end 7'h10: begin // FSGNJ=0, FSGNJN=1, FSGNJX=2 - op_type = `OP_BITS'(`FPU_MISC); + op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_mod = {1'b0, func3[1:0]}; `USED_FREG (rd); `USED_FREG (rs1); @@ -317,7 +317,7 @@ module VX_decode #( end 7'h14: begin // FMIN=3, FMAX=4 - op_type = `OP_BITS'(`FPU_MISC); + op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_mod = func3[0] ? 4 : 3; `USED_FREG (rd); `USED_FREG (rs1); @@ -326,10 +326,10 @@ module VX_decode #( 7'h70: begin if (func3[0]) begin // FCLASS - op_type = `OP_BITS'(`FPU_CLASS); + op_type = `INST_OP_BITS'(`INST_FPU_CLASS); end else begin // FMV.X.W=5 - op_type = `OP_BITS'(`FPU_MISC); + op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_mod = 5; end `USED_IREG (rd); @@ -337,7 +337,7 @@ module VX_decode #( end 7'h78: begin // FMV.W.X=6 - op_type = `OP_BITS'(`FPU_MISC); + op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_mod = 6; `USED_FREG (rd); `USED_IREG (rs1); @@ -350,26 +350,26 @@ module VX_decode #( ex_type = `EX_GPU; case (func3) 3'h0: begin - op_type = `OP_BITS'(`GPU_TMC); + op_type = `INST_OP_BITS'(`INST_GPU_TMC); is_wstall = 1; `USED_IREG (rs1); end 3'h1: begin - op_type = `OP_BITS'(`GPU_WSPAWN); + op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN); `USED_IREG (rs1); `USED_IREG (rs2); end 3'h2: begin - op_type = `OP_BITS'(`GPU_SPLIT); + op_type = `INST_OP_BITS'(`INST_GPU_SPLIT); is_wstall = 1; `USED_IREG (rs1); end 3'h3: begin - op_type = `OP_BITS'(`GPU_JOIN); + op_type = `INST_OP_BITS'(`INST_GPU_JOIN); is_join = 1; end 
3'h4: begin - op_type = `OP_BITS'(`GPU_BAR); + op_type = `INST_OP_BITS'(`INST_GPU_BAR); is_wstall = 1; `USED_IREG (rs1); `USED_IREG (rs2); diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 87f0001d..fe1b12c3 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -14,8 +14,6 @@ `define NB_BITS `LOG2UP(`NUM_BARRIERS) -`define REQS_BITS `LOG2UP(NUM_REQS) - `ifdef EXT_F_ENABLE `define NUM_REGS 64 `else @@ -32,6 +30,16 @@ /////////////////////////////////////////////////////////////////////////////// +`define EX_NOP 3'h0 +`define EX_ALU 3'h1 +`define EX_LSU 3'h2 +`define EX_CSR 3'h3 +`define EX_FPU 3'h4 +`define EX_GPU 3'h5 +`define EX_BITS 3 + +/////////////////////////////////////////////////////////////////////////////// + `define INST_LUI 7'b0110111 `define INST_AUIPC 7'b0010111 `define INST_JAL 7'b1101111 @@ -56,142 +64,131 @@ /////////////////////////////////////////////////////////////////////////////// -`define FRM_RNE 3'b000 // round to nearest even -`define FRM_RTZ 3'b001 // round to zero -`define FRM_RDN 3'b010 // round to -inf -`define FRM_RUP 3'b011 // round to +inf -`define FRM_RMM 3'b100 // round to nearest max magnitude -`define FRM_DYN 3'b111 // dynamic mode -`define FRM_BITS 3 +`define INST_FRM_RNE 3'b000 // round to nearest even +`define INST_FRM_RTZ 3'b001 // round to zero +`define INST_FRM_RDN 3'b010 // round to -inf +`define INST_FRM_RUP 3'b011 // round to +inf +`define INST_FRM_RMM 3'b100 // round to nearest max magnitude +`define INST_FRM_DYN 3'b111 // dynamic mode +`define INST_FRM_BITS 3 /////////////////////////////////////////////////////////////////////////////// -`define EX_NOP 3'h0 -`define EX_ALU 3'h1 -`define EX_LSU 3'h2 -`define EX_CSR 3'h3 -`define EX_FPU 3'h4 -`define EX_GPU 3'h5 -`define EX_BITS 3 - -`define NUM_EXS 6 -`define NE_BITS `LOG2UP(`NUM_EXS) +`define INST_OP_BITS 4 +`define INST_MOD_BITS 3 /////////////////////////////////////////////////////////////////////////////// -`define OP_BITS 4 -`define MOD_BITS 
3 +`define INST_ALU_ADD 4'b0000 +`define INST_ALU_LUI 4'b0010 +`define INST_ALU_AUIPC 4'b0011 +`define INST_ALU_SLTU 4'b0100 +`define INST_ALU_SLT 4'b0101 +`define INST_ALU_SRL 4'b1000 +`define INST_ALU_SRA 4'b1001 +`define INST_ALU_SUB 4'b1011 +`define INST_ALU_AND 4'b1100 +`define INST_ALU_OR 4'b1101 +`define INST_ALU_XOR 4'b1110 +`define INST_ALU_SLL 4'b1111 +`define INST_ALU_OTHER 4'b0111 +`define INST_ALU_BITS 4 +`define INST_ALU_OP(x) x[`INST_ALU_BITS-1:0] +`define INST_ALU_OP_CLASS(x) x[3:2] +`define INST_ALU_SIGNED(x) x[0] +`define INST_ALU_IS_BR(x) x[0] +`define INST_ALU_IS_MUL(x) x[1] -`define ALU_ADD 4'b0000 -`define ALU_LUI 4'b0010 -`define ALU_AUIPC 4'b0011 -`define ALU_SLTU 4'b0100 -`define ALU_SLT 4'b0101 -`define ALU_SRL 4'b1000 -`define ALU_SRA 4'b1001 -`define ALU_SUB 4'b1011 -`define ALU_AND 4'b1100 -`define ALU_OR 4'b1101 -`define ALU_XOR 4'b1110 -`define ALU_SLL 4'b1111 -`define ALU_OTHER 4'b0111 -`define ALU_BITS 4 -`define ALU_OP(x) x[`ALU_BITS-1:0] -`define ALU_OP_CLASS(x) x[3:2] -`define ALU_SIGNED(x) x[0] -`define ALU_IS_BR(x) x[0] -`define ALU_IS_MUL(x) x[1] +`define INST_BR_EQ 4'b0000 +`define INST_BR_NE 4'b0010 +`define INST_BR_LTU 4'b0100 +`define INST_BR_GEU 4'b0110 +`define INST_BR_LT 4'b0101 +`define INST_BR_GE 4'b0111 +`define INST_BR_JAL 4'b1000 +`define INST_BR_JALR 4'b1001 +`define INST_BR_ECALL 4'b1010 +`define INST_BR_EBREAK 4'b1011 +`define INST_BR_MRET 4'b1100 +`define INST_BR_SRET 4'b1101 +`define INST_BR_DRET 4'b1110 +`define INST_BR_OTHER 4'b1111 +`define INST_BR_BITS 4 +`define INST_BR_OP(x) x[`INST_BR_BITS-1:0] +`define INST_BR_NEG(x) x[1] +`define INST_BR_LESS(x) x[2] +`define INST_BR_STATIC(x) x[3] -`define BR_EQ 4'b0000 -`define BR_NE 4'b0010 -`define BR_LTU 4'b0100 -`define BR_GEU 4'b0110 -`define BR_LT 4'b0101 -`define BR_GE 4'b0111 -`define BR_JAL 4'b1000 -`define BR_JALR 4'b1001 -`define BR_ECALL 4'b1010 -`define BR_EBREAK 4'b1011 -`define BR_MRET 4'b1100 -`define BR_SRET 4'b1101 -`define BR_DRET 4'b1110 -`define 
BR_OTHER 4'b1111 -`define BR_BITS 4 -`define BR_OP(x) x[`BR_BITS-1:0] -`define BR_NEG(x) x[1] -`define BR_LESS(x) x[2] -`define BR_STATIC(x) x[3] +`define INST_MUL_MUL 3'h0 +`define INST_MUL_MULH 3'h1 +`define INST_MUL_MULHSU 3'h2 +`define INST_MUL_MULHU 3'h3 +`define INST_MUL_DIV 3'h4 +`define INST_MUL_DIVU 3'h5 +`define INST_MUL_REM 3'h6 +`define INST_MUL_REMU 3'h7 +`define INST_MUL_BITS 3 +`define INST_MUL_OP(x) x[`INST_MUL_BITS-1:0] +`define INST_MUL_IS_DIV(x) x[2] -`define MUL_MUL 3'h0 -`define MUL_MULH 3'h1 -`define MUL_MULHSU 3'h2 -`define MUL_MULHU 3'h3 -`define MUL_DIV 3'h4 -`define MUL_DIVU 3'h5 -`define MUL_REM 3'h6 -`define MUL_REMU 3'h7 -`define MUL_BITS 3 -`define MUL_OP(x) x[`MUL_BITS-1:0] -`define MUL_IS_DIV(x) x[2] +`define INST_FMT_B 3'b000 +`define INST_FMT_H 3'b001 +`define INST_FMT_W 3'b010 +`define INST_FMT_BU 3'b100 +`define INST_FMT_HU 3'b101 -`define FMT_B 3'b000 -`define FMT_H 3'b001 -`define FMT_W 3'b010 -`define FMT_BU 3'b100 -`define FMT_HU 3'b101 +`define INST_LSU_LB 4'b0000 +`define INST_LSU_LH 4'b0001 +`define INST_LSU_LW 4'b0010 +`define INST_LSU_LBU 4'b0100 +`define INST_LSU_LHU 4'b0101 +`define INST_LSU_SB 4'b1000 +`define INST_LSU_SH 4'b1001 +`define INST_LSU_SW 4'b1010 +`define INST_LSU_BITS 4 +`define INST_LSU_FMT(x) x[2:0] +`define INST_LSU_WSIZE(x) x[1:0] +`define INST_LSU_OP(x) x[`INST_LSU_BITS-1:0] +`define INST_LSU_IS_FENCE(x) x[0] -`define LSU_LB 4'b0000 -`define LSU_LH 4'b0001 -`define LSU_LW 4'b0010 -`define LSU_LBU 4'b0100 -`define LSU_LHU 4'b0101 -`define LSU_SB 4'b1000 -`define LSU_SH 4'b1001 -`define LSU_SW 4'b1010 -`define LSU_BITS 4 -`define LSU_FMT(x) x[2:0] -`define LSU_WSIZE(x) x[1:0] -`define LSU_OP(x) x[`LSU_BITS-1:0] -`define LSU_IS_FENCE(x) x[0] +`define INST_FENCE_BITS 1 +`define INST_FENCE_D 1'h0 +`define INST_FENCE_I 1'h1 -`define FENCE_BITS 1 -`define FENCE_D 1'h0 -`define FENCE_I 1'h1 +`define INST_CSR_RW 2'h1 +`define INST_CSR_RS 2'h2 +`define INST_CSR_RC 2'h3 +`define INST_CSR_OTHER 2'h0 +`define 
INST_CSR_BITS 2 +`define INST_CSR_OP(x) x[`INST_CSR_BITS-1:0] -`define CSR_RW 2'h1 -`define CSR_RS 2'h2 -`define CSR_RC 2'h3 -`define CSR_OTHER 2'h0 -`define CSR_BITS 2 -`define CSR_OP(x) x[`CSR_BITS-1:0] +`define INST_FPU_ADD 4'h0 +`define INST_FPU_SUB 4'h4 +`define INST_FPU_MUL 4'h8 +`define INST_FPU_DIV 4'hC +`define INST_FPU_CVTWS 4'h1 // FCVT.W.S +`define INST_FPU_CVTWUS 4'h5 // FCVT.WU.S +`define INST_FPU_CVTSW 4'h9 // FCVT.S.W +`define INST_FPU_CVTSWU 4'hD // FCVT.S.WU +`define INST_FPU_SQRT 4'h2 +`define INST_FPU_CLASS 4'h6 +`define INST_FPU_CMP 4'hA +`define INST_FPU_MISC 4'hE // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX +`define INST_FPU_MADD 4'h3 +`define INST_FPU_MSUB 4'h7 +`define INST_FPU_NMSUB 4'hB +`define INST_FPU_NMADD 4'hF +`define INST_FPU_BITS 4 +`define INST_FPU_OP(x) x[`INST_FPU_BITS-1:0] -`define FPU_ADD 4'h0 -`define FPU_SUB 4'h4 -`define FPU_MUL 4'h8 -`define FPU_DIV 4'hC -`define FPU_CVTWS 4'h1 // FCVT.W.S -`define FPU_CVTWUS 4'h5 // FCVT.WU.S -`define FPU_CVTSW 4'h9 // FCVT.S.W -`define FPU_CVTSWU 4'hD // FCVT.S.WU -`define FPU_SQRT 4'h2 -`define FPU_CLASS 4'h6 -`define FPU_CMP 4'hA -`define FPU_MISC 4'hE // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX -`define FPU_MADD 4'h3 -`define FPU_MSUB 4'h7 -`define FPU_NMSUB 4'hB -`define FPU_NMADD 4'hF -`define FPU_BITS 4 -`define FPU_OP(x) x[`FPU_BITS-1:0] - -`define GPU_TMC 3'h0 -`define GPU_WSPAWN 3'h1 -`define GPU_SPLIT 3'h2 -`define GPU_JOIN 3'h3 -`define GPU_BAR 3'h4 -`define GPU_OTHER 3'h7 -`define GPU_BITS 3 -`define GPU_OP(x) x[`GPU_BITS-1:0] +`define INST_GPU_TMC 3'h0 +`define INST_GPU_WSPAWN 3'h1 +`define INST_GPU_SPLIT 3'h2 +`define INST_GPU_JOIN 3'h3 +`define INST_GPU_BAR 3'h4 +`define INST_GPU_OTHER 3'h7 +`define INST_GPU_BITS 3 +`define INST_GPU_OP(x) x[`INST_GPU_BITS-1:0] /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index dfb45259..730f9df1 100644 --- a/hw/rtl/VX_execute.v +++ 
b/hw/rtl/VX_execute.v @@ -133,8 +133,8 @@ module VX_execute #( // special workaround to get RISC-V tests Pass/Fail status wire ebreak /* verilator public */; assign ebreak = alu_req_if.valid && alu_req_if.ready - && `ALU_IS_BR(alu_req_if.op_mod) - && (`BR_OP(alu_req_if.op_type) == `BR_EBREAK - || `BR_OP(alu_req_if.op_type) == `BR_ECALL); + && `INST_ALU_IS_BR(alu_req_if.op_mod) + && (`INST_BR_OP(alu_req_if.op_type) == `INST_BR_EBREAK + || `INST_BR_OP(alu_req_if.op_type) == `INST_BR_ECALL); endmodule diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index a1e991c4..cba4d399 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -61,7 +61,7 @@ module VX_fpu_unit #( // resolve dynamic FRM from CSR assign fpu_to_csr_if.read_wid = fpu_req_if.wid; - wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod; + wire [`INST_FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `INST_FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod; `ifdef FPU_DPI @@ -179,7 +179,7 @@ module VX_fpu_unit #( wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFG_BITS), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 403bed95..e63f8e1b 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -25,10 +25,10 @@ module VX_gpu_unit #( gpu_barrier_t barrier; gpu_split_t split; - wire is_wspawn = (gpu_req_if.op_type == `GPU_WSPAWN); - wire is_tmc = (gpu_req_if.op_type == `GPU_TMC); - wire is_split = (gpu_req_if.op_type == `GPU_SPLIT); - wire is_bar = (gpu_req_if.op_type == `GPU_BAR); + wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN); + wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC); + wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT); + wire is_bar 
= (gpu_req_if.op_type == `INST_GPU_BAR); // tmc diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 7b62c3a4..8aaa02bb 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -15,7 +15,7 @@ module VX_ibuffer #( `UNUSED_PARAM (CORE_ID) - localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `OP_BITS + `FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS; + localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS; localparam ADDRW = $clog2(`IBUF_SIZE+1); localparam NWARPSW = $clog2(`NUM_WARPS+1); diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index e497bfcb..6b0423d7 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -41,15 +41,15 @@ module VX_instr_demux ( wire alu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), .OUTPUT_REG (1) ) alu_buffer ( .clk (clk), .reset (reset), .valid_in (alu_req_valid), .ready_in (alu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `ALU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `INST_ALU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, 
tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), + .data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), .valid_out (alu_req_if.valid), .ready_out (alu_req_if.ready) ); @@ -57,18 +57,18 @@ module VX_instr_demux ( // lsu unit wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU); - wire lsu_is_fence = `LSU_IS_FENCE(ibuffer_if.op_mod); + wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), .OUTPUT_REG (1) ) lsu_buffer ( .clk (clk), .reset (reset), .valid_in (lsu_req_valid), .ready_in (lsu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `LSU_OP(ibuffer_if.op_type), lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `INST_LSU_OP(ibuffer_if.op_type), lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), + .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}), .valid_out (lsu_req_if.valid), .ready_out (lsu_req_if.ready) ); @@ -78,15 +78,15 @@ module VX_instr_demux ( wire csr_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 
32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32), .OUTPUT_REG (1) ) csr_buffer ( .clk (clk), .reset (reset), .valid_in (csr_req_valid), .ready_in (csr_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `CSR_OP(ibuffer_if.op_type), ibuffer_if.imm[`CSR_ADDR_BITS-1:0], ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, ibuffer_if.rs1, gpr_rsp_if.rs1_data[0]}), - .data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.rs1, csr_req_if.rs1_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `INST_CSR_OP(ibuffer_if.op_type), ibuffer_if.imm[`CSR_ADDR_BITS-1:0], ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, ibuffer_if.rs1, gpr_rsp_if.rs1_data[0]}), + .data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.rs1, csr_req_if.rs1_data}), .valid_out (csr_req_if.valid), .ready_out (csr_req_if.ready) ); @@ -97,15 +97,15 @@ module VX_instr_demux ( wire fpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), .OUTPUT_REG (1) ) fpu_buffer ( .clk (clk), .reset (reset), .valid_in (fpu_req_valid), .ready_in (fpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `FPU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, 
fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `INST_FPU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), .valid_out (fpu_req_if.valid), .ready_out (fpu_req_if.ready) ); @@ -118,15 +118,15 @@ module VX_instr_demux ( wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), .OUTPUT_REG (1) ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `INST_GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}), + .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 8adc6eed..b9a2ef5e 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -33,7 +33,7 @@ 
module VX_lsu_unit #( wire req_valid; wire [`NUM_THREADS-1:0] req_tmask; wire [`NUM_THREADS-1:0][31:0] req_addr; - wire [`LSU_BITS-1:0] req_type; + wire [`INST_LSU_BITS-1:0] req_type; wire [`NUM_THREADS-1:0][31:0] req_data; wire [`NR_BITS-1:0] req_rd; wire req_wb; @@ -80,7 +80,7 @@ module VX_lsu_unit #( wire lsu_valid = lsu_req_if.valid && ~fence_wait; VX_pipe_register #( - .DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), @@ -97,7 +97,7 @@ module VX_lsu_unit #( wire [31:0] rsp_pc; wire [`NR_BITS-1:0] rsp_rd; wire rsp_wb; - wire [`LSU_BITS-1:0] rsp_type; + wire [`INST_LSU_BITS-1:0] rsp_type; wire rsp_is_dup; `UNUSED_VAR (rsp_type) @@ -132,8 +132,8 @@ module VX_lsu_unit #( assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; VX_index_buffer #( - .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1), - .SIZE (`LSUQ_SIZE) + .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1), + .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), .reset (reset), @@ -202,7 +202,7 @@ module VX_lsu_unit #( always @(*) begin mem_req_byteen = {4{req_wb}}; - case (`LSU_WSIZE(req_type)) + case (`INST_LSU_WSIZE(req_type)) 0: mem_req_byteen[req_offset[i]] = 1; 1: begin mem_req_byteen[req_offset[i]] = 1; @@ -261,11 +261,11 @@ module VX_lsu_unit #( wire [7:0] rsp_data8 = rsp_offset[i][0] ? 
rsp_data16[15:8] : rsp_data16[7:0]; always @(*) begin - case (`LSU_FMT(rsp_type)) - `FMT_B: rsp_data[i] = 32'(signed'(rsp_data8)); - `FMT_H: rsp_data[i] = 32'(signed'(rsp_data16)); - `FMT_BU: rsp_data[i] = 32'(unsigned'(rsp_data8)); - `FMT_HU: rsp_data[i] = 32'(unsigned'(rsp_data16)); + case (`INST_LSU_FMT(rsp_type)) + `INST_FMT_B: rsp_data[i] = 32'(signed'(rsp_data8)); + `INST_FMT_H: rsp_data[i] = 32'(signed'(rsp_data16)); + `INST_FMT_BU: rsp_data[i] = 32'(unsigned'(rsp_data8)); + `INST_FMT_HU: rsp_data[i] = 32'(unsigned'(rsp_data16)); default: rsp_data[i] = rsp_data32; endcase end diff --git a/hw/rtl/VX_muldiv.v b/hw/rtl/VX_muldiv.v index 8a0a466e..107e6785 100644 --- a/hw/rtl/VX_muldiv.v +++ b/hw/rtl/VX_muldiv.v @@ -5,7 +5,7 @@ module VX_muldiv ( input wire reset, // Inputs - input wire [`MUL_BITS-1:0] alu_op, + input wire [`INST_MUL_BITS-1:0] alu_op, input wire [`NW_BITS-1:0] wid_in, input wire [`NUM_THREADS-1:0] tmask_in, input wire [31:0] PC_in, @@ -29,7 +29,7 @@ module VX_muldiv ( input wire ready_out ); - wire is_div_op = `MUL_IS_DIV(alu_op); + wire is_div_op = `INST_MUL_IS_DIV(alu_op); wire [`NUM_THREADS-1:0][31:0] mul_result; wire [`NW_BITS-1:0] mul_wid_out; @@ -44,9 +44,9 @@ module VX_muldiv ( wire mul_valid_in = valid_in && !is_div_op; wire mul_ready_in = ~stall_out || ~mul_valid_out; - wire is_mulh_in = (alu_op != `MUL_MUL); - wire is_signed_mul_a = (alu_op != `MUL_MULHU); - wire is_signed_mul_b = (alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU); + wire is_mulh_in = (alu_op != `INST_MUL_MUL); + wire is_signed_mul_a = (alu_op != `INST_MUL_MULHU); + wire is_signed_mul_b = (alu_op != `INST_MUL_MULHU && alu_op != `INST_MUL_MULHSU); `ifdef IMUL_DPI @@ -123,8 +123,8 @@ module VX_muldiv ( wire [`NR_BITS-1:0] div_rd_out; wire div_wb_out; - wire is_rem_op_in = (alu_op == `MUL_REM) || (alu_op == `MUL_REMU); - wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM); + wire is_rem_op_in = (alu_op == `INST_MUL_REM) || (alu_op == `INST_MUL_REMU); + wire 
is_signed_div = (alu_op == `INST_MUL_DIV) || (alu_op == `INST_MUL_REM); wire div_valid_in = valid_in && is_div_op; wire div_ready_out = ~stall_out && ~mul_valid_out; // arbitration prioritizes MUL wire div_ready_in; diff --git a/hw/rtl/VX_print_instr.vh b/hw/rtl/VX_print_instr.vh index 2931bc9f..24fc73a0 100644 --- a/hw/rtl/VX_print_instr.vh +++ b/hw/rtl/VX_print_instr.vh @@ -18,104 +18,104 @@ endtask task print_ex_op ( input [`EX_BITS-1:0] ex_type, - input [`OP_BITS-1:0] op_type, - input [`MOD_BITS-1:0] op_mod + input [`INST_OP_BITS-1:0] op_type, + input [`INST_MOD_BITS-1:0] op_mod ); case (ex_type) `EX_ALU: begin - if (`ALU_IS_BR(op_mod)) begin - case (`BR_BITS'(op_type)) - `BR_EQ: dpi_trace("BEQ"); - `BR_NE: dpi_trace("BNE"); - `BR_LT: dpi_trace("BLT"); - `BR_GE: dpi_trace("BGE"); - `BR_LTU: dpi_trace("BLTU"); - `BR_GEU: dpi_trace("BGEU"); - `BR_JAL: dpi_trace("JAL"); - `BR_JALR: dpi_trace("JALR"); - `BR_ECALL: dpi_trace("ECALL"); - `BR_EBREAK:dpi_trace("EBREAK"); - `BR_MRET: dpi_trace("MRET"); - `BR_SRET: dpi_trace("SRET"); - `BR_DRET: dpi_trace("DRET"); + if (`INST_ALU_IS_BR(op_mod)) begin + case (`INST_BR_BITS'(op_type)) + `INST_BR_EQ: dpi_trace("BEQ"); + `INST_BR_NE: dpi_trace("BNE"); + `INST_BR_LT: dpi_trace("BLT"); + `INST_BR_GE: dpi_trace("BGE"); + `INST_BR_LTU: dpi_trace("BLTU"); + `INST_BR_GEU: dpi_trace("BGEU"); + `INST_BR_JAL: dpi_trace("JAL"); + `INST_BR_JALR: dpi_trace("JALR"); + `INST_BR_ECALL: dpi_trace("ECALL"); + `INST_BR_EBREAK:dpi_trace("EBREAK"); + `INST_BR_MRET: dpi_trace("MRET"); + `INST_BR_SRET: dpi_trace("SRET"); + `INST_BR_DRET: dpi_trace("DRET"); default: dpi_trace("?"); endcase - end else if (`ALU_IS_MUL(op_mod)) begin - case (`MUL_BITS'(op_type)) - `MUL_MUL: dpi_trace("MUL"); - `MUL_MULH: dpi_trace("MULH"); - `MUL_MULHSU:dpi_trace("MULHSU"); - `MUL_MULHU: dpi_trace("MULHU"); - `MUL_DIV: dpi_trace("DIV"); - `MUL_DIVU: dpi_trace("DIVU"); - `MUL_REM: dpi_trace("REM"); - `MUL_REMU: dpi_trace("REMU"); + end else if 
(`INST_ALU_IS_MUL(op_mod)) begin + case (`INST_MUL_BITS'(op_type)) + `INST_MUL_MUL: dpi_trace("MUL"); + `INST_MUL_MULH: dpi_trace("MULH"); + `INST_MUL_MULHSU:dpi_trace("MULHSU"); + `INST_MUL_MULHU: dpi_trace("MULHU"); + `INST_MUL_DIV: dpi_trace("DIV"); + `INST_MUL_DIVU: dpi_trace("DIVU"); + `INST_MUL_REM: dpi_trace("REM"); + `INST_MUL_REMU: dpi_trace("REMU"); default: dpi_trace("?"); endcase end else begin - case (`ALU_BITS'(op_type)) - `ALU_ADD: dpi_trace("ADD"); - `ALU_SUB: dpi_trace("SUB"); - `ALU_SLL: dpi_trace("SLL"); - `ALU_SRL: dpi_trace("SRL"); - `ALU_SRA: dpi_trace("SRA"); - `ALU_SLT: dpi_trace("SLT"); - `ALU_SLTU: dpi_trace("SLTU"); - `ALU_XOR: dpi_trace("XOR"); - `ALU_OR: dpi_trace("OR"); - `ALU_AND: dpi_trace("AND"); - `ALU_LUI: dpi_trace("LUI"); - `ALU_AUIPC: dpi_trace("AUIPC"); + case (`INST_ALU_BITS'(op_type)) + `INST_ALU_ADD: dpi_trace("ADD"); + `INST_ALU_SUB: dpi_trace("SUB"); + `INST_ALU_SLL: dpi_trace("SLL"); + `INST_ALU_SRL: dpi_trace("SRL"); + `INST_ALU_SRA: dpi_trace("SRA"); + `INST_ALU_SLT: dpi_trace("SLT"); + `INST_ALU_SLTU: dpi_trace("SLTU"); + `INST_ALU_XOR: dpi_trace("XOR"); + `INST_ALU_OR: dpi_trace("OR"); + `INST_ALU_AND: dpi_trace("AND"); + `INST_ALU_LUI: dpi_trace("LUI"); + `INST_ALU_AUIPC: dpi_trace("AUIPC"); default: dpi_trace("?"); endcase end end `EX_LSU: begin if (op_mod == 0) begin - case (`LSU_BITS'(op_type)) - `LSU_LB: dpi_trace("LB"); - `LSU_LH: dpi_trace("LH"); - `LSU_LW: dpi_trace("LW"); - `LSU_LBU:dpi_trace("LBU"); - `LSU_LHU:dpi_trace("LHU"); - `LSU_SB: dpi_trace("SB"); - `LSU_SH: dpi_trace("SH"); - `LSU_SW: dpi_trace("SW"); + case (`INST_LSU_BITS'(op_type)) + `INST_LSU_LB: dpi_trace("LB"); + `INST_LSU_LH: dpi_trace("LH"); + `INST_LSU_LW: dpi_trace("LW"); + `INST_LSU_LBU:dpi_trace("LBU"); + `INST_LSU_LHU:dpi_trace("LHU"); + `INST_LSU_SB: dpi_trace("SB"); + `INST_LSU_SH: dpi_trace("SH"); + `INST_LSU_SW: dpi_trace("SW"); default: dpi_trace("?"); endcase end else if (op_mod == 1) begin - case (`FENCE_BITS'(op_type)) - 
`FENCE_D: dpi_trace("DFENCE"); - `FENCE_I: dpi_trace("IFENCE"); + case (`INST_FENCE_BITS'(op_type)) + `INST_FENCE_D: dpi_trace("DFENCE"); + `INST_FENCE_I: dpi_trace("IFENCE"); default: dpi_trace("?"); endcase end end `EX_CSR: begin - case (`CSR_BITS'(op_type)) - `CSR_RW: dpi_trace("CSRW"); - `CSR_RS: dpi_trace("CSRS"); - `CSR_RC: dpi_trace("CSRC"); + case (`INST_CSR_BITS'(op_type)) + `INST_CSR_RW: dpi_trace("CSRW"); + `INST_CSR_RS: dpi_trace("CSRS"); + `INST_CSR_RC: dpi_trace("CSRC"); default: dpi_trace("?"); endcase end `EX_FPU: begin - case (`FPU_BITS'(op_type)) - `FPU_ADD: dpi_trace("ADD"); - `FPU_SUB: dpi_trace("SUB"); - `FPU_MUL: dpi_trace("MUL"); - `FPU_DIV: dpi_trace("DIV"); - `FPU_SQRT: dpi_trace("SQRT"); - `FPU_MADD: dpi_trace("MADD"); - `FPU_NMSUB: dpi_trace("NMSUB"); - `FPU_NMADD: dpi_trace("NMADD"); - `FPU_CVTWS: dpi_trace("CVTWS"); - `FPU_CVTWUS:dpi_trace("CVTWUS"); - `FPU_CVTSW: dpi_trace("CVTSW"); - `FPU_CVTSWU:dpi_trace("CVTSWU"); - `FPU_CLASS: dpi_trace("CLASS"); - `FPU_CMP: dpi_trace("CMP"); - `FPU_MISC: begin + case (`INST_FPU_BITS'(op_type)) + `INST_FPU_ADD: dpi_trace("ADD"); + `INST_FPU_SUB: dpi_trace("SUB"); + `INST_FPU_MUL: dpi_trace("MUL"); + `INST_FPU_DIV: dpi_trace("DIV"); + `INST_FPU_SQRT: dpi_trace("SQRT"); + `INST_FPU_MADD: dpi_trace("MADD"); + `INST_FPU_NMSUB: dpi_trace("NMSUB"); + `INST_FPU_NMADD: dpi_trace("NMADD"); + `INST_FPU_CVTWS: dpi_trace("CVTWS"); + `INST_FPU_CVTWUS:dpi_trace("CVTWUS"); + `INST_FPU_CVTSW: dpi_trace("CVTSW"); + `INST_FPU_CVTSWU:dpi_trace("CVTSWU"); + `INST_FPU_CLASS: dpi_trace("CLASS"); + `INST_FPU_CMP: dpi_trace("CMP"); + `INST_FPU_MISC: begin case (op_mod) 0: dpi_trace("SGNJ"); 1: dpi_trace("SGNJN"); @@ -130,12 +130,12 @@ task print_ex_op ( endcase end `EX_GPU: begin - case (`GPU_BITS'(op_type)) - `GPU_TMC: dpi_trace("TMC"); - `GPU_WSPAWN:dpi_trace("WSPAWN"); - `GPU_SPLIT: dpi_trace("SPLIT"); - `GPU_JOIN: dpi_trace("JOIN"); - `GPU_BAR: dpi_trace("BAR"); + case (`INST_GPU_BITS'(op_type)) + `INST_GPU_TMC: 
dpi_trace("TMC"); + `INST_GPU_WSPAWN:dpi_trace("WSPAWN"); + `INST_GPU_SPLIT: dpi_trace("SPLIT"); + `INST_GPU_JOIN: dpi_trace("JOIN"); + `INST_GPU_BAR: dpi_trace("BAR"); default: dpi_trace("?"); endcase end diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index d444db0d..785ce444 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -21,7 +21,7 @@ typedef struct packed { logic NX; // 0-Inexact } fflags_t; -`define FFG_BITS $bits(fflags_t) +`define FFLAGS_BITS $bits(fflags_t) typedef struct packed { logic valid; diff --git a/hw/rtl/fp_cores/VX_fp_cvt.v b/hw/rtl/fp_cores/VX_fp_cvt.v index 1e823363..25e178c3 100644 --- a/hw/rtl/fp_cores/VX_fp_cvt.v +++ b/hw/rtl/fp_cores/VX_fp_cvt.v @@ -15,7 +15,7 @@ module VX_fp_cvt #( input wire [TAGW-1:0] tag_in, - input wire [`FRM_BITS-1:0] frm, + input wire [`INST_FRM_BITS-1:0] frm, input wire is_itof, input wire is_signed, @@ -101,7 +101,7 @@ module VX_fp_cvt #( wire stall; VX_pipe_register #( - .DATAW (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)), + .DATAW (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)), .RESETW (1) ) pipe_reg0 ( .clk (clk), @@ -167,7 +167,7 @@ module VX_fp_cvt #( wire [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1; VX_pipe_register #( - .DATAW (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)), + .DATAW (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)), .RESETW (1) ) pipe_reg1 ( .clk (clk), @@ -253,7 +253,7 @@ module VX_fp_cvt #( wire [LANES-1:0] of_before_round_s2; VX_pipe_register #( - .DATAW (1 + TAGW + 1 + 1 + `FRM_BITS + LANES * ($bits(fp_type_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)), + .DATAW (1 + TAGW + 1 + 1 + `INST_FRM_BITS + LANES * ($bits(fp_type_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)), .RESETW (1) ) pipe_reg2 ( .clk (clk), @@ -435,7 +435,7 @@ module VX_fp_cvt #( 
assign stall = ~ready_out && valid_out; VX_pipe_register #( - .DATAW (1 + TAGW + (LANES * 32) + (LANES * `FFG_BITS)), + .DATAW (1 + TAGW + (LANES * 32) + (LANES * `FFLAGS_BITS)), .RESETW (1) ) pipe_reg4 ( .clk (clk), diff --git a/hw/rtl/fp_cores/VX_fp_div.v b/hw/rtl/fp_cores/VX_fp_div.v index 8dd110d3..163b4c0e 100644 --- a/hw/rtl/fp_cores/VX_fp_div.v +++ b/hw/rtl/fp_cores/VX_fp_div.v @@ -16,7 +16,7 @@ module VX_fp_div #( input wire [TAGW-1:0] tag_in, - input wire [`FRM_BITS-1:0] frm, + input wire [`INST_FRM_BITS-1:0] frm, input wire [LANES-1:0][31:0] dataa, input wire [LANES-1:0][31:0] datab, diff --git a/hw/rtl/fp_cores/VX_fp_fma.v b/hw/rtl/fp_cores/VX_fp_fma.v index faf26e87..13ee473b 100644 --- a/hw/rtl/fp_cores/VX_fp_fma.v +++ b/hw/rtl/fp_cores/VX_fp_fma.v @@ -16,7 +16,7 @@ module VX_fp_fma #( input wire [TAGW-1:0] tag_in, - input wire [`FRM_BITS-1:0] frm, + input wire [`INST_FRM_BITS-1:0] frm, input wire do_madd, input wire do_sub, diff --git a/hw/rtl/fp_cores/VX_fp_ncomp.v b/hw/rtl/fp_cores/VX_fp_ncomp.v index faf6ff38..11300a87 100644 --- a/hw/rtl/fp_cores/VX_fp_ncomp.v +++ b/hw/rtl/fp_cores/VX_fp_ncomp.v @@ -15,8 +15,8 @@ module VX_fp_ncomp #( input wire [TAGW-1:0] tag_in, - input wire [`FPU_BITS-1:0] op_type, - input wire [`FRM_BITS-1:0] frm, + input wire [`INST_FPU_BITS-1:0] op_type, + input wire [`INST_FRM_BITS-1:0] frm, input wire [LANES-1:0][31:0] dataa, input wire [LANES-1:0][31:0] datab, @@ -77,8 +77,8 @@ module VX_fp_ncomp #( wire valid_in_s0; wire [TAGW-1:0] tag_in_s0; - wire [`FPU_BITS-1:0] op_type_s0; - wire [`FRM_BITS-1:0] frm_s0; + wire [`INST_FPU_BITS-1:0] op_type_s0; + wire [`INST_FRM_BITS-1:0] frm_s0; wire [LANES-1:0][31:0] dataa_s0, datab_s0; wire [LANES-1:0] a_sign_s0, b_sign_s0; wire [LANES-1:0][7:0] a_exponent_s0; @@ -89,7 +89,7 @@ module VX_fp_ncomp #( wire stall; VX_pipe_register #( - .DATAW (1 + TAGW + `FPU_BITS + `FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_type_t) + 1 + 1)), + .DATAW (1 + TAGW + `INST_FPU_BITS + 
`INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_type_t) + 1 + 1)), .RESETW (1), .DEPTH (0) ) pipe_reg0 ( @@ -164,7 +164,7 @@ module VX_fp_ncomp #( for (genvar i = 0; i < LANES; i++) begin always @(*) begin case (frm_s0) - `FRM_RNE: begin // LE + `INST_FRM_RNE: begin // LE fcmp_fflags[i] = 5'h0; if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin fcmp_res[i] = 32'h0; @@ -173,7 +173,7 @@ module VX_fp_ncomp #( fcmp_res[i] = {31'h0, (a_smaller_s0[i] | ab_equal_s0[i])}; end end - `FRM_RTZ: begin // LS + `INST_FRM_RTZ: begin // LS fcmp_fflags[i] = 5'h0; if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin fcmp_res[i] = 32'h0; @@ -182,7 +182,7 @@ module VX_fp_ncomp #( fcmp_res[i] = {31'h0, (a_smaller_s0[i] & ~ab_equal_s0[i])}; end end - `FRM_RDN: begin // EQ + `INST_FRM_RDN: begin // EQ fcmp_fflags[i] = 5'h0; if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin fcmp_res[i] = 32'h0; @@ -207,11 +207,11 @@ module VX_fp_ncomp #( for (genvar i = 0; i < LANES; i++) begin always @(*) begin case (op_type_s0) - `FPU_CLASS: begin + `INST_FPU_CLASS: begin tmp_result[i] = fclass_mask[i]; tmp_fflags[i] = 'x; end - `FPU_CMP: begin + `INST_FPU_CMP: begin tmp_result[i] = fcmp_res[i]; tmp_fflags[i] = fcmp_fflags[i]; end @@ -238,15 +238,15 @@ module VX_fp_ncomp #( end end - wire has_fflags_s0 = ((op_type_s0 == `FPU_MISC) - && (frm_s0 == 3 // MIN - || frm_s0 == 4)) // MAX - || (op_type_s0 == `FPU_CMP); // CMP + wire has_fflags_s0 = ((op_type_s0 == `INST_FPU_MISC) + && (frm_s0 == 3 // MIN + || frm_s0 == 4)) // MAX + || (op_type_s0 == `INST_FPU_CMP); // CMP assign stall = ~ready_out && valid_out; VX_pipe_register #( - .DATAW (1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS)), + .DATAW (1 + TAGW + (LANES * 32) + 1 + (LANES * `FFLAGS_BITS)), .RESETW (1) ) pipe_reg1 ( .clk (clk), diff --git a/hw/rtl/fp_cores/VX_fp_rounding.v b/hw/rtl/fp_cores/VX_fp_rounding.v index 9e544e44..654f6e8d 100644 --- a/hw/rtl/fp_cores/VX_fp_rounding.v +++ b/hw/rtl/fp_cores/VX_fp_rounding.v @@ 
-34,7 +34,7 @@ module VX_fp_rounding #( always @(*) begin case (rnd_mode_i) - `FRM_RNE: // Decide accoring to round/sticky bits + `INST_FRM_RNE: // Decide according to round/sticky bits case (round_sticky_bits_i) 2'b00, 2'b01: round_up = 1'b0; // < ulp/2 away, round down @@ -42,10 +42,10 @@ module VX_fp_rounding #( 2'b11: round_up = 1'b1; // > ulp/2 away, round up default: round_up = 1'bx; endcase - `FRM_RTZ: round_up = 1'b0; // always round down - `FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if - - `FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if + - `FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up + `INST_FRM_RTZ: round_up = 1'b0; // always round down + `INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if - + `INST_FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if + + `INST_FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up default: round_up = 1'bx; // propagate x endcase end @@ -58,7 +58,7 @@ module VX_fp_rounding #( // In case of effective subtraction (thus signs of addition operands must have differed) and a // true zero result, the result sign is '-' in case of RDN and '+' for other modes. - assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `FRM_RDN) + assign sign_o = (exact_zero_o && effective_subtraction_i) ? 
(rnd_mode_i == `INST_FRM_RDN) : sign_i; endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fp_sqrt.v b/hw/rtl/fp_cores/VX_fp_sqrt.v index 1debc04d..dc1b2bcb 100644 --- a/hw/rtl/fp_cores/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/VX_fp_sqrt.v @@ -16,7 +16,7 @@ module VX_fp_sqrt #( input wire [TAGW-1:0] tag_in, - input wire [`FRM_BITS-1:0] frm, + input wire [`INST_FRM_BITS-1:0] frm, input wire [LANES-1:0][31:0] dataa, output wire [LANES-1:0][31:0] result, diff --git a/hw/rtl/fp_cores/VX_fpu_dpi.v b/hw/rtl/fp_cores/VX_fpu_dpi.v index 10dab769..bd87485b 100644 --- a/hw/rtl/fp_cores/VX_fpu_dpi.v +++ b/hw/rtl/fp_cores/VX_fpu_dpi.v @@ -14,8 +14,8 @@ module VX_fpu_dpi #( input wire [TAGW-1:0] tag_in, - input wire [`FPU_BITS-1:0] op_type, - input wire [`MOD_BITS-1:0] frm, + input wire [`INST_FPU_BITS-1:0] op_type, + input wire [`INST_MOD_BITS-1:0] frm, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, @@ -76,21 +76,21 @@ module VX_fpu_dpi #( is_fsgnjx = 0; case (op_type) - `FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end - `FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end - `FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end - `FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end - `FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end - `FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end - `FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end - `FPU_DIV: begin core_select = FPU_DIV; end - `FPU_SQRT: begin core_select = FPU_SQRT; end - `FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end - `FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end - `FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end - `FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end - `FPU_CLASS: begin core_select = FPU_NCP; is_fclss = 1; end - `FPU_CMP: begin core_select = FPU_NCP; + `INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end + `INST_FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; 
end + `INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end + `INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end + `INST_FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end + `INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end + `INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end + `INST_FPU_DIV: begin core_select = FPU_DIV; end + `INST_FPU_SQRT: begin core_select = FPU_SQRT; end + `INST_FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end + `INST_FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end + `INST_FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end + `INST_FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end + `INST_FPU_CLASS: begin core_select = FPU_NCP; is_fclss = 1; end + `INST_FPU_CMP: begin core_select = FPU_NCP; is_fle = (frm == 0); is_flt = (frm == 1); is_feq = (frm == 2); diff --git a/hw/rtl/fp_cores/VX_fpu_fpga.v b/hw/rtl/fp_cores/VX_fpu_fpga.v index 791d1f4c..f07afb0d 100644 --- a/hw/rtl/fp_cores/VX_fpu_fpga.v +++ b/hw/rtl/fp_cores/VX_fpu_fpga.v @@ -11,8 +11,8 @@ module VX_fpu_fpga #( input wire [TAGW-1:0] tag_in, - input wire [`FPU_BITS-1:0] op_type, - input wire [`MOD_BITS-1:0] frm, + input wire [`INST_FPU_BITS-1:0] op_type, + input wire [`INST_MOD_BITS-1:0] frm, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, @@ -54,19 +54,19 @@ module VX_fpu_fpga #( is_itof = 0; is_signed = 0; case (op_type) - `FPU_ADD: begin core_select = FPU_FMA; end - `FPU_SUB: begin core_select = FPU_FMA; do_sub = 1; end - `FPU_MUL: begin core_select = FPU_FMA; do_neg = 1; end - `FPU_MADD: begin core_select = FPU_FMA; do_madd = 1; end - `FPU_MSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; end - `FPU_NMADD: begin core_select = FPU_FMA; do_madd = 1; do_neg = 1; end - `FPU_NMSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end - `FPU_DIV: begin core_select = FPU_DIV; end - `FPU_SQRT: begin core_select = FPU_SQRT; end - `FPU_CVTWS: begin 
core_select = FPU_CVT; is_signed = 1; end - `FPU_CVTWUS: begin core_select = FPU_CVT; end - `FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end - `FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; end + `INST_FPU_ADD: begin core_select = FPU_FMA; end + `INST_FPU_SUB: begin core_select = FPU_FMA; do_sub = 1; end + `INST_FPU_MUL: begin core_select = FPU_FMA; do_neg = 1; end + `INST_FPU_MADD: begin core_select = FPU_FMA; do_madd = 1; end + `INST_FPU_MSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; end + `INST_FPU_NMADD: begin core_select = FPU_FMA; do_madd = 1; do_neg = 1; end + `INST_FPU_NMSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end + `INST_FPU_DIV: begin core_select = FPU_DIV; end + `INST_FPU_SQRT: begin core_select = FPU_SQRT; end + `INST_FPU_CVTWS: begin core_select = FPU_CVT; is_signed = 1; end + `INST_FPU_CVTWUS: begin core_select = FPU_CVT; end + `INST_FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end + `INST_FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; end default: begin core_select = FPU_NCP; end endcase end diff --git a/hw/rtl/fp_cores/VX_fpu_fpnew.v b/hw/rtl/fp_cores/VX_fpu_fpnew.v index 3a8a8106..57a73bff 100644 --- a/hw/rtl/fp_cores/VX_fpu_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpu_fpnew.v @@ -19,8 +19,8 @@ module VX_fpu_fpnew input wire [TAGW-1:0] tag_in, - input wire [`FPU_BITS-1:0] op_type, - input wire [`MOD_BITS-1:0] frm, + input wire [`INST_FPU_BITS-1:0] op_type, + input wire [`INST_MOD_BITS-1:0] frm, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, @@ -81,7 +81,7 @@ module VX_fpu_fpnew fpnew_pkg::status_t [`NUM_THREADS-1:0] fpu_status; reg [FOP_BITS-1:0] fpu_op; - reg [`FRM_BITS-1:0] fpu_rnd; + reg [`INST_FRM_BITS-1:0] fpu_rnd; reg fpu_op_mod; reg fpu_has_fflags, fpu_has_fflags_out; @@ -95,38 +95,38 @@ module VX_fpu_fpnew fpu_operands[2] = datac; case (op_type) - `FPU_ADD: begin + `INST_FPU_ADD: begin fpu_op = 
fpnew_pkg::ADD; fpu_operands[1] = dataa; fpu_operands[2] = datab; end - `FPU_SUB: begin + `INST_FPU_SUB: begin fpu_op = fpnew_pkg::ADD; fpu_operands[1] = dataa; fpu_operands[2] = datab; fpu_op_mod = 1; end - `FPU_MUL: begin fpu_op = fpnew_pkg::MUL; end - `FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end - `FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end - `FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end - `FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end - `FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end - `FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end - `FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end - `FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end - `FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end - `FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end - `FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end - `FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end - `FPU_MISC: begin + `INST_FPU_MUL: begin fpu_op = fpnew_pkg::MUL; end + `INST_FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end + `INST_FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end + `INST_FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end + `INST_FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end + `INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end + `INST_FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end + `INST_FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end + `INST_FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end + `INST_FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end + `INST_FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end + `INST_FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end + `INST_FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end + `INST_FPU_MISC: begin case (frm) - 0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end - 1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end - 2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; 
fpu_has_fflags = 0; end - 3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end - 4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end - default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end + 0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `INST_FRM_RNE; fpu_has_fflags = 0; end + 1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `INST_FRM_RTZ; fpu_has_fflags = 0; end + 2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `INST_FRM_RDN; fpu_has_fflags = 0; end + 3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `INST_FRM_RNE; end + 4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `INST_FRM_RTZ; end + default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `INST_FRM_RUP; fpu_has_fflags = 0; end endcase end default:; diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v index 2df383fb..8ae714af 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.v +++ b/hw/rtl/interfaces/VX_alu_req_if.v @@ -10,8 +10,8 @@ interface VX_alu_req_if (); wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; wire [31:0] next_PC; - wire [`ALU_BITS-1:0] op_type; - wire [`MOD_BITS-1:0] op_mod; + wire [`INST_ALU_BITS-1:0] op_type; + wire [`INST_MOD_BITS-1:0] op_mod; wire use_PC; wire use_imm; wire [31:0] imm; diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index c02a67b4..3bcf635a 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -9,7 +9,7 @@ interface VX_csr_req_if (); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; - wire [`CSR_BITS-1:0] op_type; + wire [`INST_CSR_BITS-1:0] op_type; wire [`CSR_ADDR_BITS-1:0] addr; wire [31:0] rs1_data; wire use_imm; diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index c8465911..e92f4592 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -10,8 +10,8 @@ interface VX_decode_if (); wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; wire [`EX_BITS-1:0] ex_type; - 
wire [`OP_BITS-1:0] op_type; - wire [`MOD_BITS-1:0] op_mod; + wire [`INST_OP_BITS-1:0] op_type; + wire [`INST_MOD_BITS-1:0] op_mod; wire wb; wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rs1; diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v index f03bddd4..92be96cf 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.v +++ b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -9,8 +9,8 @@ interface VX_fpu_req_if (); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; - wire [`FPU_BITS-1:0] op_type; - wire [`MOD_BITS-1:0] op_mod; + wire [`INST_FPU_BITS-1:0] op_type; + wire [`INST_MOD_BITS-1:0] op_mod; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs3_data; diff --git a/hw/rtl/interfaces/VX_fpu_to_csr_if.v b/hw/rtl/interfaces/VX_fpu_to_csr_if.v index cd101820..3f27fa02 100644 --- a/hw/rtl/interfaces/VX_fpu_to_csr_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_csr_if.v @@ -10,7 +10,7 @@ interface VX_fpu_to_csr_if (); fflags_t write_fflags; wire [`NW_BITS-1:0] read_wid; - wire [`FRM_BITS-1:0] read_frm; + wire [`INST_FRM_BITS-1:0] read_frm; endinterface diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 1b49fda0..27ee1316 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -11,7 +11,7 @@ interface VX_gpu_req_if(); wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; wire [31:0] next_PC; - wire [`GPU_BITS-1:0] op_type; + wire [`INST_GPU_BITS-1:0] op_type; wire [`NT_BITS-1:0] tid; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [31:0] rs2_data; diff --git a/hw/rtl/interfaces/VX_ibuffer_if.v b/hw/rtl/interfaces/VX_ibuffer_if.v index 30f93026..d2277e48 100644 --- a/hw/rtl/interfaces/VX_ibuffer_if.v +++ b/hw/rtl/interfaces/VX_ibuffer_if.v @@ -11,8 +11,8 @@ interface VX_ibuffer_if (); wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; wire [`EX_BITS-1:0] ex_type; - wire [`OP_BITS-1:0] op_type; - wire [`MOD_BITS-1:0] op_mod; + wire 
[`INST_OP_BITS-1:0] op_type; + wire [`INST_MOD_BITS-1:0] op_mod; wire wb; wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rs1; diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index c9797b0f..fed1f270 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -9,7 +9,7 @@ interface VX_lsu_req_if (); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; - wire [`LSU_BITS-1:0] op_type; + wire [`INST_LSU_BITS-1:0] op_type; wire is_fence; wire [`NUM_THREADS-1:0][31:0] store_data; wire [`NUM_THREADS-1:0][31:0] base_addr; diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index b5b4a11d..4af5231a 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -173,8 +173,8 @@ "issue_tmask":"`NUM_THREADS", "issue_pc": 32, "issue_ex_type":"`EX_BITS", - "issue_op_type":"`OP_BITS", - "issue_op_mod":"`MOD_BITS", + "issue_op_type":"`INST_OP_BITS", + "issue_op_mod":"`INST_MOD_BITS", "issue_wb": 1, "issue_rd":"`NR_BITS", "issue_rs1":"`NR_BITS", From 53c8cddccfc079c673afa273134d87763ed966e5 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 30 Aug 2021 10:25:52 -0700 Subject: [PATCH 16/16] LKG build - minor update --- .travis.yml | 9 ++++++--- ci/regression.sh | 42 +++++++++++++++++++++++++++++------------- hw/rtl/VX_cluster.v | 4 ++-- hw/rtl/VX_define.vh | 26 +++++++++++++------------- 4 files changed, 50 insertions(+), 31 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6d1403c3..b4ee6aaf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,15 +40,18 @@ jobs: - stage: test name: config script: cp -r $PWD ../build4 && cd ../build4 && ./ci/travis_run.py ./ci/regression.sh -config + - stage: test + name: stress0 + script: cp -r $PWD ../build5 && cd ../build5 && ./ci/travis_run.py ./ci/regression.sh -stress0 - stage: test name: stress1 - script: cp -r $PWD ../build5 && cd ../build5 && ./ci/travis_run.py ./ci/regression.sh -stress1 + script: cp -r $PWD ../build6 && cd ../build6 && 
./ci/travis_run.py ./ci/regression.sh -stress1 - stage: test name: stress2 - script: cp -r $PWD ../build6 && cd ../build6 && ./ci/travis_run.py ./ci/regression.sh -stress2 + script: cp -r $PWD ../build7 && cd ../build7 && ./ci/travis_run.py ./ci/regression.sh -stress2 - stage: test name: compiler - script: cp -r $PWD ../build7 && cd ../build7 && ./ci/travis_run.py /ci/test_compiler.sh + script: cp -r $PWD ../build8 && cd ../build8 && ./ci/travis_run.py /ci/test_compiler.sh after_success: # Gather code coverage diff --git a/ci/regression.sh b/ci/regression.sh index 7821d626..135dd562 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -97,32 +97,45 @@ CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver= # test 128-bit DRAM block CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo -# test verilator reset values -CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm -CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm - # test long memory latency CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo echo "configuration tests done!" } +stress0() +{ +echo "begin stress0 tests..." 
+ +# test verilator reset values +CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=sgemm +CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=sgemm +FPU_CORE=FPU_DEFAULT CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood +FPU_CORE=FPU_DEFAULT CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood +CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr +CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr +CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=printf +CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=printf + +echo "stress0 tests done!" +} + stress1() { -echo "begin stress tests..." +echo "begin stress1 tests..." ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm --args="-n256" -echo "stress tests done!" +echo "stress1 tests done!" } stress2() { -echo "begin stress tests..." +echo "begin stress2 tests..." ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --clusters=2 --l3cache --app=sgemm --args="-n256" -echo "stress tests done!" +echo "stress2 tests done!" 
} usage() @@ -133,13 +146,15 @@ usage() while [ "$1" != "" ]; do case $1 in -coverage ) coverage - ;; + ;; -cluster ) cluster - ;; + ;; -debug ) debug - ;; + ;; -config ) config ;; + -stress0 ) stress0 + ;; -stress1 ) stress1 ;; -stress2 ) stress2 @@ -148,12 +163,13 @@ while [ "$1" != "" ]; do cluster debug config + stress0 stress1 stress2 - ;; + ;; -h | --help ) usage exit - ;; + ;; * ) usage exit 1 esac diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 64f74ae3..04719719 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -148,8 +148,8 @@ module VX_cluster #( VX_mem_arb #( .NUM_REQS (`NUM_CORES), - .DATA_WIDTH (`L2MEM_DATA_WIDTH), - .ADDR_WIDTH (`L2MEM_ADDR_WIDTH), + .DATA_WIDTH (`DMEM_DATA_WIDTH), + .ADDR_WIDTH (`DMEM_ADDR_WIDTH), .TAG_IN_WIDTH (`XMEM_TAG_WIDTH), .TAG_SEL_IDX (1), // Skip 0 for NC flag .BUFFERED_REQ (1), diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index fe1b12c3..dad6a84c 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -247,14 +247,14 @@ // Cache ID `define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0) -// Block size in bytes -`define ICACHE_LINE_SIZE `L1_BLOCK_SIZE +// Number of banks +`define INUM_BANKS 1 // Word size in bytes `define IWORD_SIZE 4 -// Number of banks -`define INUM_BANKS 1 +// Block size in bytes +`define ICACHE_LINE_SIZE `L1_BLOCK_SIZE // Core request address bits `define ICORE_ADDR_WIDTH (32-`CLOG2(`IWORD_SIZE)) @@ -285,12 +285,12 @@ // Cache ID `define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1) -// Block size in bytes -`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE - // Word size in bytes `define DWORD_SIZE 4 +// Block size in bytes +`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE + // Core request address bits `define DCORE_ADDR_WIDTH (32-`CLOG2(`DWORD_SIZE)) @@ -337,12 +337,12 @@ // Cache ID `define L2CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID) -// Block size in bytes -`define L2CACHE_LINE_SIZE `MEM_BLOCK_SIZE - // Word size 
in bytes `define L2WORD_SIZE `DCACHE_LINE_SIZE +// Block size in bytes +`define L2CACHE_LINE_SIZE (`L2_ENABLE ? `MEM_BLOCK_SIZE : `L2WORD_SIZE) + // Input request tag bits `define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES)) @@ -369,12 +369,12 @@ // Cache ID `define L3CACHE_ID 0 -// Block size in bytes -`define L3CACHE_LINE_SIZE `MEM_BLOCK_SIZE - // Word size in bytes `define L3WORD_SIZE `L2CACHE_LINE_SIZE +// Block size in bytes +`define L3CACHE_LINE_SIZE (`L3_ENABLE ? `MEM_BLOCK_SIZE : `L3WORD_SIZE) + // Input request tag bits `define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS))