From 914b680aedafa47154d4cbd796077f00f1b3ea0c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 15 Dec 2023 14:09:51 -0800 Subject: [PATCH 1/4] operands optimization minor updates minor updates minor update operands optimization minor updates minor updates --- hw/rtl/VX_gpu_pkg.sv | 50 +++++---- hw/rtl/core/VX_dispatch_unit.sv | 10 +- hw/rtl/core/VX_gather_unit.sv | 14 +-- hw/rtl/core/VX_lsu_unit.sv | 2 +- hw/rtl/core/VX_operands.sv | 184 +++++++++++++++----------------- hw/rtl/core/VX_scoreboard.sv | 41 +++---- 6 files changed, 143 insertions(+), 158 deletions(-) diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 4ece6c9c..668b53ee 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -190,42 +190,46 @@ package VX_gpu_pkg; /////////////////////////////// Issue parameters ////////////////////////// - localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH); + localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH); + localparam ISSUE_ISW_W = `UP(ISSUE_ISW); localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH; - localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO); - localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO)); - + localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO); + localparam ISSUE_WIS_W = `UP(ISSUE_WIS); + `IGNORE_UNUSED_BEGIN - function logic [ISSUE_IDX_W-1:0] wid_to_isw( + function logic [`NW_WIDTH-1:0] wis_to_wid( + input logic [ISSUE_WIS_W-1:0] wis, + input logic [ISSUE_ISW_W-1:0] isw + ); + if (ISSUE_WIS == 0) begin + wis_to_wid = `NW_WIDTH'(isw); + end else if (ISSUE_ISW == 0) begin + wis_to_wid = `NW_WIDTH'(wis); + end else begin + wis_to_wid = `NW_WIDTH'({wis, isw}); + end + endfunction + + function logic [ISSUE_ISW_W-1:0] wid_to_isw( input logic [`NW_WIDTH-1:0] wid ); - if (`ISSUE_WIDTH > 1) begin - wid_to_isw = ISSUE_IDX_W'(wid); + if (ISSUE_ISW != 0) begin + wid_to_isw = wid[ISSUE_ISW_W-1:0]; end else begin wid_to_isw = 0; end endfunction -`IGNORE_UNUSED_END - - function logic [`NW_WIDTH-1:0] wis_to_wid( - input logic [ISSUE_WIS_W-1:0] wis, - 
input logic [ISSUE_IDX_W-1:0] isw - ); - wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH))); - endfunction function logic [ISSUE_WIS_W-1:0] wid_to_wis( input logic [`NW_WIDTH-1:0] wid ); - wid_to_wis = ISSUE_WIS_W'({1'b0, wid} >> `CLOG2(`ISSUE_WIDTH)); - endfunction - - function logic [ISSUE_ADDRW-1:0] wis_to_addr( - input logic [`NR_BITS-1:0] rid, - input logic [ISSUE_WIS_W-1:0] wis - ); - wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO))); + if (ISSUE_WIS != 0) begin + wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW); + end else begin + wid_to_wis = 0; + end endfunction +`IGNORE_UNUSED_END endpackage diff --git a/hw/rtl/core/VX_dispatch_unit.sv b/hw/rtl/core/VX_dispatch_unit.sv index 586acc0b..6e36a33b 100644 --- a/hw/rtl/core/VX_dispatch_unit.sv +++ b/hw/rtl/core/VX_dispatch_unit.sv @@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( assign block_done[block_idx] = ~valid_p || ready_p; end - wire [ISSUE_IDX_W-1:0] wsi; + wire [ISSUE_ISW_W-1:0] isw; if (BATCH_COUNT != 1) begin if (BLOCK_SIZE != 1) begin - assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)}; + assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)}; end else begin - assign wsi = batch_idx; + assign isw = batch_idx; end end else begin - assign wsi = block_idx; + assign isw = block_idx; end `RESET_RELAY(buf_out_reset, reset); - wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi); + wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); VX_elastic_buffer #( .DATAW (OUT_DATAW), diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index e3dc935d..21ae4485 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #( wire [BLOCK_SIZE-1:0] commit_in_valid; wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data; wire 
[BLOCK_SIZE-1:0] commit_in_ready; - wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi; + wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw; for (genvar i = 0; i < BLOCK_SIZE; ++i) begin assign commit_in_valid[i] = commit_in_if[i].valid; @@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #( assign commit_in_if[i].ready = commit_in_ready[i]; if (BLOCK_SIZE != `ISSUE_WIDTH) begin if (BLOCK_SIZE != 1) begin - assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; + assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; end else begin - assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W]; + assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W]; end end else begin - assign commit_in_wsi[i] = BLOCK_SIZE_W'(i); + assign commit_in_isw[i] = BLOCK_SIZE_W'(i); end end @@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #( commit_out_data[i] = 'x; end for (integer i = 0; i < BLOCK_SIZE; ++i) begin - commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i]; - commit_out_data[commit_in_wsi[i]] = commit_in_data[i]; + commit_out_valid[commit_in_isw[i]] = commit_in_valid[i]; + commit_out_data[commit_in_isw[i]] = commit_in_data[i]; end end for (genvar i = 0; i < BLOCK_SIZE; ++i) begin - assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]]; + assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]]; end for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index 3383f70f..1e0a09b8 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (RSP_ARB_DATAW), - .OUT_REG (2) + .OUT_REG (3) ) rsp_arb ( .clk (clk), .reset (commit_reset), diff --git a/hw/rtl/core/VX_operands.sv 
b/hw/rtl/core/VX_operands.sv index 3ff5df46..ee0c493b 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #( ); `UNUSED_PARAM (CORE_ID) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS; + localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO); localparam STATE_IDLE = 2'd0; localparam STATE_FETCH1 = 2'd1; @@ -46,9 +47,11 @@ module VX_operands import VX_gpu_pkg::*; #( reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0]; reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n; + reg valid_out_r; + reg [DATAW-1:0] data_out_r; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n; - reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; + reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; reg [STATE_BITS-1:0] state, state_n; reg [`NR_BITS-1:0] rs2, rs2_n; @@ -57,11 +60,11 @@ module VX_operands import VX_gpu_pkg::*; #( reg rs3_ready, rs3_ready_n; reg data_ready, data_ready_n; + wire ready_out = operands_if[i].ready; + wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0); wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0); - wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0); - - VX_operands_if staging_if(); + wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0); always @(*) begin state_n = state; @@ -82,7 +85,7 @@ module VX_operands import VX_gpu_pkg::*; #( case (state) STATE_IDLE: begin - if (staging_if.valid && staging_if.ready) begin + if (valid_out_r && ready_out) begin data_ready_n = 0; end if (scoreboard_if[i].valid && data_ready_n == 0) begin @@ -170,33 +173,86 @@ module VX_operands import VX_gpu_pkg::*; #( end always @(posedge clk) begin - if (reset) begin + if (reset) begin state <= STATE_IDLE; - gpr_rd_rid <= '0; - gpr_rd_wis <= '0; cache_eop <= {ISSUE_RATIO{1'b1}}; data_ready <= 0; + valid_out_r <= 0; end else begin state <= state_n; 
- rs2 <= rs2_n; - rs3 <= rs3_n; - rs2_ready <= rs2_ready_n; - rs3_ready <= rs3_ready_n; - rs1_data <= rs1_data_n; - rs2_data <= rs2_data_n; - rs3_data <= rs3_data_n; - gpr_rd_rid <= gpr_rd_rid_n; - gpr_rd_wis <= gpr_rd_wis_n; - cache_data <= cache_data_n; - cache_reg <= cache_reg_n; - cache_tmask <= cache_tmask_n; cache_eop <= cache_eop_n; - data_ready <= data_ready_n; + data_ready <= data_ready_n; + if (~valid_out_r) begin + valid_out_r <= scoreboard_if[i].valid && data_ready; + end else if (ready_out) begin + valid_out_r <= 0; + end end - end + + if (~valid_out_r) begin + data_out_r <= {scoreboard_if[i].data.uuid, + scoreboard_if[i].data.wis, + scoreboard_if[i].data.tmask, + scoreboard_if[i].data.PC, + scoreboard_if[i].data.wb, + scoreboard_if[i].data.ex_type, + scoreboard_if[i].data.op_type, + scoreboard_if[i].data.op_mod, + scoreboard_if[i].data.use_PC, + scoreboard_if[i].data.use_imm, + scoreboard_if[i].data.imm, + scoreboard_if[i].data.rd}; + end + + gpr_rd_rid <= gpr_rd_rid_n; + gpr_rd_wis <= gpr_rd_wis_n; + rs2_ready <= rs2_ready_n; + rs3_ready <= rs3_ready_n; + rs2 <= rs2_n; + rs3 <= rs3_n; + rs1_data <= rs1_data_n; + rs2_data <= rs2_data_n; + rs3_data <= rs3_data_n; + cache_data <= cache_data_n; + cache_reg <= cache_reg_n; + cache_tmask <= cache_tmask_n; + end + + assign operands_if[i].valid = valid_out_r; + assign {operands_if[i].data.uuid, + operands_if[i].data.wis, + operands_if[i].data.tmask, + operands_if[i].data.PC, + operands_if[i].data.wb, + operands_if[i].data.ex_type, + operands_if[i].data.op_type, + operands_if[i].data.op_mod, + operands_if[i].data.use_PC, + operands_if[i].data.use_imm, + operands_if[i].data.imm, + operands_if[i].data.rd} = data_out_r; + assign operands_if[i].data.rs1_data = rs1_data; + assign operands_if[i].data.rs2_data = rs2_data; + assign operands_if[i].data.rs3_data = rs3_data; + + assign scoreboard_if[i].ready = ~valid_out_r && data_ready; // GPR banks + reg [RAM_ADDRW-1:0] gpr_rd_addr; + wire [RAM_ADDRW-1:0] gpr_wr_addr; 
+ if (ISSUE_WIS != 0) begin + assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd}; + always @(posedge clk) begin + gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n}; + end + end else begin + assign gpr_wr_addr = writeback_if[i].data.rd; + always @(posedge clk) begin + gpr_rd_addr <= gpr_rd_rid_n; + end + end + `ifdef GPR_RESET reg wr_enabled = 0; always @(posedge clk) begin @@ -204,10 +260,8 @@ module VX_operands import VX_gpu_pkg::*; #( wr_enabled <= 1; end end - `else - wire wr_enabled = 1; `endif - + for (genvar j = 0; j < `NUM_THREADS; ++j) begin VX_dp_ram #( .DATAW (`XLEN), @@ -221,81 +275,17 @@ module VX_operands import VX_gpu_pkg::*; #( .clk (clk), .read (1'b1), `UNUSED_PIN (wren), - .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), - .waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)), + `ifdef GPR_RESET + .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), + `else + .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]), + `endif + .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), - .raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)), + .raddr (gpr_rd_addr), .rdata (gpr_rd_data[j]) ); end - - // staging buffer - - `RESET_RELAY (stg_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW) - ) stg_buf ( - .clk (clk), - .reset (stg_buf_reset), - .valid_in (scoreboard_if[i].valid), - .ready_in (scoreboard_if[i].ready), - .data_in ({ - scoreboard_if[i].data.uuid, - scoreboard_if[i].data.wis, - scoreboard_if[i].data.tmask, - scoreboard_if[i].data.PC, - scoreboard_if[i].data.wb, - scoreboard_if[i].data.ex_type, - scoreboard_if[i].data.op_type, - scoreboard_if[i].data.op_mod, - scoreboard_if[i].data.use_PC, - scoreboard_if[i].data.use_imm, - scoreboard_if[i].data.imm, - scoreboard_if[i].data.rd}), - .data_out ({ - staging_if.data.uuid, - staging_if.data.wis, - staging_if.data.tmask, - staging_if.data.PC, - staging_if.data.wb, - staging_if.data.ex_type, - 
staging_if.data.op_type, - staging_if.data.op_mod, - staging_if.data.use_PC, - staging_if.data.use_imm, - staging_if.data.imm, - staging_if.data.rd}), - .valid_out (staging_if.valid), - .ready_out (staging_if.ready) - ); - - assign staging_if.data.rs1_data = rs1_data; - assign staging_if.data.rs2_data = rs2_data; - assign staging_if.data.rs3_data = rs3_data; - - // output buffer - - wire valid_stg, ready_stg; - assign valid_stg = staging_if.valid && data_ready; - assign staging_if.ready = ready_stg && data_ready; - - `RESET_RELAY (out_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)), - .SIZE (2), - .OUT_REG (2) - ) out_buf ( - .clk (clk), - .reset (out_buf_reset), - .valid_in (valid_stg), - .ready_in (ready_stg), - .data_in (staging_if.data), - .data_out (operands_if[i].data), - .valid_out (operands_if[i].valid), - .ready_out (operands_if[i].ready) - ); - end + end endmodule diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 2206df25..1c5f3676 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -51,7 +51,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #( for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs; - VX_ibuffer_if staging_if(); wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; @@ -84,10 +83,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #( reg [DATAW-1:0] data_out_r; reg valid_out_r; + wire ready_out; wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; wire deps_ready = (& ready_masks); + wire valid_in = ibuffer_if[i].valid && deps_ready; + wire ready_in = ~valid_out_r && deps_ready; + wire [DATAW-1:0] data_in = ibuffer_if[i].data; + + assign ready_out = scoreboard_if[i].ready; + always @(posedge clk) begin if (reset) begin valid_out_r <= 0; @@ -97,40 +103,25 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 
0; end if (~valid_out_r) begin - valid_out_r <= ibuffer_if[i].valid && deps_ready; - end else if (staging_if.ready) begin - if (staging_if.data.wb) begin - inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1; + valid_out_r <= valid_in; + end else if (ready_out) begin + if (scoreboard_if[i].data.wb) begin + inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1; `ifdef PERF_ENABLE - inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type; + inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type; `endif end valid_out_r <= 0; end end if (~valid_out_r) begin - data_out_r <= ibuffer_if[i].data; + data_out_r <= data_in; end end - assign ibuffer_if[i].ready = ~valid_out_r && deps_ready; - assign staging_if.valid = valid_out_r; - assign staging_if.data = data_out_r; - - VX_elastic_buffer #( - .DATAW (DATAW), - .SIZE (0), - .OUT_REG (2) - ) out_buf ( - .clk (clk), - .reset (reset), - .valid_in (staging_if.valid), - .ready_in (staging_if.ready), - .data_in (staging_if.data), - .data_out (scoreboard_if[i].data), - .valid_out (scoreboard_if[i].valid), - .ready_out (scoreboard_if[i].ready) - ); + assign ibuffer_if[i].ready = ready_in; + assign scoreboard_if[i].valid = valid_out_r; + assign scoreboard_if[i].data = data_out_r; `ifdef SIMULATION reg [31:0] timeout_ctr; From c7a81d1493b5e0420546b25c9465a64321418d20 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 20 Dec 2023 11:57:44 -0800 Subject: [PATCH 2/4] adding sockets support to simx and cache subsystem refactoring minor update minor update minor updates --- hw/rtl/VX_cluster.sv | 66 +++++++++- hw/rtl/VX_config.vh | 9 +- hw/rtl/VX_define.vh | 18 ++- hw/rtl/VX_gpu_pkg.sv | 9 +- hw/rtl/VX_socket.sv | 45 +------ hw/rtl/VX_types.vh | 44 +++---- hw/rtl/core/VX_core.sv | 18 +-- hw/rtl/core/VX_csr_data.sv | 15 +-- hw/rtl/core/VX_issue.sv | 2 +- hw/rtl/core/VX_lsu_unit.sv | 8 +- hw/rtl/interfaces/VX_pipeline_perf_if.sv | 25 ++-- 
runtime/common/utils.cpp | 34 +---- runtime/simx/vortex.cpp | 2 +- sim/simx/Makefile | 2 +- sim/simx/arch.h | 10 +- sim/simx/cluster.cpp | 158 ++++++++--------------- sim/simx/cluster.h | 41 +++--- sim/simx/core.cpp | 111 ++++++++-------- sim/simx/core.h | 26 ++-- sim/simx/exe_unit.cpp | 38 +++--- sim/simx/main.cpp | 8 +- sim/simx/socket.cpp | 146 +++++++++++++++++++++ sim/simx/socket.h | 87 +++++++++++++ sim/simx/types.h | 7 + 24 files changed, 541 insertions(+), 388 deletions(-) create mode 100644 sim/simx/socket.cpp create mode 100644 sim/simx/socket.h diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 90076673..6de47c5f 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -85,8 +85,8 @@ module VX_cluster import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) - ) per_socket_mem_bus_if[`NUM_SOCKETS](); + .TAG_WIDTH (L1_MEM_TAG_WIDTH) + ) l1_mem_bus_if[2](); `RESET_RELAY (l2_reset, reset); @@ -102,7 +102,7 @@ module VX_cluster import VX_gpu_pkg::*; #( .MSHR_SIZE (`L2_MSHR_SIZE), .MRSQ_SIZE (`L2_MRSQ_SIZE), .MREQ_SIZE (`L2_MREQ_SIZE), - .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH), + .TAG_WIDTH (L1_MEM_TAG_WIDTH), .WRITE_ENABLE (1), .UUID_WIDTH (`UUID_WIDTH), .CORE_OUT_REG (2), @@ -115,10 +115,65 @@ module VX_cluster import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE .cache_perf (perf_l2cache), `endif - .core_bus_if (per_socket_mem_bus_if), + .core_bus_if (l1_mem_bus_if), .mem_bus_if (mem_bus_if) ); + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) + ) per_socket_icache_mem_bus_if[`NUM_SOCKETS](); + + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) + ) per_socket_dcache_mem_bus_if[`NUM_SOCKETS](); + + VX_mem_bus_if #( + .DATA_SIZE (ICACHE_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH) + ) icache_mem_bus_if[1](); + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH) + ) dcache_mem_bus_if[1](); + + 
`RESET_RELAY (l1_mem_arb_reset, reset); + + VX_mem_arb #( + .NUM_INPUTS (`NUM_SOCKETS), + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH), + .TAG_SEL_IDX (1), // Skip 0 for NC flag + .ARBITER ("R"), + .OUT_REG_REQ (2), + .OUT_REG_RSP (2) + ) icache_mem_arb ( + .clk (clk), + .reset (l1_mem_arb_reset), + .bus_in_if (per_socket_icache_mem_bus_if), + .bus_out_if (icache_mem_bus_if) + ); + + VX_mem_arb #( + .NUM_INPUTS (`NUM_SOCKETS), + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH), + .TAG_SEL_IDX (1), // Skip 0 for NC flag + .ARBITER ("R"), + .OUT_REG_REQ (2), + .OUT_REG_RSP (2) + ) dcache_mem_arb ( + .clk (clk), + .reset (l1_mem_arb_reset), + .bus_in_if (per_socket_dcache_mem_bus_if), + .bus_out_if (dcache_mem_bus_if) + ); + + `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH); + `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH); + /////////////////////////////////////////////////////////////////////////// wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak; @@ -155,7 +210,8 @@ module VX_cluster import VX_gpu_pkg::*; #( .dcr_bus_if (socket_dcr_bus_if), - .mem_bus_if (per_socket_mem_bus_if[i]), + .icache_mem_bus_if (per_socket_icache_mem_bus_if[i]), + .dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]), `ifdef GBAR_ENABLE .gbar_bus_if (per_socket_gbar_bus_if[i]), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 3af544c6..d35d906b 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -262,7 +262,10 @@ `endif // LSU Duplicate Address Check -`ifdef LSU_DUP +`ifndef LSU_DUP_DISABLE +`define LSU_DUP_ENABLE +`endif +`ifdef LSU_DUP_ENABLE `define LSU_DUP_ENABLED 1 `else `define LSU_DUP_ENABLED 0 @@ -381,7 +384,7 @@ // Number of Cache Units `ifndef NUM_ICACHES -`define NUM_ICACHES `UP(`NUM_CORES / 4) +`define NUM_ICACHES `UP(`SOCKET_SIZE / 4) `endif // Cache Size @@ -430,7 +433,7 @@ // Number of Cache Units 
`ifndef NUM_DCACHES -`define NUM_DCACHES `UP(`NUM_CORES / 4) +`define NUM_DCACHES `UP(`SOCKET_SIZE / 4) `endif // Cache Size diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 95d206ce..f39e7fea 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -410,8 +410,22 @@ assign dst = src; \ end -`define TO_DISPATCH_DATA(data, tid) \ - {data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data} +`define TO_DISPATCH_DATA(data, tid) { \ + data.uuid, \ + data.wis, \ + data.tmask, \ + data.op_type, \ + data.op_mod, \ + data.wb, \ + data.use_PC, \ + data.use_imm, \ + data.PC, \ + data.imm, \ + data.rd, \ + tid, \ + data.rs1_data, \ + data.rs2_data, \ + data.rs3_data} /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 668b53ee..b32b9600 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -141,8 +141,9 @@ package VX_gpu_pkg; /////////////////////////////// L1 Parameters ///////////////////////////// - localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2)); + localparam ICACHE_MEM_ARB_TAG_WIDTH = (ICACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS)); + localparam DCACHE_MEM_ARB_TAG_WIDTH = (DCACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS)); + localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_ARB_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH); /////////////////////////////// L2 Parameters ///////////////////////////// @@ -150,10 +151,10 @@ package VX_gpu_pkg; localparam L2_WORD_SIZE = `L1_LINE_SIZE; // Input request size - localparam L2_NUM_REQS = `NUM_SOCKETS; + localparam L2_NUM_REQS = 2; // Core request tag bits - localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH; + localparam L2_TAG_WIDTH = L1_MEM_TAG_WIDTH; // Memory request data bits localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 
8); diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 139598d9..74a074d1 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -30,7 +30,8 @@ module VX_socket import VX_gpu_pkg::*; #( VX_dcr_bus_if.slave dcr_bus_if, // Memory - VX_mem_bus_if.master mem_bus_if, + VX_mem_bus_if.master icache_mem_bus_if, + VX_mem_bus_if.master dcache_mem_bus_if, `ifdef GBAR_ENABLE // Barrier @@ -76,47 +77,7 @@ module VX_socket import VX_gpu_pkg::*; #( assign mem_perf_tmp_if.mem = mem_perf_if.mem; `endif - VX_mem_bus_if #( - .DATA_SIZE (ICACHE_LINE_SIZE), - .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) - ) icache_mem_bus_if(); - - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_LINE_SIZE), - .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) - ) dcache_mem_bus_if(); - - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_TAG_WIDTH) - ) cache_mem_bus_if[2](); - - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) - ) mem_bus_tmp_if[1](); - - `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH); - `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - - `RESET_RELAY (mem_arb_reset, reset); - - VX_mem_arb #( - .NUM_INPUTS (2), - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_TAG_WIDTH), - .TAG_SEL_IDX (1), // Skip 0 for NC flag - .ARBITER ("R"), - .OUT_REG_REQ (2), - .OUT_REG_RSP (2) - ) mem_arb ( - .clk (clk), - .reset (mem_arb_reset), - .bus_in_if (cache_mem_bus_if), - .bus_out_if (mem_bus_tmp_if) - ); - - `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]); + /////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 4fb03783..a5044ccf 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -78,33 +78,25 @@ `define VX_CSR_MPM_IBUF_ST_H 12'hB85 `define VX_CSR_MPM_SCRB_ST 12'hB06 `define VX_CSR_MPM_SCRB_ST_H 12'hB86 -`define VX_CSR_MPM_ALU_ST 12'hB07 -`define 
VX_CSR_MPM_ALU_ST_H 12'hB87 -`define VX_CSR_MPM_LSU_ST 12'hB08 -`define VX_CSR_MPM_LSU_ST_H 12'hB88 -`define VX_CSR_MPM_FPU_ST 12'hB09 -`define VX_CSR_MPM_FPU_ST_H 12'hB89 -`define VX_CSR_MPM_SFU_ST 12'hB0A -`define VX_CSR_MPM_SFU_ST_H 12'hB8A -`define VX_CSR_MPM_SCRB_ALU 12'hB0B -`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B -`define VX_CSR_MPM_SCRB_FPU 12'hB0C -`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C -`define VX_CSR_MPM_SCRB_LSU 12'hB0D -`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D -`define VX_CSR_MPM_SCRB_SFU 12'hB0E -`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E +`define VX_CSR_MPM_SCRB_ALU 12'hB07 +`define VX_CSR_MPM_SCRB_ALU_H 12'hB87 +`define VX_CSR_MPM_SCRB_FPU 12'hB08 +`define VX_CSR_MPM_SCRB_FPU_H 12'hB88 +`define VX_CSR_MPM_SCRB_LSU 12'hB09 +`define VX_CSR_MPM_SCRB_LSU_H 12'hB89 +`define VX_CSR_MPM_SCRB_SFU 12'hB0A +`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A // PERF: memory -`define VX_CSR_MPM_IFETCHES 12'hB0F -`define VX_CSR_MPM_IFETCHES_H 12'hB8F -`define VX_CSR_MPM_LOADS 12'hB10 -`define VX_CSR_MPM_LOADS_H 12'hB90 -`define VX_CSR_MPM_STORES 12'hB11 -`define VX_CSR_MPM_STORES_H 12'hB91 -`define VX_CSR_MPM_IFETCH_LT 12'hB12 -`define VX_CSR_MPM_IFETCH_LT_H 12'hB92 -`define VX_CSR_MPM_LOAD_LT 12'hB13 -`define VX_CSR_MPM_LOAD_LT_H 12'hB93 +`define VX_CSR_MPM_IFETCHES 12'hB0B +`define VX_CSR_MPM_IFETCHES_H 12'hB8B +`define VX_CSR_MPM_LOADS 12'hB0C +`define VX_CSR_MPM_LOADS_H 12'hB8C +`define VX_CSR_MPM_STORES 12'hB0D +`define VX_CSR_MPM_STORES_H 12'hB8D +`define VX_CSR_MPM_IFETCH_LT 12'hB0E +`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E +`define VX_CSR_MPM_LOAD_LT 12'hB0F +`define VX_CSR_MPM_LOAD_LT_H 12'hB8F // Machine Performance-monitoring memory counters // PERF: icache diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 5aba3075..4d3ce297 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -273,23 +273,23 @@ module VX_core import VX_gpu_pkg::*; #( wire [1:0] perf_icache_pending_read_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] 
perf_dcache_pending_read_cycle; - reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; - reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; + reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; + reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; - reg [`PERF_CTR_BITS-1:0] perf_ifetches; - reg [`PERF_CTR_BITS-1:0] perf_loads; - reg [`PERF_CTR_BITS-1:0] perf_stores; + reg [`PERF_CTR_BITS-1:0] perf_ifetches; + reg [`PERF_CTR_BITS-1:0] perf_loads; + reg [`PERF_CTR_BITS-1:0] perf_stores; - wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready; - wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready; + wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready; + wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r; wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r; wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire; for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin - assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; - assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; + assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw; + assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw; assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; end diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 6d7c41f8..1b370260 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -195,19 +195,6 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); 
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; `VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0]; - `VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0]; - `VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]); - `ifdef EXT_F_ENABLE - `VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0]; - `VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]); - `else - `VX_CSR_MPM_FPU_ST : read_data_ro_r = '0; - `VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0; - `endif - `VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0]; - `VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0]; `ifdef EXT_F_ENABLE @@ -220,7 +207,7 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0]; `VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; + `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; // PERF: memory `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); diff --git a/hw/rtl/core/VX_issue.sv 
b/hw/rtl/core/VX_issue.sv index 8d0eaff6..912abc97 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -84,7 +84,7 @@ module VX_issue #( .clk (clk), .reset (dispatch_reset), `ifdef PERF_ENABLE - .perf_stalls (perf_issue_if.dsp_stalls), + `UNUSED_PIN (perf_stalls), `endif .operands_if (operands_if), .alu_dispatch_if(alu_dispatch_if), diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index 1e0a09b8..5a57db4c 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( // detect duplicate addresses wire lsu_is_dup; -`ifdef LSU_DUP +`ifdef LSU_DUP_ENABLE if (NUM_LANES > 1) begin wire [NUM_LANES-2:0] addr_matches; for (genvar i = 0; i < (NUM_LANES-1); ++i) begin @@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( assign mem_req_tag = { execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr - `ifdef LSU_DUP + `ifdef LSU_DUP_ENABLE , lsu_is_dup `endif }; @@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( wire [PID_WIDTH-1:0] rsp_pid; wire rsp_is_dup; -`ifndef LSU_DUP +`ifndef LSU_DUP_ENABLE assign rsp_is_dup = 0; `endif assign { rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr - `ifdef LSU_DUP + `ifdef LSU_DUP_ENABLE , rsp_is_dup `endif } = mem_rsp_tag; diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 66225336..2ae0f678 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -14,18 +14,17 @@ `include "VX_define.vh" interface VX_pipeline_perf_if (); - wire [`PERF_CTR_BITS-1:0] sched_idles; - wire [`PERF_CTR_BITS-1:0] sched_stalls; - wire [`PERF_CTR_BITS-1:0] ibf_stalls; - wire [`PERF_CTR_BITS-1:0] scb_stalls; - wire 
[`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; - wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS]; + wire [`PERF_CTR_BITS-1:0] sched_idles; + wire [`PERF_CTR_BITS-1:0] sched_stalls; + wire [`PERF_CTR_BITS-1:0] ibf_stalls; + wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; - wire [`PERF_CTR_BITS-1:0] ifetches; - wire [`PERF_CTR_BITS-1:0] loads; - wire [`PERF_CTR_BITS-1:0] stores; - wire [`PERF_CTR_BITS-1:0] ifetch_latency; - wire [`PERF_CTR_BITS-1:0] load_latency; + wire [`PERF_CTR_BITS-1:0] ifetches; + wire [`PERF_CTR_BITS-1:0] loads; + wire [`PERF_CTR_BITS-1:0] stores; + wire [`PERF_CTR_BITS-1:0] ifetch_latency; + wire [`PERF_CTR_BITS-1:0] load_latency; modport schedule ( output sched_idles, @@ -35,8 +34,7 @@ interface VX_pipeline_perf_if (); modport issue ( output ibf_stalls, output scb_stalls, - output scb_uses, - output dsp_stalls + output scb_uses ); modport slave ( @@ -45,7 +43,6 @@ interface VX_pipeline_perf_if (); input ibf_stalls, input scb_stalls, input scb_uses, - input dsp_stalls, input ifetches, input loads, input stores, diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index c0199a86..5f472c84 100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -204,10 +204,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t sched_stalls = 0; uint64_t ibuffer_stalls = 0; uint64_t scrb_stalls = 0; - uint64_t lsu_stalls = 0; - uint64_t fpu_stalls = 0; - uint64_t alu_stalls = 0; - uint64_t sfu_stalls = 0; uint64_t scrb_alu = 0; uint64_t scrb_fpu = 0; uint64_t scrb_lsu = 0; @@ -310,34 +306,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { calcAvgPercent(scrb_sfu_per_core, scrb_total)); scrb_stalls += scrb_stalls_per_core; } - // alu_stalls - { - uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); - alu_stalls += 
alu_stalls_per_core; - } - // lsu_stalls - { - uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); - lsu_stalls += lsu_stalls_per_core; - } - // fpu_stalls - { - uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); - fpu_stalls += fpu_stalls_per_core; - } - // sfu_stalls - { - uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core); - sfu_stalls += sfu_stalls_per_core; - } // PERF: memory // ifetches { - uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); + uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES); if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); ifetches += ifetches_per_core; @@ -464,10 +436,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { calcAvgPercent(scrb_fpu, scrb_total), calcAvgPercent(scrb_lsu, scrb_total), calcAvgPercent(scrb_sfu, scrb_total)); - fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); - fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls); - fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); - fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls); fprintf(stream, "PERF: ifetches=%ld\n", ifetches); fprintf(stream, "PERF: loads=%ld\n", loads); fprintf(stream, "PERF: stores=%ld\n", stores); diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 3b4cb171..b7b9cdcb 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -87,7 +87,7 @@ private: class vx_device { public: vx_device() - : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS) + : arch_(NUM_THREADS, 
NUM_WARPS, NUM_CORES) , ram_(RAM_PAGE_SIZE) , processor_(arch_) , global_mem_( diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 42823205..bb67dbb5 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -15,7 +15,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp +SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp # Debugigng ifdef DEBUG diff --git a/sim/simx/arch.h b/sim/simx/arch.h index ab6ac4a3..099fbedd 100644 --- a/sim/simx/arch.h +++ b/sim/simx/arch.h @@ -28,6 +28,7 @@ private: uint16_t num_warps_; uint16_t num_cores_; uint16_t num_clusters_; + uint16_t socket_size_; uint16_t vsize_; uint16_t num_regs_; uint16_t num_csrs_; @@ -35,11 +36,12 @@ private: uint16_t ipdom_size_; public: - Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters) + Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores) : num_threads_(num_threads) , num_warps_(num_warps) , num_cores_(num_cores) - , num_clusters_(num_clusters) + , num_clusters_(NUM_CLUSTERS) + , socket_size_(SOCKET_SIZE) , vsize_(16) , num_regs_(32) , num_csrs_(4096) @@ -82,6 +84,10 @@ public: uint16_t num_clusters() const { return num_clusters_; } + + uint16_t socket_size() const { + return socket_size_; + } }; } \ No newline at end of file diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index d7104915..7f690fb6 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -24,14 +24,38 @@ Cluster::Cluster(const SimContext& ctx, , mem_req_port(this) , mem_rsp_port(this) , cluster_id_(cluster_id) - , cores_(arch.num_cores()) + , sockets_(NUM_SOCKETS) , 
barriers_(arch.num_barriers(), 0) - , sharedmems_(arch.num_cores()) , processor_(processor) + , cores_per_socket_(arch.socket_size()) { - auto num_cores = arch.num_cores(); - char sname[100]; + + auto sockets_per_cluster = sockets_.size(); + + // create sockets + + snprintf(sname, 100, "cluster%d-icache-arb", cluster_id); + auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); + + snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id); + auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); + + for (uint32_t i = 0; i < sockets_per_cluster; ++i) { + uint32_t socket_id = cluster_id * sockets_per_cluster + i; + auto socket = Socket::Create(socket_id, this, arch, dcrs); + + socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i)); + icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port); + + socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i)); + dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port); + + sockets_.at(i) = socket; + } + + // Create l2cache + snprintf(sname, 100, "cluster%d-l2cache", cluster_id); l2cache_ = CacheSim::Create(sname, CacheSim::Config{ !L2_ENABLED, @@ -42,7 +66,7 @@ Cluster::Cluster(const SimContext& ctx, log2ceil(L2_NUM_BANKS), // B XLEN, // address bits 1, // number of ports - 5, // request size + 2, // request size true, // write-through false, // write response L2_MSHR_SIZE, // mshr @@ -52,87 +76,11 @@ Cluster::Cluster(const SimContext& ctx, l2cache_->MemReqPort.bind(&this->mem_req_port); this->mem_rsp_port.bind(&l2cache_->MemRspPort); - snprintf(sname, 100, "cluster%d-icaches", cluster_id); - icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{ - !ICACHE_ENABLED, - log2ceil(ICACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // L - log2ceil(sizeof(uint32_t)), // W - log2ceil(ICACHE_NUM_WAYS),// A - 1, // B - XLEN, // address bits - 1, // number of ports - 1, // number of inputs - true, // write-through - 
false, // write response - (uint8_t)arch.num_warps(), // mshr - 2, // pipeline latency - }); + icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0)); + l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0)); - icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0)); - l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort); - - snprintf(sname, 100, "cluster%d-dcaches", cluster_id); - dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{ - !DCACHE_ENABLED, - log2ceil(DCACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // L - log2ceil(sizeof(Word)), // W - log2ceil(DCACHE_NUM_WAYS),// A - log2ceil(DCACHE_NUM_BANKS), // B - XLEN, // address bits - 1, // number of ports - DCACHE_NUM_BANKS, // number of inputs - true, // write-through - false, // write response - DCACHE_MSHR_SIZE, // mshr - 4, // pipeline latency - }); - - dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1)); - l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort); - - /////////////////////////////////////////////////////////////////////////// - - // create shared memory blocks - for (uint32_t i = 0; i < num_cores; ++i) { - snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i); - sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{ - (1 << SMEM_LOG_SIZE), - sizeof(Word), - NUM_LSU_LANES, - NUM_LSU_LANES, - false - }); - } - - // create cores - - for (uint32_t i = 0; i < num_cores; ++i) { - uint32_t core_id = cluster_id * num_cores + i; - cores_.at(i) = Core::Create(core_id, - this, - arch, - dcrs, - sharedmems_.at(i)); - - cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); - icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); - - for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) { - snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j); - auto smem_demux = SMemDemux::Create(sname); - - cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn); - 
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j)); - - smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j)); - dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC); - - smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j)); - sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM); - } - } + dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1)); + l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0)); } Cluster::~Cluster() { @@ -150,14 +98,14 @@ void Cluster::tick() { } void Cluster::attach_ram(RAM* ram) { - for (auto core : cores_) { - core->attach_ram(ram); + for (auto& socket : sockets_) { + socket->attach_ram(ram); } } bool Cluster::running() const { - for (auto& core : cores_) { - if (core->running()) + for (auto& socket : sockets_) { + if (socket->running()) return true; } return false; @@ -166,9 +114,9 @@ bool Cluster::running() const { bool Cluster::check_exit(Word* exitcode, bool riscv_test) const { bool done = true; Word exitcode_ = 0; - for (auto& core : cores_) { + for (auto& socket : sockets_) { Word ec; - if (core->check_exit(&ec, riscv_test)) { + if (socket->check_exit(&ec, riscv_test)) { exitcode_ |= ec; } else { done = false; @@ -181,36 +129,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const { void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) { auto& barrier = barriers_.at(bar_id); - uint32_t local_core_id = core_id % cores_.size(); + auto sockets_per_cluster = sockets_.size(); + auto cores_per_socket = cores_per_socket_; + + uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket; + uint32_t local_core_id = core_id % cores_per_cluster; barrier.set(local_core_id); DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id); if (barrier.count() == (size_t)count) { // resume all suspended cores - for (uint32_t i = 0; i < cores_.size(); ++i) { - if (barrier.test(i)) { - DP(3, "*** Resume core #" << i << " at barrier #" << bar_id); - 
cores_.at(i)->resume(); + for (uint32_t s = 0; s < sockets_per_cluster; ++s) { + for (uint32_t c = 0; c < cores_per_socket; ++c) { + uint32_t i = s * cores_per_socket + c; + if (barrier.test(i)) { + DP(3, "*** Resume core #" << i << " at barrier #" << bar_id); + sockets_.at(s)->resume(c); + } } } barrier.reset(); } } -ProcessorImpl* Cluster::processor() const { - return processor_; -} - Cluster::PerfStats Cluster::perf_stats() const { Cluster::PerfStats perf; - perf.icache = icaches_->perf_stats(); - perf.dcache = dcaches_->perf_stats(); perf.l2cache = l2cache_->perf_stats(); - - for (auto sharedmem : sharedmems_) { - perf.sharedmem += sharedmem->perf_stats(); - } - return perf; } \ No newline at end of file diff --git a/sim/simx/cluster.h b/sim/simx/cluster.h index f91241e9..2547486d 100644 --- a/sim/simx/cluster.h +++ b/sim/simx/cluster.h @@ -17,8 +17,8 @@ #include "dcrs.h" #include "arch.h" #include "cache_cluster.h" -#include "shared_mem.h" #include "core.h" +#include "socket.h" #include "constants.h" namespace vortex { @@ -27,17 +27,11 @@ class ProcessorImpl; class Cluster : public SimObject { public: - struct PerfStats { - CacheSim::PerfStats icache; - CacheSim::PerfStats dcache; - SharedMem::PerfStats sharedmem; - CacheSim::PerfStats l2cache; + struct PerfStats { + CacheSim::PerfStats l2cache; PerfStats& operator+=(const PerfStats& rhs) { - this->icache += rhs.icache; - this->dcache += rhs.dcache; - this->sharedmem += rhs.sharedmem; - this->l2cache += rhs.l2cache; + this->l2cache += rhs.l2cache; return *this; } }; @@ -53,6 +47,14 @@ public: ~Cluster(); + uint32_t id() const { + return cluster_id_; + } + + ProcessorImpl* processor() const { + return processor_; + } + void reset(); void tick(); @@ -65,22 +67,15 @@ public: void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); - ProcessorImpl* processor() const; - Cluster::PerfStats perf_stats() const; private: - uint32_t cluster_id_; - std::vector cores_; - std::vector barriers_; - CacheSim::Ptr 
l2cache_; - CacheCluster::Ptr icaches_; - CacheCluster::Ptr dcaches_; - std::vector sharedmems_; - CacheCluster::Ptr tcaches_; - CacheCluster::Ptr ocaches_; - CacheCluster::Ptr rcaches_; - ProcessorImpl* processor_; + uint32_t cluster_id_; + std::vector sockets_; + std::vector barriers_; + CacheSim::Ptr l2cache_; + ProcessorImpl* processor_; + uint32_t cores_per_socket_; }; } // namespace vortex \ No newline at end of file diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 49c2ec35..7a549ebd 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -21,18 +21,14 @@ #include "mem.h" #include "decode.h" #include "core.h" +#include "socket.h" #include "debug.h" #include "constants.h" #include "processor_impl.h" using namespace vortex; -Core::Core(const SimContext& ctx, - uint32_t core_id, - Cluster* cluster, - const Arch &arch, - const DCRS &dcrs, - SharedMem::Ptr sharedmem) +Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs) : SimObject(ctx, "core") , icache_req_ports(1, this) , icache_rsp_ports(1, this) @@ -50,12 +46,12 @@ Core::Core(const SimContext& ctx, , operands_(ISSUE_WIDTH) , dispatchers_((uint32_t)ExeType::ExeTypeCount) , exe_units_((uint32_t)ExeType::ExeTypeCount) - , sharedmem_(sharedmem) + , smem_demuxs_(NUM_LSU_LANES) , fetch_latch_("fetch") , decode_latch_("decode") , pending_icache_(arch_.num_warps()) , csrs_(arch.num_warps()) - , cluster_(cluster) + , socket_(socket) , commit_arbs_(ISSUE_WIDTH) { char sname[100]; @@ -72,6 +68,27 @@ Core::Core(const SimContext& ctx, operands_.at(i) = SimPlatform::instance().create_object(); } + // initialize shared memory + shared_mem_ = SharedMem::Create(sname, SharedMem::Config{ + (1 << SMEM_LOG_SIZE), + sizeof(Word), + NUM_LSU_LANES, + NUM_LSU_LANES, + false + }); + for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) { + snprintf(sname, 100, "smem_demux%d_%d", core_id, i); + auto smem_demux = SMemDemux::Create(sname); + + 
smem_demux->ReqDC.bind(&dcache_req_ports.at(i)); + dcache_rsp_ports.at(i).bind(&smem_demux->RspDC); + + smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i)); + shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM); + + smem_demuxs_.at(i) = smem_demux; + } + // initialize dispatchers dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES); dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES); @@ -241,13 +258,6 @@ void Core::decode() { stalled_warps_.reset(trace->wid); } - // update perf counters - uint32_t active_threads = trace->tmask.count(); - if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD) - perf_stats_.loads += active_threads; - if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE) - perf_stats_.stores += active_threads; - DT(3, "pipeline-decode: " << *trace); // insert to ibuffer @@ -394,7 +404,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) { if (is_global) { // global barrier handling if (barrier.count() == active_warps_.count()) { - cluster_->barrier(bar_idx, count, core_id_); + socket_->barrier(bar_idx, count, core_id_); barrier.reset(); } } else { @@ -431,7 +441,7 @@ AddrType Core::get_addr_type(uint64_t addr) { void Core::dcache_read(void *data, uint64_t addr, uint32_t size) { auto type = this->get_addr_type(addr); if (type == AddrType::Shared) { - sharedmem_->read(data, addr, size); + shared_mem_->read(data, addr, size); } else { mmu_.read(data, addr, size, 0); } @@ -446,7 +456,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) { this->writeToStdOut(data, addr, size); } else { if (type == AddrType::Shared) { - sharedmem_->write(data, addr, size); + shared_mem_->write(data, addr, size); } else { mmu_.write(data, addr, size, 0); } @@ -554,16 +564,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case 
VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; - case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; - case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32; - case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff; - case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32; - case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff; - case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32; - case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff; - case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32; - case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff; - case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32; + case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; + case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32; case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff; case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32; case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff; @@ -572,7 +574,6 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32; case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff; case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32; - case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff; case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff; @@ -586,27 +587,29 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { } } break; case VX_DCR_MPM_CLASS_MEM: { - auto proc_perf = cluster_->processor()->perf_stats(); + auto proc_perf = socket_->cluster()->processor()->perf_stats(); + auto socket_perf = socket_->perf_stats(); + auto smem_perf 
= shared_mem_->perf_stats(); switch (addr) { - case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff; - case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32; - case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff; - case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32; - case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff; - case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32; + case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff; + case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32; + case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff; + case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32; + case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32; - case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff; - case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32; - case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff; - case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; - case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; - case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; - case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32; - case VX_CSR_MPM_DCACHE_MSHR_ST: return 
proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32; + case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff; + case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32; + case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff; + case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32; + case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32; + case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32; + case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32; + case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32; case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff; case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32; @@ -641,12 +644,12 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff; case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32; - case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; - case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; - case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff; - case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32; - case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; - case VX_CSR_MPM_SMEM_BANK_ST_H: return 
proc_perf.clusters.sharedmem.bank_stalls >> 32; + case VX_CSR_MPM_SMEM_READS: return smem_perf.reads & 0xffffffff; + case VX_CSR_MPM_SMEM_READS_H: return smem_perf.reads >> 32; + case VX_CSR_MPM_SMEM_WRITES: return smem_perf.writes & 0xffffffff; + case VX_CSR_MPM_SMEM_WRITES_H: return smem_perf.writes >> 32; + case VX_CSR_MPM_SMEM_BANK_ST: return smem_perf.bank_stalls & 0xffffffff; + case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32; } } break; } diff --git a/sim/simx/core.h b/sim/simx/core.h index cef60e81..343fdb31 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -40,7 +40,7 @@ namespace vortex { -class Cluster; +class Socket; using TraceSwitch = Mux; @@ -53,10 +53,6 @@ public: uint64_t sched_stalls; uint64_t ibuf_stalls; uint64_t scrb_stalls; - uint64_t alu_stalls; - uint64_t lsu_stalls; - uint64_t fpu_stalls; - uint64_t sfu_stalls; uint64_t scrb_alu; uint64_t scrb_fpu; uint64_t scrb_lsu; @@ -74,10 +70,6 @@ public: , sched_stalls(0) , ibuf_stalls(0) , scrb_stalls(0) - , alu_stalls(0) - , lsu_stalls(0) - , fpu_stalls(0) - , sfu_stalls(0) , scrb_alu(0) , scrb_fpu(0) , scrb_lsu(0) @@ -96,12 +88,7 @@ public: std::vector> dcache_req_ports; std::vector> dcache_rsp_ports; - Core(const SimContext& ctx, - uint32_t core_id, - Cluster* cluster, - const Arch &arch, - const DCRS &dcrs, - SharedMem::Ptr sharedmem); + Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs); ~Core(); @@ -119,6 +106,10 @@ public: return core_id_; } + Socket* socket() const { + return socket_; + } + const Arch& arch() const { return arch_; } @@ -181,7 +172,8 @@ private: std::vector operands_; std::vector dispatchers_; std::vector exe_units_; - SharedMem::Ptr sharedmem_; + SharedMem::Ptr shared_mem_; + std::vector smem_demuxs_; PipelineLatch fetch_latch_; PipelineLatch decode_latch_; @@ -201,7 +193,7 @@ private: PerfStats perf_stats_; - Cluster* cluster_; + Socket* socket_; std::vector commit_arbs_; diff --git a/sim/simx/exe_unit.cpp 
b/sim/simx/exe_unit.cpp index 2f3e79e3..4b5cb356 100644 --- a/sim/simx/exe_unit.cpp +++ b/sim/simx/exe_unit.cpp @@ -51,8 +51,7 @@ void AluUnit::tick() { assert(core_->stalled_warps_.test(trace->wid)); core_->stalled_warps_.reset(trace->wid); } - auto time = input.pop(); - core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); } } @@ -87,8 +86,7 @@ void FpuUnit::tick() { std::abort(); } DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace); - auto time = input.pop(); - core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); } } @@ -114,7 +112,7 @@ void LsuUnit::tick() { // handle dcache response for (uint32_t t = 0; t < num_lanes_; ++t) { - auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t); + auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn; if (dcache_rsp_port.empty()) continue; auto& mem_rsp = dcache_rsp_port.front(); @@ -136,7 +134,7 @@ void LsuUnit::tick() { // handle shared memory response for (uint32_t t = 0; t < num_lanes_; ++t) { - auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t); + auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t); if (smem_rsp_port.empty()) continue; auto& mem_rsp = smem_rsp_port.front(); @@ -184,8 +182,7 @@ void LsuUnit::tick() { fence_lock_ = true; DT(3, "fence-lock: " << *trace); // remove input - auto time = input.pop(); - core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); break; } @@ -213,7 +210,9 @@ void LsuUnit::tick() { auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask; matches += (addr0 == mem_addr); } + #ifdef LSU_DUP_ENABLE is_dup = (matches == trace->tmask.count()); + #endif } uint32_t addr_count; @@ -229,7 +228,7 @@ void LsuUnit::tick() { if (!trace->tmask.test(t0 + t)) continue; - auto& dcache_req_port = core_->dcache_req_ports.at(t); + auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn; auto mem_addr = trace_data->mem_addrs.at(t); auto type = 
core_->get_addr_type(mem_addr.addr); @@ -241,12 +240,16 @@ void LsuUnit::tick() { mem_req.cid = trace->cid; mem_req.uuid = trace->uuid; - dcache_req_port.send(mem_req, 2); + dcache_req_port.send(mem_req, 1); DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace); - ++pending_loads_; - ++core_->perf_stats_.loads; + if (is_write) { + ++core_->perf_stats_.stores; + } else { + ++core_->perf_stats_.loads; + ++pending_loads_; + } if (is_dup) break; } @@ -254,13 +257,11 @@ void LsuUnit::tick() { // do not wait on writes if (is_write) { pending_rd_reqs_.release(tag); - output.send(trace, 1); - ++core_->perf_stats_.stores; + output.send(trace, 1); } // remove input - auto time = input.pop(); - core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); break; // single block } @@ -318,10 +319,7 @@ void SfuUnit::tick() { core_->stalled_warps_.reset(trace->wid); } - auto time = input.pop(); - auto stalls = (SimPlatform::instance().cycles() - time); - - core_->perf_stats_.sfu_stalls += stalls; + input.pop(); break; // single block } diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 22d9c880..64031bb8 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -34,14 +34,13 @@ static void show_usage() { uint32_t num_threads = NUM_THREADS; uint32_t num_warps = NUM_WARPS; uint32_t num_cores = NUM_CORES; -uint32_t num_clusters = NUM_CLUSTERS; bool showStats = false;; bool riscv_test = false; const char* program = nullptr; static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) { + while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) { switch (c) { case 't': num_threads = atoi(optarg); @@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) { break; case 'c': num_cores = atoi(optarg); - break; - case 'g': - num_clusters = atoi(optarg); break; case 'r': 
riscv_test = true; @@ -88,7 +84,7 @@ int main(int argc, char **argv) { { // create processor configuation - Arch arch(num_threads, num_warps, num_cores, num_clusters); + Arch arch(num_threads, num_warps, num_cores); // create memory module RAM ram(RAM_PAGE_SIZE); diff --git a/sim/simx/socket.cpp b/sim/simx/socket.cpp new file mode 100644 index 00000000..fb620d62 --- /dev/null +++ b/sim/simx/socket.cpp @@ -0,0 +1,146 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "socket.h" +#include "cluster.h" + +using namespace vortex; + +Socket::Socket(const SimContext& ctx, + uint32_t socket_id, + Cluster* cluster, + const Arch &arch, const + DCRS &dcrs) + : SimObject(ctx, "socket") + , icache_mem_req_port(this) + , icache_mem_rsp_port(this) + , dcache_mem_req_port(this) + , dcache_mem_rsp_port(this) + , socket_id_(socket_id) + , cores_(arch.socket_size()) + , cluster_(cluster) +{ + auto cores_per_socket = cores_.size(); + + char sname[100]; + snprintf(sname, 100, "socket%d-icaches", socket_id); + icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{ + !ICACHE_ENABLED, + log2ceil(ICACHE_SIZE), // C + log2ceil(L1_LINE_SIZE), // L + log2ceil(sizeof(uint32_t)), // W + log2ceil(ICACHE_NUM_WAYS),// A + 1, // B + XLEN, // address bits + 1, // number of ports + 1, // number of inputs + true, // write-through + false, // write response + (uint8_t)arch.num_warps(), // mshr + 2, // pipeline latency + }); + + icaches_->MemReqPort.bind(&icache_mem_req_port); + icache_mem_rsp_port.bind(&icaches_->MemRspPort); + + snprintf(sname, 100, "socket%d-dcaches", socket_id); + dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{ + !DCACHE_ENABLED, + log2ceil(DCACHE_SIZE), // C + log2ceil(L1_LINE_SIZE), // L + log2ceil(sizeof(Word)), // W + log2ceil(DCACHE_NUM_WAYS),// A + log2ceil(DCACHE_NUM_BANKS), // B + XLEN, // address bits + 1, // number of ports + DCACHE_NUM_BANKS, // number of inputs + true, // write-through + false, // write response + DCACHE_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + + dcaches_->MemReqPort.bind(&dcache_mem_req_port); + dcache_mem_rsp_port.bind(&dcaches_->MemRspPort); + + // create cores + + for (uint32_t i = 0; i < cores_per_socket; ++i) { + uint32_t core_id = socket_id * cores_per_socket + i; + cores_.at(i) = Core::Create(core_id, this, arch, dcrs); + + 
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); + icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); + + for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) { + cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j)); + dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j)); + } + } +} + +Socket::~Socket() { + //-- +} + +void Socket::reset() { + //-- +} + +void Socket::tick() { + //-- +} + +void Socket::attach_ram(RAM* ram) { + for (auto core : cores_) { + core->attach_ram(ram); + } +} + +bool Socket::running() const { + for (auto& core : cores_) { + if (core->running()) + return true; + } + return false; +} + +bool Socket::check_exit(Word* exitcode, bool riscv_test) const { + bool done = true; + Word exitcode_ = 0; + for (auto& core : cores_) { + Word ec; + if (core->check_exit(&ec, riscv_test)) { + exitcode_ |= ec; + } else { + done = false; + } + } + *exitcode = exitcode_; + return done; +} + +void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) { + cluster_->barrier(bar_id, count, core_id); /* core_id is the id the core was created with (socket_id * cores_per_socket + i), so the socket base must not be added again; Cluster::barrier reduces it modulo cores-per-cluster */ +} + +void Socket::resume(uint32_t core_index) { + cores_.at(core_index)->resume(); +} + +Socket::PerfStats Socket::perf_stats() const { + Socket::PerfStats perf; + perf.icache = icaches_->perf_stats(); + perf.dcache = dcaches_->perf_stats(); + return perf; +} \ No newline at end of file diff --git a/sim/simx/socket.h b/sim/simx/socket.h new file mode 100644 index 00000000..5c94c31f --- /dev/null +++ b/sim/simx/socket.h @@ -0,0 +1,87 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "dcrs.h"
+#include "arch.h"
+#include "cache_cluster.h"
+#include "shared_mem.h"
+#include "core.h"
+#include "constants.h"
+
+namespace vortex {
+
+class Cluster;
+
+class Socket : public SimObject<Socket> { // a group of cores sharing one icache cluster and one dcache cluster
+public:
+  struct PerfStats { // cache counters reported by perf_stats()
+    CacheSim::PerfStats icache;
+    CacheSim::PerfStats dcache;
+
+    PerfStats& operator+=(const PerfStats& rhs) { // field-wise accumulation
+      this->icache += rhs.icache;
+      this->dcache += rhs.dcache;
+      return *this;
+    }
+  };
+
+  SimPort<MemReq> icache_mem_req_port; // memory-side ports of the instruction cache cluster
+  SimPort<MemRsp> icache_mem_rsp_port;
+
+  SimPort<MemReq> dcache_mem_req_port; // memory-side ports of the data cache cluster
+  SimPort<MemRsp> dcache_mem_rsp_port;
+
+  Socket(const SimContext& ctx,
+         uint32_t socket_id,
+         Cluster* cluster,
+         const Arch &arch,
+         const DCRS &dcrs);
+
+  ~Socket();
+
+  uint32_t id() const { // id given at construction
+    return socket_id_;
+  }
+
+  Cluster* cluster() const { // owning cluster given at construction (not owned by the socket)
+    return cluster_;
+  }
+
+  void reset();
+
+  void tick();
+
+  void attach_ram(RAM* ram); // forwards the RAM to every core
+
+  bool running() const; // true while at least one core is running
+
+  bool check_exit(Word* exitcode, bool riscv_test) const; // true when every core's check_exit() is true; *exitcode is always written
+
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); // core_id is socket-local
+
+  void resume(uint32_t core_id); // core_id is the socket-local index into cores_
+
+  Socket::PerfStats perf_stats() const;
+
+private:
+  uint32_t socket_id_;
+  std::vector<Core::Ptr> cores_;
+  CacheCluster::Ptr icaches_;
+  CacheCluster::Ptr dcaches_;
+  Cluster* cluster_;
+};
+
+} // namespace vortex
\ No newline at end of file
diff --git a/sim/simx/types.h b/sim/simx/types.h
index 6bba7f9c..d3fcfa1a 100644
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
   case RegType::Integer: os << "x";
break; case RegType::Float: os << "f"; break; case RegType::Vector: os << "v"; break; + default: assert(false); } return os; } @@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { case AluType::SYSCALL: os << "SYSCALL"; break; case AluType::IMUL: os << "IMUL"; break; case AluType::IDIV: os << "IDIV"; break; + default: assert(false); } return os; } @@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { case LsuType::LOAD: os << "LOAD"; break; case LsuType::STORE: os << "STORE"; break; case LsuType::FENCE: os << "FENCE"; break; + default: assert(false); } return os; } @@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) { case AddrType::Global: os << "Global"; break; case AddrType::Shared: os << "Shared"; break; case AddrType::IO: os << "IO"; break; + default: assert(false); } return os; } @@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { case FpuType::FDIV: os << "FDIV"; break; case FpuType::FSQRT: os << "FSQRT"; break; case FpuType::FCVT: os << "FCVT"; break; + default: assert(false); } return os; } @@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) { case SfuType::CSRRS: os << "CSRRS"; break; case SfuType::CSRRC: os << "CSRRC"; break; case SfuType::CMOV: os << "CMOV"; break; + default: assert(false); } return os; } @@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { switch (type) { case ArbiterType::Priority: os << "Priority"; break; case ArbiterType::RoundRobin: os << "RoundRobin"; break; + default: assert(false); } return os; } From e217bc2c23c673abc9d4ae67e7d5bed399396561 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 28 Dec 2023 12:12:11 -0800 Subject: [PATCH 3/4] adding tracking for SFU stalls --- hw/rtl/VX_cluster.sv | 145 ++-- hw/rtl/VX_define.vh | 869 ++++++++++++----------- hw/rtl/VX_gpu_pkg.sv | 5 +- 
hw/rtl/VX_socket.sv | 15 +- hw/rtl/VX_types.vh | 5 + hw/rtl/Vortex.sv | 24 +- hw/rtl/cache/VX_cache_cluster.sv | 5 +- hw/rtl/cache/VX_cache_define.vh | 12 + hw/rtl/core/VX_core.sv | 682 +++++++++--------- hw/rtl/core/VX_core_top.sv | 6 +- hw/rtl/core/VX_csr_data.sv | 171 ++--- hw/rtl/core/VX_csr_unit.sv | 2 - hw/rtl/core/VX_issue.sv | 3 +- hw/rtl/core/VX_scoreboard.sv | 136 ++-- hw/rtl/core/VX_sfu_unit.sv | 35 +- hw/rtl/interfaces/VX_pipeline_perf_if.sv | 9 +- runtime/common/utils.cpp | 81 ++- sim/simx/cluster.cpp | 21 +- sim/simx/cluster.h | 22 +- sim/simx/core.cpp | 95 ++- sim/simx/core.h | 18 +- sim/simx/processor.cpp | 4 +- sim/simx/processor_impl.h | 13 +- sim/simx/scoreboard.h | 7 +- sim/simx/socket.cpp | 19 +- sim/simx/socket.h | 12 +- tests/opencl/Makefile | 16 +- 27 files changed, 1266 insertions(+), 1166 deletions(-) diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 6de47c5f..2d220c24 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -43,7 +43,16 @@ module VX_cluster import VX_gpu_pkg::*; #( `ifdef SCOPE localparam scope_socket = 0; `SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS); -`endif +`endif + +`ifdef PERF_ENABLE + VX_mem_perf_if mem_perf_tmp_if(); + assign mem_perf_tmp_if.icache = 'x; + assign mem_perf_tmp_if.dcache = 'x; + assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; + assign mem_perf_tmp_if.smem = 'x; + assign mem_perf_tmp_if.mem = mem_perf_if.mem; +`endif `ifdef GBAR_ENABLE @@ -69,24 +78,68 @@ module VX_cluster import VX_gpu_pkg::*; #( .reset (gbar_reset), .gbar_bus_if (gbar_bus_if) ); -`endif -`ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_tmp_if(); - cache_perf_t perf_l2cache; - - assign mem_perf_tmp_if.icache = 'x; - assign mem_perf_tmp_if.dcache = 'x; - assign mem_perf_tmp_if.l2cache = perf_l2cache; - assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; - assign mem_perf_tmp_if.smem = 'x; - assign mem_perf_tmp_if.mem = mem_perf_if.mem; `endif VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH 
(L1_MEM_TAG_WIDTH) - ) l1_mem_bus_if[2](); + .DATA_SIZE (L2_WORD_SIZE), + .TAG_WIDTH (L2_TAG_WIDTH) + ) l2_mem_bus_if[L2_NUM_REQS](); + + VX_mem_bus_if #( + .DATA_SIZE (ICACHE_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) + ) per_socket_icache_mem_bus_if[`NUM_SOCKETS](); + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) + ) per_socket_dcache_mem_bus_if[`NUM_SOCKETS](); + + VX_mem_bus_if #( + .DATA_SIZE (ICACHE_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH) + ) icache_mem_bus_if[1](); + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH) + ) dcache_mem_bus_if[1](); + + `RESET_RELAY (l1_mem_arb_reset, reset); + + VX_mem_arb #( + .NUM_INPUTS (`NUM_SOCKETS), + .DATA_SIZE (ICACHE_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH), + .TAG_SEL_IDX (1), // Skip 0 for NC flag + .ARBITER ("R"), + .OUT_REG_REQ (2), + .OUT_REG_RSP (2) + ) icache_mem_arb ( + .clk (clk), + .reset (l1_mem_arb_reset), + .bus_in_if (per_socket_icache_mem_bus_if), + .bus_out_if (icache_mem_bus_if) + ); + + VX_mem_arb #( + .NUM_INPUTS (`NUM_SOCKETS), + .DATA_SIZE (DCACHE_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH), + .TAG_SEL_IDX (1), // Skip 0 for NC flag + .ARBITER ("R"), + .OUT_REG_REQ (2), + .OUT_REG_RSP (2) + ) dcache_mem_arb ( + .clk (clk), + .reset (l1_mem_arb_reset), + .bus_in_if (per_socket_dcache_mem_bus_if), + .bus_out_if (dcache_mem_bus_if) + ); + + `ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[ICACHE_MEM_ARB_IDX], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH); + `ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[DCACHE_MEM_ARB_IDX], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH); `RESET_RELAY (l2_reset, reset); @@ -113,67 +166,12 @@ module VX_cluster import VX_gpu_pkg::*; #( .clk (clk), .reset (l2_reset), `ifdef PERF_ENABLE - .cache_perf (perf_l2cache), + .cache_perf (mem_perf_tmp_if.l2cache), `endif - .core_bus_if (l1_mem_bus_if), + .core_bus_if (l2_mem_bus_if), .mem_bus_if 
(mem_bus_if) ); - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) - ) per_socket_icache_mem_bus_if[`NUM_SOCKETS](); - - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) - ) per_socket_dcache_mem_bus_if[`NUM_SOCKETS](); - - VX_mem_bus_if #( - .DATA_SIZE (ICACHE_LINE_SIZE), - .TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH) - ) icache_mem_bus_if[1](); - - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_LINE_SIZE), - .TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH) - ) dcache_mem_bus_if[1](); - - `RESET_RELAY (l1_mem_arb_reset, reset); - - VX_mem_arb #( - .NUM_INPUTS (`NUM_SOCKETS), - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH), - .TAG_SEL_IDX (1), // Skip 0 for NC flag - .ARBITER ("R"), - .OUT_REG_REQ (2), - .OUT_REG_RSP (2) - ) icache_mem_arb ( - .clk (clk), - .reset (l1_mem_arb_reset), - .bus_in_if (per_socket_icache_mem_bus_if), - .bus_out_if (icache_mem_bus_if) - ); - - VX_mem_arb #( - .NUM_INPUTS (`NUM_SOCKETS), - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH), - .TAG_SEL_IDX (1), // Skip 0 for NC flag - .ARBITER ("R"), - .OUT_REG_REQ (2), - .OUT_REG_RSP (2) - ) dcache_mem_arb ( - .clk (clk), - .reset (l1_mem_arb_reset), - .bus_in_if (per_socket_dcache_mem_bus_if), - .bus_out_if (dcache_mem_bus_if) - ); - - `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH); - `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH); - /////////////////////////////////////////////////////////////////////////// wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak; @@ -201,6 +199,7 @@ module VX_cluster import VX_gpu_pkg::*; #( .SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i) ) socket ( `SCOPE_IO_BIND (scope_socket+i) + .clk (clk), .reset (socket_reset), @@ -212,7 +211,7 @@ module VX_cluster import VX_gpu_pkg::*; #( .icache_mem_bus_if (per_socket_icache_mem_bus_if[i]), .dcache_mem_bus_if 
(per_socket_dcache_mem_bus_if[i]), - + `ifdef GBAR_ENABLE .gbar_bus_if (per_socket_gbar_bus_if[i]), `endif diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index f39e7fea..63f2d42d 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -1,432 +1,437 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`ifndef VX_DEFINE_VH -`define VX_DEFINE_VH - -`include "VX_platform.vh" -`include "VX_config.vh" -`include "VX_types.vh" - -/////////////////////////////////////////////////////////////////////////////// - -`define NW_BITS `CLOG2(`NUM_WARPS) -`define NC_WIDTH `UP(`NC_BITS) - -`define NT_BITS `CLOG2(`NUM_THREADS) -`define NW_WIDTH `UP(`NW_BITS) - -`define NC_BITS `CLOG2(`NUM_CORES) -`define NT_WIDTH `UP(`NT_BITS) - -`define NB_BITS `CLOG2(`NUM_BARRIERS) -`define NB_WIDTH `UP(`NB_BITS) - -`define NUM_IREGS 32 - -`define NRI_BITS `CLOG2(`NUM_IREGS) - -`ifdef EXT_F_ENABLE -`define NUM_REGS (2 * `NUM_IREGS) -`else -`define NUM_REGS `NUM_IREGS -`endif - -`define NR_BITS `CLOG2(`NUM_REGS) - -`define PERF_CTR_BITS 44 - -`ifndef NDEBUG -`define UUID_WIDTH 44 -`else -`define UUID_WIDTH 1 -`endif - -/////////////////////////////////////////////////////////////////////////////// - -`define EX_ALU 0 -`define EX_LSU 1 -`define EX_SFU 2 -`define EX_FPU 3 - -`define NUM_EX_UNITS (3 + `EXT_F_ENABLED) -`define EX_BITS `CLOG2(`NUM_EX_UNITS) - -/////////////////////////////////////////////////////////////////////////////// - -`define INST_LUI 
7'b0110111 -`define INST_AUIPC 7'b0010111 -`define INST_JAL 7'b1101111 -`define INST_JALR 7'b1100111 -`define INST_B 7'b1100011 // branch instructions -`define INST_L 7'b0000011 // load instructions -`define INST_S 7'b0100011 // store instructions -`define INST_I 7'b0010011 // immediate instructions -`define INST_R 7'b0110011 // register instructions -`define INST_FENCE 7'b0001111 // Fence instructions -`define INST_SYS 7'b1110011 // system instructions - -// RV64I instruction specific opcodes (for any W instruction) -`define INST_I_W 7'b0011011 // W type immediate instructions -`define INST_R_W 7'b0111011 // W type register instructions - -`define INST_FL 7'b0000111 // float load instruction -`define INST_FS 7'b0100111 // float store instruction -`define INST_FMADD 7'b1000011 -`define INST_FMSUB 7'b1000111 -`define INST_FNMSUB 7'b1001011 -`define INST_FNMADD 7'b1001111 -`define INST_FCI 7'b1010011 // float common instructions - -// Custom extension opcodes -`define INST_EXT1 7'b0001011 // 0x0B -`define INST_EXT2 7'b0101011 // 0x2B -`define INST_EXT3 7'b1011011 // 0x5B -`define INST_EXT4 7'b1111011 // 0x7B - -/////////////////////////////////////////////////////////////////////////////// - -`define INST_FRM_RNE 3'b000 // round to nearest even -`define INST_FRM_RTZ 3'b001 // round to zero -`define INST_FRM_RDN 3'b010 // round to -inf -`define INST_FRM_RUP 3'b011 // round to +inf -`define INST_FRM_RMM 3'b100 // round to nearest max magnitude -`define INST_FRM_DYN 3'b111 // dynamic mode -`define INST_FRM_BITS 3 - -/////////////////////////////////////////////////////////////////////////////// - -`define INST_OP_BITS 4 -`define INST_MOD_BITS 3 -`define INST_FMT_BITS 2 - -/////////////////////////////////////////////////////////////////////////////// - -`define INST_ALU_ADD 4'b0000 -`define INST_ALU_LUI 4'b0010 -`define INST_ALU_AUIPC 4'b0011 -`define INST_ALU_SLTU 4'b0100 -`define INST_ALU_SLT 4'b0101 -`define INST_ALU_SUB 4'b0111 -`define INST_ALU_SRL 4'b1000 -`define 
INST_ALU_SRA 4'b1001 -`define INST_ALU_AND 4'b1100 -`define INST_ALU_OR 4'b1101 -`define INST_ALU_XOR 4'b1110 -`define INST_ALU_SLL 4'b1111 -`define INST_ALU_OTHER 4'b0111 -`define INST_ALU_BITS 4 -`define INST_ALU_CLASS(op) op[3:2] -`define INST_ALU_SIGNED(op) op[0] -`define INST_ALU_IS_SUB(op) op[1] -`define INST_ALU_IS_BR(mod) mod[0] -`define INST_ALU_IS_M(mod) mod[1] -`define INST_ALU_IS_W(mod) mod[2] - -`define INST_BR_EQ 4'b0000 -`define INST_BR_NE 4'b0010 -`define INST_BR_LTU 4'b0100 -`define INST_BR_GEU 4'b0110 -`define INST_BR_LT 4'b0101 -`define INST_BR_GE 4'b0111 -`define INST_BR_JAL 4'b1000 -`define INST_BR_JALR 4'b1001 -`define INST_BR_ECALL 4'b1010 -`define INST_BR_EBREAK 4'b1011 -`define INST_BR_URET 4'b1100 -`define INST_BR_SRET 4'b1101 -`define INST_BR_MRET 4'b1110 -`define INST_BR_OTHER 4'b1111 -`define INST_BR_BITS 4 -`define INST_BR_CLASS(op) {1'b0, ~op[3]} -`define INST_BR_IS_NEG(op) op[1] -`define INST_BR_IS_LESS(op) op[2] -`define INST_BR_IS_STATIC(op) op[3] - -`define INST_M_MUL 3'b000 -`define INST_M_MULHU 3'b001 -`define INST_M_MULH 3'b010 -`define INST_M_MULHSU 3'b011 -`define INST_M_DIV 3'b100 -`define INST_M_DIVU 3'b101 -`define INST_M_REM 3'b110 -`define INST_M_REMU 3'b111 -`define INST_M_BITS 3 -`define INST_M_SIGNED(op) (~op[0]) -`define INST_M_IS_MULX(op) (~op[2]) -`define INST_M_IS_MULH(op) (op[1:0] != 0) -`define INST_M_SIGNED_A(op) (op[1:0] != 1) -`define INST_M_IS_REM(op) op[1] - -`define INST_FMT_B 3'b000 -`define INST_FMT_H 3'b001 -`define INST_FMT_W 3'b010 -`define INST_FMT_D 3'b011 -`define INST_FMT_BU 3'b100 -`define INST_FMT_HU 3'b101 -`define INST_FMT_WU 3'b110 - -`define INST_LSU_LB 4'b0000 -`define INST_LSU_LH 4'b0001 -`define INST_LSU_LW 4'b0010 -`define INST_LSU_LD 4'b0011 // new for RV64I LD -`define INST_LSU_LBU 4'b0100 -`define INST_LSU_LHU 4'b0101 -`define INST_LSU_LWU 4'b0110 // new for RV64I LWU -`define INST_LSU_SB 4'b1000 -`define INST_LSU_SH 4'b1001 -`define INST_LSU_SW 4'b1010 -`define INST_LSU_SD 4'b1011 // 
new for RV64I SD -`define INST_LSU_FENCE 4'b1111 -`define INST_LSU_BITS 4 -`define INST_LSU_FMT(op) op[2:0] -`define INST_LSU_WSIZE(op) op[1:0] -`define INST_LSU_IS_FENCE(op) (op[3:2] == 3) - -`define INST_FENCE_BITS 1 -`define INST_FENCE_D 1'h0 -`define INST_FENCE_I 1'h1 - -`define INST_FPU_ADD 4'b0000 -`define INST_FPU_SUB 4'b0001 -`define INST_FPU_MUL 4'b0010 -`define INST_FPU_DIV 4'b0011 -`define INST_FPU_SQRT 4'b0100 -`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2 -`define INST_FPU_F2F 4'b0110 -`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7 -`define INST_FPU_F2I 4'b1000 -`define INST_FPU_F2U 4'b1001 -`define INST_FPU_I2F 4'b1010 -`define INST_FPU_U2F 4'b1011 -`define INST_FPU_MADD 4'b1100 -`define INST_FPU_MSUB 4'b1101 -`define INST_FPU_NMSUB 4'b1110 -`define INST_FPU_NMADD 4'b1111 -`define INST_FPU_BITS 4 -`define INST_FPU_IS_W(mod) (mod[4]) -`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3) -`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4) - -`define INST_SFU_TMC 4'h0 -`define INST_SFU_WSPAWN 4'h1 -`define INST_SFU_SPLIT 4'h2 -`define INST_SFU_JOIN 4'h3 -`define INST_SFU_BAR 4'h4 -`define INST_SFU_PRED 4'h5 -`define INST_SFU_CSRRW 4'h6 -`define INST_SFU_CSRRS 4'h7 -`define INST_SFU_CSRRC 4'h8 -`define INST_SFU_CMOV 4'h9 -`define INST_SFU_BITS 4 -`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1) -`define INST_SFU_IS_WCTL(op) (op <= 5) -`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) - -/////////////////////////////////////////////////////////////////////////////// - -// non-cacheable tag bits -`define NC_TAG_BITS 1 - -// cache address type bits -`ifdef SM_ENABLE -`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1) -`else -`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS -`endif - -`define ARB_SEL_BITS(I, O) ((I > O) ? 
`CLOG2((I + O - 1) / O) : 0) - -/////////////////////////////////////////////////////////////////////////////// - -`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \ - (`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS) - -`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \ - (`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width) - -`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \ - (`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS) - -`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \ - `MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) - -/////////////////////////////////////////////////////////////////////////////// - -`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \ - (tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches))) - -`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \ - (tag_width + `ARB_SEL_BITS(`UP(num_caches), 1)) - -`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \ - `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches) - -`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ - `CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches) - -`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ - `CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches) - -`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ - 
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches) - -/////////////////////////////////////////////////////////////////////////////// - -`ifdef L2_ENABLE -`define L2_LINE_SIZE `MEM_BLOCK_SIZE -`else -`define L2_LINE_SIZE `L1_LINE_SIZE -`endif - -`ifdef L3_ENABLE -`define L3_LINE_SIZE `MEM_BLOCK_SIZE -`else -`define L3_LINE_SIZE `L2_LINE_SIZE -`endif - -`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE -`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE)) -`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8) -`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH - -`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS -`define VX_DCR_DATA_WIDTH 32 - -`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)} - -/////////////////////////////////////////////////////////////////////////////// - -`define BUFFER_EX(dst, src, ena, latency) \ - VX_pipe_register #( \ - .DATAW ($bits(dst)), \ - .RESETW ($bits(dst)), \ - .DEPTH (latency) \ - ) __``dst ( \ - .clk (clk), \ - .reset (reset), \ - .enable (ena), \ - .data_in (src), \ - .data_out (dst) \ - ) - -`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1) - -`define POP_COUNT_EX(out, in, model) \ - VX_popcount #( \ - .N ($bits(in)), \ - .MODEL (model) \ - ) __``out ( \ - .data_in (in), \ - .data_out (out) \ - ) - -`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1) - -`define ASSIGN_VX_MEM_BUS_IF(dst, src) \ - assign dst.req_valid = src.req_valid; \ - assign dst.req_data = src.req_data; \ - assign src.req_ready = dst.req_ready; \ - assign src.rsp_valid = dst.rsp_valid; \ - assign src.rsp_data = dst.rsp_data; \ - assign dst.rsp_ready = src.rsp_ready - -`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \ - assign dst.req_valid = src.req_valid; \ - assign dst.req_data.rw = src.req_data.rw; \ - assign dst.req_data.byteen = src.req_data.byteen; \ - assign dst.req_data.addr = 
src.req_data.addr; \ - assign dst.req_data.data = src.req_data.data; \ - if (TD != TS) \ - assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \ - else \ - assign dst.req_data.tag = src.req_data.tag; \ - assign src.req_ready = dst.req_ready; \ - assign src.rsp_valid = dst.rsp_valid; \ - assign src.rsp_data.data = dst.rsp_data.data; \ - assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \ - assign dst.rsp_ready = src.rsp_ready - -`define BUFFER_DCR_BUS_IF(dst, src, enable) \ - logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \ - if (enable) begin \ - always @(posedge clk) begin \ - __``dst <= {src.write_valid, src.write_addr, src.write_data}; \ - end \ - end else begin \ - assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \ - end \ - VX_dcr_bus_if dst(); \ - assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst - -`define PERF_REDUCE(dst, src, field, width, count) \ - wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \ - wire [width-1:0] __reduce_add_o_``dst``field; \ - reg [width-1:0] __reduce_add_r_``dst``field; \ - for (genvar __i = 0; __i < count; ++__i) begin \ - assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \ - end \ - VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \ - __reduce_add_i_``src``field, \ - __reduce_add_o_``dst``field \ - ); \ - always @(posedge clk) begin \ - if (reset) begin \ - __reduce_add_r_``dst``field <= '0; \ - end else begin \ - __reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \ - end \ - end \ - assign ``dst.``field = __reduce_add_r_``dst``field - -`define PERF_CACHE_REDUCE(dst, src, count) \ - `PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \ - `PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \ - `PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \ - `PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \ - `PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \ - 
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \ - `PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \ - `PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count) - -`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \ - if (block_size != 1) begin \ - if (block_size != `NUM_WARPS) begin \ - assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \ - end else begin \ - assign dst = `NW_WIDTH'(block_idx); \ - end \ - end else begin \ - assign dst = src; \ - end - -`define TO_DISPATCH_DATA(data, tid) { \ - data.uuid, \ - data.wis, \ - data.tmask, \ - data.op_type, \ - data.op_mod, \ - data.wb, \ - data.use_PC, \ - data.use_imm, \ - data.PC, \ - data.imm, \ - data.rd, \ - tid, \ - data.rs1_data, \ - data.rs2_data, \ - data.rs3_data} - -/////////////////////////////////////////////////////////////////////////////// - -`endif // VX_DEFINE_VH +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`ifndef VX_DEFINE_VH +`define VX_DEFINE_VH + +`include "VX_platform.vh" +`include "VX_config.vh" +`include "VX_types.vh" + +/////////////////////////////////////////////////////////////////////////////// + +`define NW_BITS `CLOG2(`NUM_WARPS) +`define NC_WIDTH `UP(`NC_BITS) + +`define NT_BITS `CLOG2(`NUM_THREADS) +`define NW_WIDTH `UP(`NW_BITS) + +`define NC_BITS `CLOG2(`NUM_CORES) +`define NT_WIDTH `UP(`NT_BITS) + +`define NB_BITS `CLOG2(`NUM_BARRIERS) +`define NB_WIDTH `UP(`NB_BITS) + +`define NUM_IREGS 32 + +`define NRI_BITS `CLOG2(`NUM_IREGS) + +`ifdef EXT_F_ENABLE +`define NUM_REGS (2 * `NUM_IREGS) +`else +`define NUM_REGS `NUM_IREGS +`endif + +`define NR_BITS `CLOG2(`NUM_REGS) + +`define PERF_CTR_BITS 44 + +`ifndef NDEBUG +`define UUID_WIDTH 44 +`else +`define UUID_WIDTH 1 +`endif + +/////////////////////////////////////////////////////////////////////////////// + +`define EX_ALU 0 +`define EX_LSU 1 +`define EX_SFU 2 +`define EX_FPU (`EX_SFU + `EXT_F_ENABLED) + +`define NUM_EX_UNITS (3 + `EXT_F_ENABLED) +`define EX_BITS `CLOG2(`NUM_EX_UNITS) +`define EX_WIDTH `UP(`EX_BITS) + +`define SFU_CSRS 0 +`define SFU_WCTL 1 + +`define NUM_SFU_UNITS (2) +`define SFU_BITS `CLOG2(`NUM_SFU_UNITS) +`define SFU_WIDTH `UP(`SFU_BITS) + +/////////////////////////////////////////////////////////////////////////////// + +`define INST_LUI 7'b0110111 +`define INST_AUIPC 7'b0010111 +`define INST_JAL 7'b1101111 +`define INST_JALR 7'b1100111 +`define INST_B 7'b1100011 // branch instructions +`define INST_L 7'b0000011 // load instructions +`define INST_S 7'b0100011 // store instructions +`define INST_I 7'b0010011 // immediate instructions +`define INST_R 7'b0110011 // register instructions +`define INST_FENCE 7'b0001111 // Fence instructions +`define INST_SYS 7'b1110011 // system instructions + +// RV64I instruction specific opcodes (for any W instruction) +`define INST_I_W 7'b0011011 // W type immediate instructions +`define INST_R_W 7'b0111011 // W type register instructions + 
+`define INST_FL 7'b0000111 // float load instruction +`define INST_FS 7'b0100111 // float store instruction +`define INST_FMADD 7'b1000011 +`define INST_FMSUB 7'b1000111 +`define INST_FNMSUB 7'b1001011 +`define INST_FNMADD 7'b1001111 +`define INST_FCI 7'b1010011 // float common instructions + +// Custom extension opcodes +`define INST_EXT1 7'b0001011 // 0x0B +`define INST_EXT2 7'b0101011 // 0x2B +`define INST_EXT3 7'b1011011 // 0x5B +`define INST_EXT4 7'b1111011 // 0x7B + +/////////////////////////////////////////////////////////////////////////////// + +`define INST_FRM_RNE 3'b000 // round to nearest even +`define INST_FRM_RTZ 3'b001 // round to zero +`define INST_FRM_RDN 3'b010 // round to -inf +`define INST_FRM_RUP 3'b011 // round to +inf +`define INST_FRM_RMM 3'b100 // round to nearest max magnitude +`define INST_FRM_DYN 3'b111 // dynamic mode +`define INST_FRM_BITS 3 + +/////////////////////////////////////////////////////////////////////////////// + +`define INST_OP_BITS 4 +`define INST_MOD_BITS 3 +`define INST_FMT_BITS 2 + +/////////////////////////////////////////////////////////////////////////////// + +`define INST_ALU_ADD 4'b0000 +`define INST_ALU_LUI 4'b0010 +`define INST_ALU_AUIPC 4'b0011 +`define INST_ALU_SLTU 4'b0100 +`define INST_ALU_SLT 4'b0101 +`define INST_ALU_SUB 4'b0111 +`define INST_ALU_SRL 4'b1000 +`define INST_ALU_SRA 4'b1001 +`define INST_ALU_AND 4'b1100 +`define INST_ALU_OR 4'b1101 +`define INST_ALU_XOR 4'b1110 +`define INST_ALU_SLL 4'b1111 +`define INST_ALU_OTHER 4'b0111 +`define INST_ALU_BITS 4 +`define INST_ALU_CLASS(op) op[3:2] +`define INST_ALU_SIGNED(op) op[0] +`define INST_ALU_IS_SUB(op) op[1] +`define INST_ALU_IS_BR(mod) mod[0] +`define INST_ALU_IS_M(mod) mod[1] +`define INST_ALU_IS_W(mod) mod[2] + +`define INST_BR_EQ 4'b0000 +`define INST_BR_NE 4'b0010 +`define INST_BR_LTU 4'b0100 +`define INST_BR_GEU 4'b0110 +`define INST_BR_LT 4'b0101 +`define INST_BR_GE 4'b0111 +`define INST_BR_JAL 4'b1000 +`define INST_BR_JALR 4'b1001 
+`define INST_BR_ECALL 4'b1010 +`define INST_BR_EBREAK 4'b1011 +`define INST_BR_URET 4'b1100 +`define INST_BR_SRET 4'b1101 +`define INST_BR_MRET 4'b1110 +`define INST_BR_OTHER 4'b1111 +`define INST_BR_BITS 4 +`define INST_BR_CLASS(op) {1'b0, ~op[3]} +`define INST_BR_IS_NEG(op) op[1] +`define INST_BR_IS_LESS(op) op[2] +`define INST_BR_IS_STATIC(op) op[3] + +`define INST_M_MUL 3'b000 +`define INST_M_MULHU 3'b001 +`define INST_M_MULH 3'b010 +`define INST_M_MULHSU 3'b011 +`define INST_M_DIV 3'b100 +`define INST_M_DIVU 3'b101 +`define INST_M_REM 3'b110 +`define INST_M_REMU 3'b111 +`define INST_M_BITS 3 +`define INST_M_SIGNED(op) (~op[0]) +`define INST_M_IS_MULX(op) (~op[2]) +`define INST_M_IS_MULH(op) (op[1:0] != 0) +`define INST_M_SIGNED_A(op) (op[1:0] != 1) +`define INST_M_IS_REM(op) op[1] + +`define INST_FMT_B 3'b000 +`define INST_FMT_H 3'b001 +`define INST_FMT_W 3'b010 +`define INST_FMT_D 3'b011 +`define INST_FMT_BU 3'b100 +`define INST_FMT_HU 3'b101 +`define INST_FMT_WU 3'b110 + +`define INST_LSU_LB 4'b0000 +`define INST_LSU_LH 4'b0001 +`define INST_LSU_LW 4'b0010 +`define INST_LSU_LD 4'b0011 // new for RV64I LD +`define INST_LSU_LBU 4'b0100 +`define INST_LSU_LHU 4'b0101 +`define INST_LSU_LWU 4'b0110 // new for RV64I LWU +`define INST_LSU_SB 4'b1000 +`define INST_LSU_SH 4'b1001 +`define INST_LSU_SW 4'b1010 +`define INST_LSU_SD 4'b1011 // new for RV64I SD +`define INST_LSU_FENCE 4'b1111 +`define INST_LSU_BITS 4 +`define INST_LSU_FMT(op) op[2:0] +`define INST_LSU_WSIZE(op) op[1:0] +`define INST_LSU_IS_FENCE(op) (op[3:2] == 3) + +`define INST_FENCE_BITS 1 +`define INST_FENCE_D 1'h0 +`define INST_FENCE_I 1'h1 + +`define INST_FPU_ADD 4'b0000 +`define INST_FPU_SUB 4'b0001 +`define INST_FPU_MUL 4'b0010 +`define INST_FPU_DIV 4'b0011 +`define INST_FPU_SQRT 4'b0100 +`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2 +`define INST_FPU_F2F 4'b0110 +`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7 +`define INST_FPU_F2I 
4'b1000 +`define INST_FPU_F2U 4'b1001 +`define INST_FPU_I2F 4'b1010 +`define INST_FPU_U2F 4'b1011 +`define INST_FPU_MADD 4'b1100 +`define INST_FPU_MSUB 4'b1101 +`define INST_FPU_NMSUB 4'b1110 +`define INST_FPU_NMADD 4'b1111 +`define INST_FPU_BITS 4 +`define INST_FPU_IS_W(mod) (mod[4]) +`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3) +`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4) + +`define INST_SFU_TMC 4'h0 +`define INST_SFU_WSPAWN 4'h1 +`define INST_SFU_SPLIT 4'h2 +`define INST_SFU_JOIN 4'h3 +`define INST_SFU_BAR 4'h4 +`define INST_SFU_PRED 4'h5 +`define INST_SFU_CSRRW 4'h6 +`define INST_SFU_CSRRS 4'h7 +`define INST_SFU_CSRRC 4'h8 +`define INST_SFU_CMOV 4'h9 +`define INST_SFU_BITS 4 +`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1) +`define INST_SFU_IS_WCTL(op) (op <= 5) +`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) + +/////////////////////////////////////////////////////////////////////////////// + +// non-cacheable tag bits +`define NC_TAG_BITS 1 + +// cache address type bits +`ifdef SM_ENABLE +`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1) +`else +`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS +`endif + +`define ARB_SEL_BITS(I, O) ((I > O) ? 
`CLOG2((I + O - 1) / O) : 0) + +/////////////////////////////////////////////////////////////////////////////// + +`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \ + (`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS) + +`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \ + (`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width) + +`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \ + (`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS) + +`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \ + `MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + +/////////////////////////////////////////////////////////////////////////////// + +`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \ + (tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches))) + +`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \ + (tag_width + `ARB_SEL_BITS(`UP(num_caches), 1)) + +`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \ + `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches) + +`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ + `CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches) + +`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ + `CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches) + +`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ + 
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches) + +/////////////////////////////////////////////////////////////////////////////// + +`ifdef L2_ENABLE +`define L2_LINE_SIZE `MEM_BLOCK_SIZE +`else +`define L2_LINE_SIZE `L1_LINE_SIZE +`endif + +`ifdef L3_ENABLE +`define L3_LINE_SIZE `MEM_BLOCK_SIZE +`else +`define L3_LINE_SIZE `L2_LINE_SIZE +`endif + +`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE +`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE)) +`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8) +`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH + +`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS +`define VX_DCR_DATA_WIDTH 32 + +`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)} + +/////////////////////////////////////////////////////////////////////////////// + +`define BUFFER_EX(dst, src, ena, latency) \ + VX_pipe_register #( \ + .DATAW ($bits(dst)), \ + .RESETW ($bits(dst)), \ + .DEPTH (latency) \ + ) __``dst ( \ + .clk (clk), \ + .reset (reset), \ + .enable (ena), \ + .data_in (src), \ + .data_out (dst) \ + ) + +`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1) + +`define POP_COUNT_EX(out, in, model) \ + VX_popcount #( \ + .N ($bits(in)), \ + .MODEL (model) \ + ) __``out ( \ + .data_in (in), \ + .data_out (out) \ + ) + +`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1) + +`define ASSIGN_VX_MEM_BUS_IF(dst, src) \ + assign dst.req_valid = src.req_valid; \ + assign dst.req_data = src.req_data; \ + assign src.req_ready = dst.req_ready; \ + assign src.rsp_valid = dst.rsp_valid; \ + assign src.rsp_data = dst.rsp_data; \ + assign dst.rsp_ready = src.rsp_ready + +`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \ + assign dst.req_valid = src.req_valid; \ + assign dst.req_data.rw = src.req_data.rw; \ + assign dst.req_data.byteen = src.req_data.byteen; \ + assign dst.req_data.addr = 
src.req_data.addr; \ + assign dst.req_data.data = src.req_data.data; \ + if (TD != TS) \ + assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \ + else \ + assign dst.req_data.tag = src.req_data.tag; \ + assign src.req_ready = dst.req_ready; \ + assign src.rsp_valid = dst.rsp_valid; \ + assign src.rsp_data.data = dst.rsp_data.data; \ + assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \ + assign dst.rsp_ready = src.rsp_ready + +`define BUFFER_DCR_BUS_IF(dst, src, enable) \ + logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \ + if (enable) begin \ + always @(posedge clk) begin \ + __``dst <= {src.write_valid, src.write_addr, src.write_data}; \ + end \ + end else begin \ + assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \ + end \ + VX_dcr_bus_if dst(); \ + assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst + +`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \ + for (genvar __d = 0; __d < dst_count; ++__d) begin \ + localparam __count = ((src_count > dst_count) ? 
((src_count + dst_count - 1) / dst_count) : 1); \ + wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \ + wire [width-1:0] __reduce_add_o_``dst``field; \ + for (genvar __i = 0; __i < __count; ++__i) begin \ + assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \ + end \ + VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \ + __reduce_add_i_``src``field, \ + __reduce_add_o_``dst``field \ + ); \ + if (reg_enable) begin \ + reg [width-1:0] __reduce_add_r_``dst``field; \ + always @(posedge clk) begin \ + if (reset) begin \ + __reduce_add_r_``dst``field <= '0; \ + end else begin \ + __reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \ + end \ + end \ + assign ``dst[__d].``field = __reduce_add_r_``dst``field; \ + end else begin \ + assign ``dst[__d].``field = __reduce_add_o_``dst``field; \ + end \ + end + +`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \ + if (block_size != 1) begin \ + if (block_size != `NUM_WARPS) begin \ + assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \ + end else begin \ + assign dst = `NW_WIDTH'(block_idx); \ + end \ + end else begin \ + assign dst = src; \ + end + +`define TO_DISPATCH_DATA(data, tid) { \ + data.uuid, \ + data.wis, \ + data.tmask, \ + data.op_type, \ + data.op_mod, \ + data.wb, \ + data.use_PC, \ + data.use_imm, \ + data.PC, \ + data.imm, \ + data.rd, \ + tid, \ + data.rs1_data, \ + data.rs2_data, \ + data.rs3_data} + +/////////////////////////////////////////////////////////////////////////////// + +`endif // VX_DEFINE_VH diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index b32b9600..94fe7684 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -99,7 +99,7 @@ package VX_gpu_pkg; `ifdef ICACHE_ENABLE localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES); `else - localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, 
ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES); + localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES); `endif ////////////////////////// Dcache Parameters ////////////////////////////// @@ -147,6 +147,9 @@ package VX_gpu_pkg; /////////////////////////////// L2 Parameters ///////////////////////////// + localparam ICACHE_MEM_ARB_IDX = 0; + localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1; + // Word size in bytes localparam L2_WORD_SIZE = `L1_LINE_SIZE; diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 74a074d1..31a01b50 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -66,18 +66,11 @@ module VX_socket import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE VX_mem_perf_if mem_perf_tmp_if(); - cache_perf_t perf_icache; - cache_perf_t perf_dcache; - - assign mem_perf_tmp_if.icache = perf_icache; - assign mem_perf_tmp_if.dcache = perf_dcache; assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; assign mem_perf_tmp_if.smem = 'x; assign mem_perf_tmp_if.mem = mem_perf_if.mem; -`endif - - +`endif /////////////////////////////////////////////////////////////////////////// @@ -110,7 +103,7 @@ module VX_socket import VX_gpu_pkg::*; #( .MEM_OUT_REG (2) ) icache ( `ifdef PERF_ENABLE - .cache_perf (perf_icache), + .cache_perf (mem_perf_tmp_if.icache), `endif .clk (clk), .reset (icache_reset), @@ -121,7 +114,7 @@ module VX_socket import VX_gpu_pkg::*; #( /////////////////////////////////////////////////////////////////////////// VX_mem_bus_if #( - .DATA_SIZE (DCACHE_WORD_SIZE), + .DATA_SIZE (DCACHE_WORD_SIZE), .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) ) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS](); @@ -150,7 +143,7 @@ module VX_socket import VX_gpu_pkg::*; #( .MEM_OUT_REG (2) ) dcache ( `ifdef PERF_ENABLE - .cache_perf (perf_dcache), + .cache_perf (mem_perf_tmp_if.dcache), 
`endif .clk (clk), .reset (dcache_reset), diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index a5044ccf..9e875d21 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -97,6 +97,11 @@ `define VX_CSR_MPM_IFETCH_LT_H 12'hB8E `define VX_CSR_MPM_LOAD_LT 12'hB0F `define VX_CSR_MPM_LOAD_LT_H 12'hB8F +// SFU: scoreboard +`define VX_CSR_MPM_SCRB_WCTL 12'hB10 +`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90 +`define VX_CSR_MPM_SCRB_CSRS 12'hB11 +`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91 // Machine Performance-monitoring memory counters // PERF: icache diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index e9d068f7..a955c8f5 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -22,15 +22,15 @@ module Vortex import VX_gpu_pkg::*; ( // Memory request output wire mem_req_valid, - output wire mem_req_rw, - output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen, + output wire mem_req_rw, + output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen, output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr, output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data, output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag, input wire mem_req_ready, // Memory response - input wire mem_rsp_valid, + input wire mem_rsp_valid, input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data, input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag, output wire mem_rsp_ready, @@ -45,17 +45,11 @@ module Vortex import VX_gpu_pkg::*; ( ); `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_if(); - cache_perf_t perf_l3cache; - mem_perf_t mem_perf; - - assign mem_perf_if.smem = 'x; + VX_mem_perf_if mem_perf_if(); assign mem_perf_if.icache = 'x; assign mem_perf_if.dcache = 'x; assign mem_perf_if.l2cache = 'x; - assign mem_perf_if.l3cache = perf_l3cache; - assign mem_perf_if.mem = mem_perf; -`endif +`endif VX_mem_bus_if #( .DATA_SIZE (`L2_LINE_SIZE), @@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; ( .reset (l3_reset), `ifdef PERF_ENABLE - .cache_perf (perf_l3cache), + .cache_perf (mem_perf_if.l3cache), `endif .core_bus_if (per_cluster_mem_bus_if), 
@@ -166,11 +160,12 @@ module Vortex import VX_gpu_pkg::*; ( ); end - `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1)); + `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1)); `ifdef PERF_ENABLE - reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; + reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; + mem_perf_t mem_perf; always @(posedge clk) begin if (reset) begin @@ -193,6 +188,7 @@ module Vortex import VX_gpu_pkg::*; ( mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; end end + assign mem_perf_if.mem = mem_perf; `endif diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index 18e26eb2..46447e4f 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( `STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter")) `ifdef PERF_ENABLE - cache_perf_t perf_cache_unit[NUM_CACHES]; - `PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES); + cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES]; + `PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES) + assign cache_perf = perf_cache_tmp[0]; `endif VX_mem_bus_if #( diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 0bb675fa..7567025f 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -62,4 +62,16 @@ `define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))} `define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)} +/////////////////////////////////////////////////////////////////////////////// + +`define PERF_CACHE_ADD(dst, src, dcount, scount) \ + `PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \ + `PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \ + `PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, 
dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \ + `PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \ + `PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \ + `PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \ + `PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \ + `PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) + `endif // VX_CACHE_DEFINE_VH diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 4d3ce297..dde085a8 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -1,344 +1,338 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -`include "VX_define.vh" - -`ifdef EXT_F_ENABLE -`include "VX_fpu_define.vh" -`endif - -module VX_core import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 -) ( - `SCOPE_IO_DECL - - // Clock - input wire clk, - input wire reset, - -`ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_if, -`endif - - VX_dcr_bus_if.slave dcr_bus_if, - - VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS], - - VX_mem_bus_if.master icache_bus_if, - -`ifdef GBAR_ENABLE - VX_gbar_bus_if.master gbar_bus_if, -`endif - - // simulation helper signals - output wire sim_ebreak, - output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value, - - // Status - output wire busy -); - VX_schedule_if schedule_if(); - VX_fetch_if fetch_if(); - VX_decode_if decode_if(); - VX_sched_csr_if sched_csr_if(); - VX_decode_sched_if decode_sched_if(); - VX_commit_sched_if commit_sched_if(); - VX_commit_csr_if commit_csr_if(); - VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS](); - VX_warp_ctl_if warp_ctl_if(); - - VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH](); - VX_commit_if alu_commit_if[`ISSUE_WIDTH](); - - VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH](); - VX_commit_if lsu_commit_if[`ISSUE_WIDTH](); -`ifdef EXT_F_ENABLE - VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH](); - VX_commit_if fpu_commit_if[`ISSUE_WIDTH](); -`endif - VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH](); - VX_commit_if sfu_commit_if[`ISSUE_WIDTH](); - - VX_writeback_if writeback_if[`ISSUE_WIDTH](); - - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_TAG_WIDTH) - ) dcache_bus_tmp_if[DCACHE_NUM_REQS](); - -`ifdef PERF_ENABLE - VX_pipeline_perf_if pipeline_perf_if(); - VX_mem_perf_if mem_perf_tmp_if(); - - assign mem_perf_tmp_if.icache = mem_perf_if.icache; - assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; - assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; - assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; -`ifdef SM_ENABLE - cache_perf_t smem_perf; - assign mem_perf_tmp_if.smem = smem_perf; -`else - assign 
mem_perf_tmp_if.smem = '0; -`endif - assign mem_perf_tmp_if.mem = mem_perf_if.mem; -`endif - - `RESET_RELAY (dcr_data_reset, reset); - `RESET_RELAY (schedule_reset, reset); - `RESET_RELAY (fetch_reset, reset); - `RESET_RELAY (decode_reset, reset); - `RESET_RELAY (issue_reset, reset); - `RESET_RELAY (execute_reset, reset); - `RESET_RELAY (commit_reset, reset); - - base_dcrs_t base_dcrs; - - VX_dcr_data dcr_data ( - .clk (clk), - .reset (dcr_data_reset), - .dcr_bus_if (dcr_bus_if), - .base_dcrs (base_dcrs) - ); - - `SCOPE_IO_SWITCH (3) - - VX_schedule #( - .CORE_ID (CORE_ID) - ) schedule ( - .clk (clk), - .reset (schedule_reset), - - `ifdef PERF_ENABLE - .perf_schedule_if (pipeline_perf_if.schedule), - `endif - - .base_dcrs (base_dcrs), - - .warp_ctl_if (warp_ctl_if), - .branch_ctl_if (branch_ctl_if), - .decode_sched_if(decode_sched_if), - .commit_sched_if(commit_sched_if), - - .schedule_if (schedule_if), - `ifdef GBAR_ENABLE - .gbar_bus_if (gbar_bus_if), - `endif - .sched_csr_if (sched_csr_if), - - .busy (busy) - ); - - VX_fetch #( - .CORE_ID (CORE_ID) - ) fetch ( - `SCOPE_IO_BIND (0) - .clk (clk), - .reset (fetch_reset), - .icache_bus_if (icache_bus_if), - .schedule_if (schedule_if), - .fetch_if (fetch_if) - ); - - VX_decode #( - .CORE_ID (CORE_ID) - ) decode ( - .clk (clk), - .reset (decode_reset), - .fetch_if (fetch_if), - .decode_if (decode_if), - .decode_sched_if(decode_sched_if) - ); - - VX_issue #( - .CORE_ID (CORE_ID) - ) issue ( - `SCOPE_IO_BIND (1) - - .clk (clk), - .reset (issue_reset), - - `ifdef PERF_ENABLE - .perf_issue_if (pipeline_perf_if.issue), - `endif - - .decode_if (decode_if), - .writeback_if (writeback_if), - - .alu_dispatch_if(alu_dispatch_if), - .lsu_dispatch_if(lsu_dispatch_if), - `ifdef EXT_F_ENABLE - .fpu_dispatch_if(fpu_dispatch_if), - `endif - .sfu_dispatch_if(sfu_dispatch_if) - ); - - VX_execute #( - .CORE_ID (CORE_ID) - ) execute ( - `SCOPE_IO_BIND (2) - - .clk (clk), - .reset (execute_reset), - - .base_dcrs (base_dcrs), - - `ifdef 
PERF_ENABLE - .mem_perf_if (mem_perf_tmp_if), - .pipeline_perf_if(pipeline_perf_if), - `endif - - .dcache_bus_if (dcache_bus_tmp_if), - - `ifdef EXT_F_ENABLE - .fpu_dispatch_if(fpu_dispatch_if), - .fpu_commit_if (fpu_commit_if), - `endif - - .commit_csr_if (commit_csr_if), - .sched_csr_if (sched_csr_if), - - .alu_dispatch_if(alu_dispatch_if), - .lsu_dispatch_if(lsu_dispatch_if), - .sfu_dispatch_if(sfu_dispatch_if), - - .warp_ctl_if (warp_ctl_if), - .branch_ctl_if (branch_ctl_if), - - .alu_commit_if (alu_commit_if), - .lsu_commit_if (lsu_commit_if), - .sfu_commit_if (sfu_commit_if), - - .sim_ebreak (sim_ebreak) - ); - - VX_commit #( - .CORE_ID (CORE_ID) - ) commit ( - .clk (clk), - .reset (commit_reset), - - .alu_commit_if (alu_commit_if), - .lsu_commit_if (lsu_commit_if), - `ifdef EXT_F_ENABLE - .fpu_commit_if (fpu_commit_if), - `endif - .sfu_commit_if (sfu_commit_if), - - .writeback_if (writeback_if), - - .commit_csr_if (commit_csr_if), - .commit_sched_if(commit_sched_if), - - .sim_wb_value (sim_wb_value) - ); - -`ifdef SM_ENABLE - - VX_smem_unit #( - .CORE_ID (CORE_ID) - ) smem_unit ( - .clk (clk), - .reset (reset), - `ifdef PERF_ENABLE - .cache_perf (smem_perf), - `endif - .dcache_bus_in_if (dcache_bus_tmp_if), - .dcache_bus_out_if (dcache_bus_if) - ); - -`else - - for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin - `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]); - end - -`endif - -`ifdef PERF_ENABLE - - wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; - wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; - wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle; - - wire [1:0] perf_icache_pending_read_cycle; - wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle; - - reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; - reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; - - reg [`PERF_CTR_BITS-1:0] perf_ifetches; - reg [`PERF_CTR_BITS-1:0] perf_loads; - reg [`PERF_CTR_BITS-1:0] 
perf_stores; - - wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready; - wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; - - wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r; - wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r; - wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire; - - for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin - assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw; - assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw; - assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; - end - - `BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire); - `BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire); - - `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r); - `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r); - `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); - - assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire; - assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle; - - always @(posedge clk) begin - if (reset) begin - perf_icache_pending_reads <= '0; - perf_dcache_pending_reads <= '0; - end else begin - perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle)); - perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle)); - end - end - - reg [`PERF_CTR_BITS-1:0] perf_icache_lat; - reg [`PERF_CTR_BITS-1:0] perf_dcache_lat; - - always @(posedge clk) begin - if (reset) begin - perf_ifetches <= '0; - perf_loads <= '0; - perf_stores <= '0; - perf_icache_lat <= '0; - perf_dcache_lat <= '0; - end else 
begin - perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire); - perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle); - perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle); - perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads; - perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads; - end - end - - assign pipeline_perf_if.ifetches = perf_ifetches; - assign pipeline_perf_if.loads = perf_loads; - assign pipeline_perf_if.stores = perf_stores; - assign pipeline_perf_if.load_latency = perf_dcache_lat; - assign pipeline_perf_if.ifetch_latency = perf_icache_lat; - assign pipeline_perf_if.load_latency = perf_dcache_lat; - -`endif - -endmodule +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_define.vh" + +`ifdef EXT_F_ENABLE +`include "VX_fpu_define.vh" +`endif + +module VX_core import VX_gpu_pkg::*; #( + parameter CORE_ID = 0 +) ( + `SCOPE_IO_DECL + + // Clock + input wire clk, + input wire reset, + +`ifdef PERF_ENABLE + VX_mem_perf_if.slave mem_perf_if, +`endif + + VX_dcr_bus_if.slave dcr_bus_if, + + VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS], + + VX_mem_bus_if.master icache_bus_if, + +`ifdef GBAR_ENABLE + VX_gbar_bus_if.master gbar_bus_if, +`endif + + // simulation helper signals + output wire sim_ebreak, + output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value, + + // Status + output wire busy +); + VX_schedule_if schedule_if(); + VX_fetch_if fetch_if(); + VX_decode_if decode_if(); + VX_sched_csr_if sched_csr_if(); + VX_decode_sched_if decode_sched_if(); + VX_commit_sched_if commit_sched_if(); + VX_commit_csr_if commit_csr_if(); + VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS](); + VX_warp_ctl_if warp_ctl_if(); + + VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH](); + VX_commit_if alu_commit_if[`ISSUE_WIDTH](); + + VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH](); + VX_commit_if lsu_commit_if[`ISSUE_WIDTH](); +`ifdef EXT_F_ENABLE + VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH](); + VX_commit_if fpu_commit_if[`ISSUE_WIDTH](); +`endif + VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH](); + VX_commit_if sfu_commit_if[`ISSUE_WIDTH](); + + VX_writeback_if writeback_if[`ISSUE_WIDTH](); + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_TAG_WIDTH) + ) dcache_bus_tmp_if[DCACHE_NUM_REQS](); + +`ifdef PERF_ENABLE + VX_mem_perf_if mem_perf_tmp_if(); + VX_pipeline_perf_if pipeline_perf_if(); + + assign mem_perf_tmp_if.icache = mem_perf_if.icache; + assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; + assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; + assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; + assign mem_perf_tmp_if.mem = mem_perf_if.mem; +`endif + + `RESET_RELAY (dcr_data_reset, reset); + `RESET_RELAY 
(schedule_reset, reset); + `RESET_RELAY (fetch_reset, reset); + `RESET_RELAY (decode_reset, reset); + `RESET_RELAY (issue_reset, reset); + `RESET_RELAY (execute_reset, reset); + `RESET_RELAY (commit_reset, reset); + + base_dcrs_t base_dcrs; + + VX_dcr_data dcr_data ( + .clk (clk), + .reset (dcr_data_reset), + .dcr_bus_if (dcr_bus_if), + .base_dcrs (base_dcrs) + ); + + `SCOPE_IO_SWITCH (3) + + VX_schedule #( + .CORE_ID (CORE_ID) + ) schedule ( + .clk (clk), + .reset (schedule_reset), + + `ifdef PERF_ENABLE + .perf_schedule_if (pipeline_perf_if.schedule), + `endif + + .base_dcrs (base_dcrs), + + .warp_ctl_if (warp_ctl_if), + .branch_ctl_if (branch_ctl_if), + .decode_sched_if(decode_sched_if), + .commit_sched_if(commit_sched_if), + + .schedule_if (schedule_if), + `ifdef GBAR_ENABLE + .gbar_bus_if (gbar_bus_if), + `endif + .sched_csr_if (sched_csr_if), + + .busy (busy) + ); + + VX_fetch #( + .CORE_ID (CORE_ID) + ) fetch ( + `SCOPE_IO_BIND (0) + .clk (clk), + .reset (fetch_reset), + .icache_bus_if (icache_bus_if), + .schedule_if (schedule_if), + .fetch_if (fetch_if) + ); + + VX_decode #( + .CORE_ID (CORE_ID) + ) decode ( + .clk (clk), + .reset (decode_reset), + .fetch_if (fetch_if), + .decode_if (decode_if), + .decode_sched_if(decode_sched_if) + ); + + VX_issue #( + .CORE_ID (CORE_ID) + ) issue ( + `SCOPE_IO_BIND (1) + + .clk (clk), + .reset (issue_reset), + + `ifdef PERF_ENABLE + .perf_issue_if (pipeline_perf_if.issue), + `endif + + .decode_if (decode_if), + .writeback_if (writeback_if), + + .alu_dispatch_if(alu_dispatch_if), + .lsu_dispatch_if(lsu_dispatch_if), + `ifdef EXT_F_ENABLE + .fpu_dispatch_if(fpu_dispatch_if), + `endif + .sfu_dispatch_if(sfu_dispatch_if) + ); + + VX_execute #( + .CORE_ID (CORE_ID) + ) execute ( + `SCOPE_IO_BIND (2) + + .clk (clk), + .reset (execute_reset), + + .base_dcrs (base_dcrs), + + `ifdef PERF_ENABLE + .mem_perf_if (mem_perf_tmp_if), + .pipeline_perf_if(pipeline_perf_if), + `endif + + .dcache_bus_if (dcache_bus_tmp_if), + + `ifdef 
EXT_F_ENABLE + .fpu_dispatch_if(fpu_dispatch_if), + .fpu_commit_if (fpu_commit_if), + `endif + + .commit_csr_if (commit_csr_if), + .sched_csr_if (sched_csr_if), + + .alu_dispatch_if(alu_dispatch_if), + .lsu_dispatch_if(lsu_dispatch_if), + .sfu_dispatch_if(sfu_dispatch_if), + + .warp_ctl_if (warp_ctl_if), + .branch_ctl_if (branch_ctl_if), + + .alu_commit_if (alu_commit_if), + .lsu_commit_if (lsu_commit_if), + .sfu_commit_if (sfu_commit_if), + + .sim_ebreak (sim_ebreak) + ); + + VX_commit #( + .CORE_ID (CORE_ID) + ) commit ( + .clk (clk), + .reset (commit_reset), + + .alu_commit_if (alu_commit_if), + .lsu_commit_if (lsu_commit_if), + `ifdef EXT_F_ENABLE + .fpu_commit_if (fpu_commit_if), + `endif + .sfu_commit_if (sfu_commit_if), + + .writeback_if (writeback_if), + + .commit_csr_if (commit_csr_if), + .commit_sched_if(commit_sched_if), + + .sim_wb_value (sim_wb_value) + ); + +`ifdef SM_ENABLE + + VX_smem_unit #( + .CORE_ID (CORE_ID) + ) smem_unit ( + .clk (clk), + .reset (reset), + `ifdef PERF_ENABLE + .cache_perf (mem_perf_tmp_if.smem), + `endif + .dcache_bus_in_if (dcache_bus_tmp_if), + .dcache_bus_out_if (dcache_bus_if) + ); + +`else + + for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin + `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]); + end + +`endif + +`ifdef PERF_ENABLE + + wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; + wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; + wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle; + + wire [1:0] perf_icache_pending_read_cycle; + wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle; + + reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; + reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; + + reg [`PERF_CTR_BITS-1:0] perf_ifetches; + reg [`PERF_CTR_BITS-1:0] perf_loads; + reg [`PERF_CTR_BITS-1:0] perf_stores; + + wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready; + wire perf_icache_rsp_fire = 
icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; + + wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r; + wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r; + wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire; + + for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin + assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw; + assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw; + assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; + end + + `BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire); + `BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire); + + `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r); + `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r); + `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); + + assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire; + assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle; + + always @(posedge clk) begin + if (reset) begin + perf_icache_pending_reads <= '0; + perf_dcache_pending_reads <= '0; + end else begin + perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle)); + perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle)); + end + end + + reg [`PERF_CTR_BITS-1:0] perf_icache_lat; + reg [`PERF_CTR_BITS-1:0] perf_dcache_lat; + + always @(posedge clk) begin + if (reset) begin + perf_ifetches <= '0; + perf_loads <= '0; + perf_stores <= '0; + perf_icache_lat <= '0; + perf_dcache_lat <= '0; + end else begin + perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire); + perf_loads <= perf_loads + 
`PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle); + perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle); + perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads; + perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads; + end + end + + assign pipeline_perf_if.ifetches = perf_ifetches; + assign pipeline_perf_if.loads = perf_loads; + assign pipeline_perf_if.stores = perf_stores; + assign pipeline_perf_if.load_latency = perf_dcache_lat; + assign pipeline_perf_if.ifetch_latency = perf_icache_lat; + assign pipeline_perf_if.load_latency = perf_dcache_lat; + +`endif + +endmodule diff --git a/hw/rtl/core/VX_core_top.sv b/hw/rtl/core/VX_core_top.sv index 6ecd4772..83318086 100644 --- a/hw/rtl/core/VX_core_top.sv +++ b/hw/rtl/core/VX_core_top.sv @@ -129,12 +129,12 @@ module VX_core_top import VX_gpu_pkg::*; #( assign icache_rsp_ready = icache_bus_if.rsp_ready; `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_if(); - assign mem_perf_if.smem = '0; + VX_mem_perf_if mem_perf_if(); assign mem_perf_if.icache = '0; assign mem_perf_if.dcache = '0; assign mem_perf_if.l2cache = '0; - assign mem_perf_if.l3cache = '0; + assign mem_perf_if.l3cache = '0; + assign mem_perf_if.smem = '0; assign mem_perf_if.mem = '0; `endif diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 1b370260..b1e68437 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -33,7 +33,6 @@ import VX_fpu_pkg::*; `ifdef PERF_ENABLE VX_mem_perf_if.slave mem_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if, - VX_sfu_perf_if.slave sfu_perf_if, `endif VX_commit_csr_if.slave commit_csr_if, @@ -187,103 +186,107 @@ import VX_fpu_pkg::*; `VX_DCR_MPM_CLASS_CORE: begin case (read_addr) // PERF: pipeline - `VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0]; - `VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCHED_ST : read_data_ro_r = 
pipeline_perf_if.sched_stalls[31:0]; - `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; - `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; - `VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0]; + `VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0]; + `VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0]; + `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; + `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; + `VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0]; + `VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); `ifdef EXT_F_ENABLE - `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0]; + `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0]; + `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]); `else - 
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0; - `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0; + `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0; + `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0; `endif - `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0]; - `VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; + `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0]; + `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0]; + `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0]; + `VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0]; + `VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]); // PERF: memory - `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; - `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0]; - `VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0]; - `VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; - `VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 
32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; - `VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; + `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0]; + `VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0]; + `VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; + `VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; + `VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); default:; endcase end `VX_DCR_MPM_CLASS_MEM: begin case (read_addr) // PERF: icache - `VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0]; - `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0]; - `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0]; - `VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0]; + `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = 
mem_perf_if.icache.read_misses[31:0]; + `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0]; + `VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: dcache - `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; - `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0]; - `VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0]; - `VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0]; - `VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0]; - `VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0]; - `VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; + `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0]; + `VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0]; + `VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 
32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0]; + `VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0]; + `VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0]; + `VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: smem - `VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0]; - `VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0]; - `VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0]; - `VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0]; + `VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0]; + `VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0]; + `VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]); // PERF: l2cache - `VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0]; - `VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0]; - 
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0]; - `VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0]; - `VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0]; - `VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0]; - `VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0]; + `VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0]; + `VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0]; + `VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0]; + `VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0]; + `VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0]; + `VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 
32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: l3cache - `VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0]; - `VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0]; - `VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0]; - `VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0]; - `VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0]; - `VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0]; - `VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0]; + `VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0]; + `VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0]; + `VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0]; + `VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]); + 
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0]; + `VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0]; + `VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: memory - `VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0]; - `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0]; - `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0]; - `VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0]; + `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0]; + `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0]; + `VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]); default:; endcase end @@ -303,8 +306,6 @@ import VX_fpu_pkg::*; `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid)) `ifdef PERF_ENABLE - wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls; - `UNUSED_VAR (perf_wctl_stalls); `UNUSED_VAR (mem_perf_if.icache); `UNUSED_VAR (mem_perf_if.smem); `endif diff --git a/hw/rtl/core/VX_csr_unit.sv b/hw/rtl/core/VX_csr_unit.sv index 14b633fa..91cb37ab 100644 --- a/hw/rtl/core/VX_csr_unit.sv +++ b/hw/rtl/core/VX_csr_unit.sv @@ -25,7 +25,6 @@ module 
VX_csr_unit import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE VX_mem_perf_if.slave mem_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if, - VX_sfu_perf_if.slave sfu_perf_if, `endif `ifdef EXT_F_ENABLE @@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE .mem_perf_if (mem_perf_if), .pipeline_perf_if(pipeline_perf_if), - .sfu_perf_if (sfu_perf_if), `endif .commit_csr_if (commit_csr_if), diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 912abc97..1ba4ca28 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -61,7 +61,8 @@ module VX_issue #( .reset (scoreboard_reset), `ifdef PERF_ENABLE .perf_scb_stalls(perf_issue_if.scb_stalls), - .perf_scb_uses (perf_issue_if.scb_uses), + .perf_units_uses(perf_issue_if.units_uses), + .perf_sfu_uses (perf_issue_if.sfu_uses), `endif .writeback_if (writeback_if), .ibuffer_if (ibuffer_if), diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 1c5f3676..a4792c8d 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -21,7 +21,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls, - output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS], + output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS], + output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS], `endif VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], @@ -32,21 +33,66 @@ module VX_scoreboard import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; `ifdef PERF_ENABLE - wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle; - wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle; - reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_uses_per_cycle; - wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle; + reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle; + wire 
[`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r; - `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); + reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle; + wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r; + + wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle; + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r; + + `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); VX_reduce #( .DATAW_IN (`NUM_EX_UNITS), .N (`ISSUE_WIDTH), .OP ("|") - ) reduce ( - .data_in (perf_issue_uses_per_cycle), - .data_out (perf_uses_per_cycle) + ) perf_units_reduce ( + .data_in (perf_issue_units_per_cycle), + .data_out (perf_units_per_cycle) + ); + + VX_reduce #( + .DATAW_IN (`NUM_SFU_UNITS), + .N (`ISSUE_WIDTH), + .OP ("|") + ) perf_sfu_reduce ( + .data_in (perf_issue_sfu_per_cycle), + .data_out (perf_sfu_per_cycle) ); + + `BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle); + `BUFFER(perf_units_per_cycle_r, perf_units_per_cycle); + `BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle); + + always @(posedge clk) begin + if (reset) begin + perf_scb_stalls <= '0; + end else begin + perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r); + end + end + + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin + always @(posedge clk) begin + if (reset) begin + perf_units_uses[i] <= '0; + end else begin + perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]); + end + end + end + + for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin + always @(posedge clk) begin + if (reset) begin + perf_sfu_uses[i] <= '0; + end else begin + perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]); + end + end + end `endif for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin @@ -60,21 +106,46 @@ module VX_scoreboard import VX_gpu_pkg::*; #( wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]; `ifdef PERF_ENABLE - 
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units; + reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units; + reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu; + + reg [`SFU_WIDTH-1:0] sfu_type; always @(*) begin - perf_issue_uses_per_cycle[i] = '0; + case (scoreboard_if[i].data.op_type) + `INST_SFU_CSRRW, + `INST_SFU_CSRRS, + `INST_SFU_CSRRC: sfu_type = `SFU_CSRS; + default: sfu_type = `SFU_WCTL; + endcase + end + + always @(*) begin + perf_issue_units_per_cycle[i] = '0; + perf_issue_sfu_per_cycle[i] = '0; if (ibuffer_if[i].valid) begin if (inuse_rd) begin - perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; + perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; + end end if (inuse_rs1) begin - perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; + perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; + end end if (inuse_rs2) begin - perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; + perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; + end end if (inuse_rs3) begin - perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; + 
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; + end end end end @@ -97,8 +168,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #( always @(posedge clk) begin if (reset) begin valid_out_r <= 0; - inuse_regs <= '0; - end else begin + inuse_regs <= '0; + end else begin if (writeback_fire) begin inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; end @@ -109,6 +180,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1; `ifdef PERF_ENABLE inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type; + if (scoreboard_if[i].data.ex_type == `EX_SFU) begin + inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type; + end `endif end valid_out_r <= 0; @@ -141,7 +215,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( timeout_ctr <= '0; end end - end + end `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), ("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", @@ -153,32 +227,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #( $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid)); `endif - end - -`ifdef PERF_ENABLE - wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle_r; - wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle_r; - - `BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle); - `BUFFER(perf_uses_per_cycle_r, perf_uses_per_cycle); - - always @(posedge clk) begin - if (reset) begin - perf_scb_stalls <= '0; - end else begin - perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r); - end end - for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin - always 
@(posedge clk) begin - if (reset) begin - perf_scb_uses[i] <= '0; - end else begin - perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(perf_uses_per_cycle_r[i]); - end - end - end -`endif - endmodule diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index b531e75b..6fb2cb9f 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1; localparam RSP_ARB_SIZE = 1 + 1; localparam RSP_ARB_IDX_WCTL = 0; - localparam RSP_ARB_IDX_CSR = 1; + localparam RSP_ARB_IDX_CSRS = 1; VX_execute_if #( .NUM_LANES (NUM_LANES) @@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in; wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in; -`ifdef PERF_ENABLE - VX_sfu_perf_if sfu_perf_if(); -`endif // Warp control block VX_execute_if #( @@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE .mem_perf_if (mem_perf_if), .pipeline_perf_if(pipeline_perf_if), - .sfu_perf_if (sfu_perf_if), `endif `ifdef EXT_F_ENABLE @@ -141,21 +137,21 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( .commit_if (csr_commit_if) ); - assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid; - assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data; - assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR]; + assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid; + assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data; + assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS]; // can accept new request? 
reg sfu_req_ready; always @(*) begin case (execute_if[0].data.op_type) - `INST_SFU_CSRRW, - `INST_SFU_CSRRS, - `INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready; + `INST_SFU_CSRRW, + `INST_SFU_CSRRS, + `INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready; default: sfu_req_ready = wctl_execute_if.ready; endcase - end + end assign execute_if[0].ready = sfu_req_ready; // response arbitration @@ -194,19 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( .commit_out_if (commit_if) ); -`ifdef PERF_ENABLE - reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls; - - wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready; - - always @(posedge clk) begin - if (reset) begin - perf_wctl_stalls <= '0; - end else begin - perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall); - end - end - assign sfu_perf_if.wctl_stalls = perf_wctl_stalls; -`endif - endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 2ae0f678..7d421875 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -18,7 +18,8 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] sched_stalls; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; - wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; + wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS]; + wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS]; wire [`PERF_CTR_BITS-1:0] ifetches; wire [`PERF_CTR_BITS-1:0] loads; @@ -34,7 +35,8 @@ interface VX_pipeline_perf_if (); modport issue ( output ibf_stalls, output scb_stalls, - output scb_uses + output units_uses, + output sfu_uses ); modport slave ( @@ -42,7 +44,8 @@ interface VX_pipeline_perf_if (); input sched_stalls, input ibf_stalls, input scb_stalls, - input scb_uses, + input units_uses, + input sfu_uses, input ifetches, input loads, input stores, diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index 5f472c84..104a2795 
100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -208,6 +208,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t scrb_fpu = 0; uint64_t scrb_lsu = 0; uint64_t scrb_sfu = 0; + uint64_t scrb_wctl = 0; + uint64_t scrb_csrs = 0; uint64_t ifetches = 0; uint64_t loads = 0; uint64_t stores = 0; @@ -268,44 +270,69 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { // PERF: pipeline // scheduler idles { - uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID); - int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core); + uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID); + if (num_cores > 1) { + int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core); + fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core); + } sched_idles += sched_idles_per_core; } // scheduler stalls { - uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); - int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core); + uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); + if (num_cores > 1) { + int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core); + fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core); + } sched_stalls += sched_stalls_per_core; } // ibuffer_stalls { - uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); - int ibuffer_percent_per_core = 
calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core); + uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); + if (num_cores > 1) { + int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core); + fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core); + } ibuffer_stalls += ibuffer_stalls_per_core; } - // scrb_stalls + // issue_stalls { uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU); uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU); uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU); - uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); - uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core; + uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); scrb_alu += scrb_alu_per_core; scrb_fpu += scrb_fpu_per_core; scrb_lsu += scrb_lsu_per_core; scrb_sfu += scrb_sfu_per_core; - if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, + if (num_cores > 1) { + uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core; + fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, calcAvgPercent(scrb_alu_per_core, scrb_total), calcAvgPercent(scrb_fpu_per_core, scrb_total), calcAvgPercent(scrb_lsu_per_core, scrb_total), calcAvgPercent(scrb_sfu_per_core, scrb_total)); + } scrb_stalls += scrb_stalls_per_core; } + // sfu_stalls + { + 
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); + uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL); + uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS); + if (num_cores > 1) { + uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core; + fprintf(stream, "PERF: core%d: sfu stalls=%ld (csrs=%d%%, wctl=%d%%)\n" + , core_id + , scrb_sfu_per_core + , calcAvgPercent(scrb_csrs_per_core, sfu_total) + , calcAvgPercent(scrb_wctl_per_core, sfu_total) + ); + } + scrb_wctl += scrb_wctl_per_core; + scrb_csrs += scrb_csrs_per_core; + } // PERF: memory // ifetches { @@ -313,9 +340,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); ifetches += ifetches_per_core; - uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT); - int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); + uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT); + if (num_cores > 1) { + int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core); + fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); + } ifetch_lat += ifetch_lat_per_core; } // loads @@ -324,9 +353,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); loads += loads_per_core; - uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT); - int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); + uint64_t load_lat_per_core =
get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT); + if (num_cores > 1) { + int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core); + fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); + } load_lat += load_lat_per_core; } // stores @@ -428,14 +459,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches)); int load_avg_lat = (int)(double(load_lat) / double(loads)); uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu; - fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent); + uint64_t sfu_total = scrb_wctl + scrb_csrs; + fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent); fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent); fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); - fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, + fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, calcAvgPercent(scrb_alu, scrb_total), calcAvgPercent(scrb_fpu, scrb_total), calcAvgPercent(scrb_lsu, scrb_total), - calcAvgPercent(scrb_sfu, scrb_total)); + calcAvgPercent(scrb_sfu, scrb_total)); + fprintf(stream, "PERF: sfu stalls=%ld (csrs=%d%%, wctl=%d%%)\n" + , scrb_sfu + , calcAvgPercent(scrb_csrs, sfu_total) + , calcAvgPercent(scrb_wctl, sfu_total) + ); fprintf(stream, "PERF: ifetches=%ld\n", ifetches); fprintf(stream, "PERF: loads=%ld\n", loads); fprintf(stream, "PERF: stores=%ld\n", stores); diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index 7f690fb6..3ac80cb6 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -18,20 +18,20 @@ using namespace vortex; Cluster::Cluster(const SimContext& ctx, uint32_t cluster_id, ProcessorImpl* processor, - const Arch &arch, const - DCRS &dcrs) + const Arch &arch, +
const DCRS &dcrs) : SimObject(ctx, "cluster") , mem_req_port(this) , mem_rsp_port(this) , cluster_id_(cluster_id) - , sockets_(NUM_SOCKETS) - , barriers_(arch.num_barriers(), 0) , processor_(processor) + , sockets_(NUM_SOCKETS) + , barriers_(arch.num_barriers(), 0) , cores_per_socket_(arch.socket_size()) { char sname[100]; - auto sockets_per_cluster = sockets_.size(); + uint32_t sockets_per_cluster = sockets_.size(); // create sockets @@ -43,7 +43,10 @@ Cluster::Cluster(const SimContext& ctx, for (uint32_t i = 0; i < sockets_per_cluster; ++i) { uint32_t socket_id = cluster_id * sockets_per_cluster + i; - auto socket = Socket::Create(socket_id, this, arch, dcrs); + auto socket = Socket::Create(socket_id, + this, + arch, + dcrs); socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i)); icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port); @@ -154,7 +157,7 @@ void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) { } Cluster::PerfStats Cluster::perf_stats() const { - Cluster::PerfStats perf; - perf.l2cache = l2cache_->perf_stats(); - return perf; + PerfStats perf_stats; + perf_stats.l2cache = l2cache_->perf_stats(); + return perf_stats; } \ No newline at end of file diff --git a/sim/simx/cluster.h b/sim/simx/cluster.h index 2547486d..81e93e10 100644 --- a/sim/simx/cluster.h +++ b/sim/simx/cluster.h @@ -17,6 +17,7 @@ #include "dcrs.h" #include "arch.h" #include "cache_cluster.h" +#include "shared_mem.h" #include "core.h" #include "socket.h" #include "constants.h" @@ -27,13 +28,8 @@ class ProcessorImpl; class Cluster : public SimObject { public: - struct PerfStats { + struct PerfStats { CacheSim::PerfStats l2cache; - - PerfStats& operator+=(const PerfStats& rhs) { - this->l2cache += rhs.l2cache; - return *this; - } }; SimPort mem_req_port; @@ -67,15 +63,15 @@ public: void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); - Cluster::PerfStats perf_stats() const; + PerfStats perf_stats() const; private: - uint32_t cluster_id_; - 
std::vector sockets_; - std::vector barriers_; - CacheSim::Ptr l2cache_; - ProcessorImpl* processor_; - uint32_t cores_per_socket_; + uint32_t cluster_id_; + ProcessorImpl* processor_; + std::vector sockets_; + std::vector barriers_; + CacheSim::Ptr l2cache_; + uint32_t cores_per_socket_; }; } // namespace vortex \ No newline at end of file diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 7a549ebd..1c155011 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -28,13 +28,18 @@ using namespace vortex; -Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs) +Core::Core(const SimContext& ctx, + uint32_t core_id, + Socket* socket, + const Arch &arch, + const DCRS &dcrs) : SimObject(ctx, "core") , icache_req_ports(1, this) , icache_rsp_ports(1, this) , dcache_req_ports(NUM_LSU_LANES, this) , dcache_rsp_ports(NUM_LSU_LANES, this) , core_id_(core_id) + , socket_(socket) , arch_(arch) , dcrs_(dcrs) , decoder_(arch) @@ -42,7 +47,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch & , barriers_(arch.num_barriers(), 0) , fcsrs_(arch.num_warps(), 0) , ibuffers_(arch.num_warps(), IBUF_SIZE) - , scoreboard_(arch_) + , scoreboard_(arch_) , operands_(ISSUE_WIDTH) , dispatchers_((uint32_t)ExeType::ExeTypeCount) , exe_units_((uint32_t)ExeType::ExeTypeCount) @@ -50,8 +55,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch & , fetch_latch_("fetch") , decode_latch_("decode") , pending_icache_(arch_.num_warps()) - , csrs_(arch.num_warps()) - , socket_(socket) + , csrs_(arch.num_warps()) , commit_arbs_(ISSUE_WIDTH) { char sname[100]; @@ -69,6 +73,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch & } // initialize shared memory + snprintf(sname, 100, "core%d-shared_mem", core_id); shared_mem_ = SharedMem::Create(sname, SharedMem::Config{ (1 << SMEM_LOG_SIZE), sizeof(Word), @@ -77,17 +82,17 @@ Core::Core(const SimContext& ctx, 
uint32_t core_id, Socket* socket, const Arch & false }); for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) { - snprintf(sname, 100, "smem_demux%d_%d", core_id, i); - auto smem_demux = SMemDemux::Create(sname); - - smem_demux->ReqDC.bind(&dcache_req_ports.at(i)); - dcache_rsp_ports.at(i).bind(&smem_demux->RspDC); + snprintf(sname, 100, "core%d-smem_demux%d", core_id, i); + auto smem_demux = SMemDemux::Create(sname); + + smem_demux->ReqDC.bind(&dcache_req_ports.at(i)); + dcache_rsp_ports.at(i).bind(&smem_demux->RspDC); - smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i)); - shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM); + smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i)); + shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM); - smem_demuxs_.at(i) = smem_demux; - } + smem_demuxs_.at(i) = smem_demux; + } // initialize dispatchers dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES); @@ -103,7 +108,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch & // bind commit arbiters for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { - snprintf(sname, 100, "commit-arb%d", i); + snprintf(sname, 100, "core%d-commit-arb%d", core_id, i); auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1); for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) { exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j)); @@ -128,7 +133,7 @@ void Core::reset() { for (auto& exe_unit : exe_units_) { exe_unit->reset(); } - + for (auto& commit_arb : commit_arbs_) { commit_arb->reset(); } @@ -184,7 +189,7 @@ void Core::schedule() { } } if (scheduled_warp == -1) { - ++perf_stats_.sched_idles; + ++perf_stats_.sched_idle; return; } @@ -229,7 +234,7 @@ void Core::fetch() { mem_req.uuid = trace->uuid; icache_req_ports.at(0).send(mem_req, 2); DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); - 
fetch_latch_.pop(); + fetch_latch_.pop(); ++perf_stats_.ifetches; ++pending_ifetches_; } @@ -311,7 +316,21 @@ void Core::issue() { case ExeType::ALU: ++perf_stats_.scrb_alu; break; case ExeType::FPU: ++perf_stats_.scrb_fpu; break; case ExeType::LSU: ++perf_stats_.scrb_lsu; break; - case ExeType::SFU: ++perf_stats_.scrb_sfu; break; + case ExeType::SFU: { + ++perf_stats_.scrb_sfu; + switch (use.sfu_type) { + case SfuType::TMC: + case SfuType::WSPAWN: + case SfuType::SPLIT: + case SfuType::JOIN: + case SfuType::BAR: + case SfuType::PRED: ++perf_stats_.scrb_wctl; break; + case SfuType::CSRRW: + case SfuType::CSRRS: + case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break; + default: assert(false); + } + } break; default: assert(false); } } @@ -356,7 +375,6 @@ void Core::commit() { auto& commit_arb = commit_arbs_.at(i); if (commit_arb->Outputs.at(0).empty()) continue; - auto trace = commit_arb->Outputs.at(0).front(); // advance to commit stage @@ -558,8 +576,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { break; case VX_DCR_MPM_CLASS_CORE: { switch (addr) { - case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff; - case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32; + case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff; + case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32; case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff; case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; @@ -574,6 +592,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32; case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff; case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32; + case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff; + case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32; + 
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff; + case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32; case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff; case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff; @@ -588,6 +610,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { } break; case VX_DCR_MPM_CLASS_MEM: { auto proc_perf = socket_->cluster()->processor()->perf_stats(); + auto cluster_perf = socket_->cluster()->perf_stats(); auto socket_perf = socket_->perf_stats(); auto smem_perf = shared_mem_->perf_stats(); switch (addr) { @@ -611,18 +634,18 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff; case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32; - case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff; - case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32; - case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff; - case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32; - case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff; - case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32; - case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff; - case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32; - case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff; - case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32; - case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff; - case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return 
proc_perf.clusters.l2cache.mshr_stalls >> 32; + case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff; + case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32; + case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff; + case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32; + case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff; + case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32; + case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff; + case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32; + case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff; + case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32; + case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32; case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff; case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32; @@ -638,7 +661,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32; case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff; - case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; + case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32; case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff; @@ -652,6 +675,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32; } } break; + default: { + std::cout << std::dec << "Error: invalid MPM CLASS: 
value=" << perf_class << std::endl; + std::abort(); + } break; } } else { std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; diff --git a/sim/simx/core.h b/sim/simx/core.h index 343fdb31..0ccb5d02 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -49,7 +49,7 @@ public: struct PerfStats { uint64_t cycles; uint64_t instrs; - uint64_t sched_idles; + uint64_t sched_idle; uint64_t sched_stalls; uint64_t ibuf_stalls; uint64_t scrb_stalls; @@ -57,6 +57,8 @@ public: uint64_t scrb_fpu; uint64_t scrb_lsu; uint64_t scrb_sfu; + uint64_t scrb_wctl; + uint64_t scrb_csrs; uint64_t ifetches; uint64_t loads; uint64_t stores; @@ -66,7 +68,7 @@ public: PerfStats() : cycles(0) , instrs(0) - , sched_idles(0) + , sched_idle(0) , sched_stalls(0) , ibuf_stalls(0) , scrb_stalls(0) @@ -74,6 +76,8 @@ public: , scrb_fpu(0) , scrb_lsu(0) , scrb_sfu(0) + , scrb_wctl(0) + , scrb_csrs(0) , ifetches(0) , loads(0) , stores(0) @@ -88,7 +92,11 @@ public: std::vector> dcache_req_ports; std::vector> dcache_rsp_ports; - Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs); + Core(const SimContext& ctx, + uint32_t core_id, + Socket* socket, + const Arch &arch, + const DCRS &dcrs); ~Core(); @@ -158,6 +166,7 @@ private: void cout_flush(); uint32_t core_id_; + Socket* socket_; const Arch& arch_; const DCRS &dcrs_; @@ -193,10 +202,9 @@ private: PerfStats perf_stats_; - Socket* socket_; - std::vector commit_arbs_; + uint32_t commit_exe_; uint32_t ibuffer_idx_; friend class Warp; diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index 77021dbd..8e8c1062 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -113,6 +113,7 @@ void ProcessorImpl::reset() { perf_mem_writes_ = 0; perf_mem_latency_ = 0; perf_mem_pending_reads_ = 0; + } void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) { @@ -125,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { perf.mem_writes = perf_mem_writes_; 
perf.mem_latency = perf_mem_latency_; perf.l3cache = l3cache_->perf_stats(); - for (auto cluster : clusters_) { - perf.clusters += cluster->perf_stats(); - } return perf; } diff --git a/sim/simx/processor_impl.h b/sim/simx/processor_impl.h index 02d92e95..072247a7 100644 --- a/sim/simx/processor_impl.h +++ b/sim/simx/processor_impl.h @@ -24,17 +24,10 @@ namespace vortex { class ProcessorImpl { public: struct PerfStats { + CacheSim::PerfStats l3cache; uint64_t mem_reads; uint64_t mem_writes; uint64_t mem_latency; - CacheSim::PerfStats l3cache; - Cluster::PerfStats clusters; - - PerfStats() - : mem_reads(0) - , mem_writes(0) - , mem_latency(0) - {} }; ProcessorImpl(const Arch& arch); @@ -46,7 +39,7 @@ public: void write_dcr(uint32_t addr, uint32_t value); - ProcessorImpl::PerfStats perf_stats() const; + PerfStats perf_stats() const; private: @@ -55,7 +48,7 @@ private: const Arch& arch_; std::vector> clusters_; DCRS dcrs_; - MemSim::Ptr memsim_; + MemSim::Ptr memsim_; CacheSim::Ptr l3cache_; uint64_t perf_mem_reads_; uint64_t perf_mem_writes_; diff --git a/sim/simx/scoreboard.h b/sim/simx/scoreboard.h index 5c247b73..58dbc2fb 100644 --- a/sim/simx/scoreboard.h +++ b/sim/simx/scoreboard.h @@ -25,6 +25,7 @@ public: RegType reg_type; uint32_t reg_id; ExeType exe_type; + SfuType sfu_type; uint64_t uuid; }; @@ -62,7 +63,7 @@ public: if (used_iregs.test(r)) { uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer; auto owner = owners_.at(tag); - out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid}); + out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid}); } } @@ -70,7 +71,7 @@ public: if (used_fregs.test(r)) { uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float; auto owner = owners_.at(tag); - out.push_back({RegType::Float, r, owner->exe_type, owner->uuid}); + out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid}); } } @@ -78,7 +79,7 @@ public: if (used_vregs.test(r)) { 
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector; auto owner = owners_.at(tag); - out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid}); + out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid}); } } diff --git a/sim/simx/socket.cpp b/sim/simx/socket.cpp index fb620d62..dd9f9697 100644 --- a/sim/simx/socket.cpp +++ b/sim/simx/socket.cpp @@ -19,16 +19,16 @@ using namespace vortex; Socket::Socket(const SimContext& ctx, uint32_t socket_id, Cluster* cluster, - const Arch &arch, const - DCRS &dcrs) + const Arch &arch, + const DCRS &dcrs) : SimObject(ctx, "socket") , icache_mem_req_port(this) , icache_mem_rsp_port(this) , dcache_mem_req_port(this) , dcache_mem_rsp_port(this) , socket_id_(socket_id) - , cores_(arch.socket_size()) , cluster_(cluster) + , cores_(arch.socket_size()) { auto cores_per_socket = cores_.size(); @@ -77,7 +77,10 @@ Socket::Socket(const SimContext& ctx, for (uint32_t i = 0; i < cores_per_socket; ++i) { uint32_t core_id = socket_id * cores_per_socket + i; - cores_.at(i) = Core::Create(core_id, this, arch, dcrs); + cores_.at(i) = Core::Create(core_id, + this, + arch, + dcrs); cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); @@ -139,8 +142,8 @@ void Socket::resume(uint32_t core_index) { } Socket::PerfStats Socket::perf_stats() const { - Socket::PerfStats perf; - perf.icache = icaches_->perf_stats(); - perf.dcache = dcaches_->perf_stats(); - return perf; + PerfStats perf_stats; + perf_stats.icache = icaches_->perf_stats(); + perf_stats.dcache = dcaches_->perf_stats(); + return perf_stats; } \ No newline at end of file diff --git a/sim/simx/socket.h b/sim/simx/socket.h index 5c94c31f..5105f99e 100644 --- a/sim/simx/socket.h +++ b/sim/simx/socket.h @@ -30,12 +30,6 @@ public: struct PerfStats { CacheSim::PerfStats icache; CacheSim::PerfStats dcache; - - PerfStats& operator+=(const 
PerfStats& rhs) { - this->icache += rhs.icache; - this->dcache += rhs.dcache; - return *this; - } }; SimPort icache_mem_req_port; @@ -74,14 +68,14 @@ public: void resume(uint32_t core_id); - Socket::PerfStats perf_stats() const; + PerfStats perf_stats() const; private: - uint32_t socket_id_; + uint32_t socket_id_; + Cluster* cluster_; std::vector cores_; CacheCluster::Ptr icaches_; CacheCluster::Ptr dcaches_; - Cluster* cluster_; }; } // namespace vortex \ No newline at end of file diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index 5d18f9cd..92df373f 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -15,10 +15,10 @@ all: $(MAKE) -C blackscholes $(MAKE) -C transpose $(MAKE) -C convolution -# $(MAKE) -C cutcp -# $(MAKE) -C sgemm2 -# $(MAKE) -C vectorhypot -# $(MAKE) -C mri-q run-simx + $(MAKE) -C cutcp + $(MAKE) -C sgemm2 + $(MAKE) -C vectorhypot + $(MAKE) -C mri-q run-simx run-simx: $(MAKE) -C vecadd run-simx @@ -125,7 +125,7 @@ clean-all: $(MAKE) -C oclprintf clean-all $(MAKE) -C blackscholes clean-all $(MAKE) -C convolution clean-all -# $(MAKE) -C cutcp clean-all -# $(MAKE) -C sgemm2 clean-all -# $(MAKE) -C vectorhypot clean-all -# $(MAKE) -C mri-q clean-all + $(MAKE) -C cutcp clean-all + $(MAKE) -C sgemm2 clean-all + $(MAKE) -C vectorhypot clean-all + $(MAKE) -C mri-q clean-all From 36f5dd87fe4fcd34e1339df7eeb595ccbec82e09 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 28 Dec 2023 12:22:22 -0800 Subject: [PATCH 4/4] minor update --- tests/opencl/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index 92df373f..5d18f9cd 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -15,10 +15,10 @@ all: $(MAKE) -C blackscholes $(MAKE) -C transpose $(MAKE) -C convolution - $(MAKE) -C cutcp - $(MAKE) -C sgemm2 - $(MAKE) -C vectorhypot - $(MAKE) -C mri-q run-simx +# $(MAKE) -C cutcp +# $(MAKE) -C sgemm2 +# $(MAKE) -C vectorhypot +# $(MAKE) 
-C mri-q run-simx run-simx: $(MAKE) -C vecadd run-simx @@ -125,7 +125,7 @@ clean-all: $(MAKE) -C oclprintf clean-all $(MAKE) -C blackscholes clean-all $(MAKE) -C convolution clean-all - $(MAKE) -C cutcp clean-all - $(MAKE) -C sgemm2 clean-all - $(MAKE) -C vectorhypot clean-all - $(MAKE) -C mri-q clean-all +# $(MAKE) -C cutcp clean-all +# $(MAKE) -C sgemm2 clean-all +# $(MAKE) -C vectorhypot clean-all +# $(MAKE) -C mri-q clean-all