diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index c7b2a2c3..d57de0c3 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -436,7 +436,7 @@ // Number of Banks `ifndef DCACHE_NUM_BANKS -`define DCACHE_NUM_BANKS (`NUM_LSU_LANES) +`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4) `endif // Core Response Queue Size diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv index efd719be..087c4080 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -174,30 +174,38 @@ module VX_dispatch import VX_gpu_pkg::*; #( || (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU)); end -`ifdef PERF_ENABLE - reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r; - wire [`ISSUE_WIDTH-1:0] operands_stall; - wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type; +`ifdef PERF_ENABLE + wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r; + reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle; + reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r; for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin - assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready; - assign operands_ex_type[i] = operands_if[i].data.ex_type; - end - - always @(*) begin - perf_stalls_n = perf_stalls_r; - for (integer i=0; i < `ISSUE_WIDTH; ++i) begin - if (operands_stall[i]) begin - perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1); + always @(*) begin + perf_issue_unit_stalls_per_cycle[i] = '0; + if (operands_if[i].valid && ~operands_if[i].ready) begin + perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1; end end end - always @(posedge clk) begin - if (reset) begin - perf_stalls_r <= '0; - end else begin - perf_stalls_r <= perf_stalls_n; + VX_reduce #( + .DATAW_IN (`NUM_EX_UNITS), + .N (`ISSUE_WIDTH), + .OP ("|") + ) reduce ( + .data_in (perf_issue_unit_stalls_per_cycle), + .data_out (perf_unit_stalls_per_cycle) + ); + + `BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle); + + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin + always @(posedge clk) begin + if (reset) begin + perf_stalls_r[i] <= '0; + end else begin + perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]); + end end end diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index c1d09c07..2206df25 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -32,19 +32,20 @@ module VX_scoreboard import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; `ifdef PERF_ENABLE - wire [`NUM_EX_UNITS-1:0] scoreboard_uses_per_cycle; - wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle; - reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] scoreboard_uses; - wire [`ISSUE_WIDTH-1:0] scoreboard_stalls; + wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle; + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle; + reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_uses_per_cycle; + wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle; + + `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); - `POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls); VX_reduce #( .DATAW_IN (`NUM_EX_UNITS), .N (`ISSUE_WIDTH), .OP ("|") ) reduce ( - .data_in (scoreboard_uses), - .data_out (scoreboard_uses_per_cycle) + .data_in (perf_issue_uses_per_cycle), + .data_out (perf_uses_per_cycle) ); `endif @@ -62,23 +63,23 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units; always @(*) begin - scoreboard_uses[i] = '0; + perf_issue_uses_per_cycle[i] = '0; if (ibuffer_if[i].valid) begin if (inuse_rd) begin - scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; + perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; end if (inuse_rs1) begin - scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; + perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; end if (inuse_rs2) begin - scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; + perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; end if (inuse_rs3) begin - scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; + perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; end end end - assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; + assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; `endif reg [DATAW-1:0] data_out_r; @@ -164,19 +165,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end `ifdef PERF_ENABLE + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle_r; + wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle_r; + + `BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle); + `BUFFER(perf_uses_per_cycle_r, perf_uses_per_cycle); + always @(posedge clk) begin if (reset) begin - perf_scb_stalls <= '0; + perf_scb_stalls <= '0; end else begin - perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle); + perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r); end end + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin always @(posedge clk) begin if (reset) begin perf_scb_uses[i] <= '0; end else begin - perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(scoreboard_uses_per_cycle[i]); + perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(perf_uses_per_cycle_r[i]); end end end diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index 7c1f0f7a..fbae5c7a 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -183,21 +183,23 @@ module VX_stream_xbar #( per_cycle_collision = 0; for (integer i = 0; i < NUM_INPUTS; ++i) begin for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin - if (valid_in[i] && valid_in[j+i] && sel_in[i] == sel_in[j+i]) begin - per_cycle_collision[i] |= ready_in[i] | ready_in[j+i]; - end + per_cycle_collision[i] |= valid_in[i] + && valid_in[j+i] + && (sel_in[i] == sel_in[j+i]) + && (ready_in[i] | ready_in[j+i]); end end end - wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count; + wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count, collision_count_r; `POP_COUNT(collision_count, per_cycle_collision); + `BUFFER(collision_count_r, collision_count); always @(posedge clk) begin if (reset) begin collisions_r <= '0; end else begin - collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count); + collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count_r); end end diff --git a/hw/rtl/mem/VX_shared_mem.sv b/hw/rtl/mem/VX_shared_mem.sv index a44c68a8..1c25c7cf 100644 --- a/hw/rtl/mem/VX_shared_mem.sv +++ b/hw/rtl/mem/VX_shared_mem.sv @@ -245,14 +245,19 @@ module VX_shared_mem import VX_gpu_pkg::*; #( reg [`PERF_CTR_BITS-1:0] perf_writes; reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_reads_per_cycle_r; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle_r; + `BUFFER(perf_reads_per_cycle_r, perf_reads_per_cycle); + `BUFFER(perf_writes_per_cycle_r, perf_writes_per_cycle); + always @(posedge clk) begin if (reset) begin perf_reads <= '0; perf_writes <= '0; perf_crsp_stalls <= '0; end else begin - perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle); - perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle); + perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle_r); + perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle_r); perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end