diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 37db6123..45426053 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -429,6 +429,9 @@ module VX_core import VX_gpu_pkg::*; #( // this will be a subset of scb_stalls $display("issue scoreboard: fires total:\t%d across ISSUE_WIDTH=%d", pipeline_perf_if.scb_fires, `ISSUE_WIDTH); + $display("issue scoreboard: cycles fired:\t%d (%.2f%%)", + pipeline_perf_if.scb_any_fire_cycles, + $itor(pipeline_perf_if.scb_any_fire_cycles) / $itor(cycles) * 100.0); $display("issue scoreboard: stalls total:\t%d across ISSUE_WIDTH=%d", pipeline_perf_if.scb_stalls, `ISSUE_WIDTH); $display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)", @@ -468,7 +471,7 @@ module VX_core import VX_gpu_pkg::*; #( pipeline_perf_if.dispatch_fires[`EX_LSU]); $display("issue dispatch: fires: sfu %d", pipeline_perf_if.dispatch_fires[`EX_SFU]); - $display("issue dispatch: cycles issued: %d (%.2f%%)", + $display("issue dispatch: cycles fired: %d (%.2f%%)", pipeline_perf_if.dispatch_any_fire_cycles, $itor(pipeline_perf_if.dispatch_any_fire_cycles) / $itor(cycles) * 100.0); $display("ifetches: %d", perf_ifetches); diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv index fa7c99de..0700d077 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -236,12 +236,12 @@ module VX_dispatch import VX_gpu_pkg::*; #( `BUFFER(perf_unit_valids_per_cycle_r, perf_unit_valids_per_cycle); `BUFFER(perf_unit_fires_per_cycle_r, perf_unit_fires_per_cycle); - reg perf_any_fire; + reg perf_any_fire_per_cycle; always @(*) begin - perf_any_fire = 1'b0; + perf_any_fire_per_cycle = 1'b0; for (integer i = 0; i < `NUM_EX_UNITS; ++i) begin if (perf_unit_fires_per_cycle_r[i] != '0) begin - perf_any_fire = 1'b1; + perf_any_fire_per_cycle = 1'b1; end end end @@ -257,7 +257,7 @@ module VX_dispatch import VX_gpu_pkg::*; #( perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]); perf_valids_r[i] <= perf_valids_r[i] + `PERF_CTR_BITS'(perf_unit_valids_per_cycle_r[i]); perf_fires_r[i] <= perf_fires_r[i] + `PERF_CTR_BITS'(perf_unit_fires_per_cycle_r[i]); - perf_any_fire_cycles_r <= perf_any_fire_cycles_r + `PERF_CTR_BITS'(perf_any_fire); + perf_any_fire_cycles_r <= perf_any_fire_cycles_r + `PERF_CTR_BITS'(perf_any_fire_per_cycle); end end end diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index ef7cf31b..4e79ce70 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -62,6 +62,7 @@ module VX_issue #( `ifdef PERF_ENABLE .perf_scb_stalls(perf_issue_if.scb_stalls), .perf_scb_fires (perf_issue_if.scb_fires), + .perf_scb_any_fire_cycles (perf_issue_if.scb_any_fire_cycles), .perf_units_uses(perf_issue_if.units_uses), .perf_sfu_uses (perf_issue_if.sfu_uses), `endif diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 9d66d200..fe038fb5 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -22,6 +22,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls, output reg [`PERF_CTR_BITS-1:0] perf_scb_fires, + output reg [`PERF_CTR_BITS-1:0] perf_scb_any_fire_cycles, output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS], output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS], `endif @@ -47,9 +48,13 @@ module VX_scoreboard import VX_gpu_pkg::*; #( wire [`ISSUE_WIDTH-1:0] perf_issue_fires_per_cycle; wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_fires_per_cycle, perf_fires_per_cycle_r; + wire perf_any_fire_per_cycle, perf_any_fire_per_cycle_r; + + reg [`PERF_CTR_BITS-1:0] perf_scb_empty; `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); - `POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle); + `POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle); + assign perf_any_fire_per_cycle = |perf_issue_fires_per_cycle; for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin always @(*) begin @@ -91,16 +96,19 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle); `BUFFER(perf_fires_per_cycle_r, perf_fires_per_cycle); + `BUFFER(perf_any_fire_per_cycle_r, perf_any_fire_per_cycle); `BUFFER(perf_units_per_cycle_r, perf_units_per_cycle); `BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle); always @(posedge clk) begin if (reset) begin perf_scb_stalls <= '0; - perf_scb_fires <= '0; + perf_scb_fires <= '0; + perf_scb_any_fire_cycles <= '0; end else begin perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r); perf_scb_fires <= perf_scb_fires + `PERF_CTR_BITS'(perf_fires_per_cycle_r); + perf_scb_any_fire_cycles <= perf_scb_any_fire_cycles + `PERF_CTR_BITS'(perf_any_fire_per_cycle_r); end end @@ -257,4 +265,19 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end +`ifdef PERF_ENABLE + wire [`ISSUE_WIDTH-1:0] ibuffer_valids; + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + assign ibuffer_valids[i] = ibuffer_if[i].valid; + end + + always @(posedge clk) begin + if (reset) begin + perf_scb_empty <= '0; + end else begin + perf_scb_empty <= perf_scb_empty + `PERF_CTR_BITS'(~|ibuffer_valids); + end + end +`endif + endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index afdca4fa..874778b8 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -20,6 +20,7 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] scb_fires; + wire [`PERF_CTR_BITS-1:0] scb_any_fire_cycles; wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS]; wire [`PERF_CTR_BITS-1:0] dispatch_stalls [`NUM_EX_UNITS]; @@ -43,6 +44,7 @@ interface VX_pipeline_perf_if (); output ibf_stalls, output scb_stalls, output scb_fires, + output scb_any_fire_cycles, output units_uses, output sfu_uses, output dispatch_stalls, @@ -58,6 +60,7 @@ interface VX_pipeline_perf_if (); input ibf_stalls, input scb_stalls, input scb_fires, + input scb_any_fire_cycles, input units_uses, input sfu_uses, input dispatch_stalls,