Add perf counter for barrier schedule stalls
This commit is contained in:
@@ -350,6 +350,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
assign scheduler_idles = pipeline_perf_if.sched_idles;
|
assign scheduler_idles = pipeline_perf_if.sched_idles;
|
||||||
int scheduler_stalls;
|
int scheduler_stalls;
|
||||||
assign scheduler_stalls = pipeline_perf_if.sched_stalls;
|
assign scheduler_stalls = pipeline_perf_if.sched_stalls;
|
||||||
|
int scheduler_barrier_stalls;
|
||||||
|
assign scheduler_barrier_stalls = pipeline_perf_if.sched_barrier_stalls;
|
||||||
int ibuf_stalls;
|
int ibuf_stalls;
|
||||||
assign ibuf_stalls = pipeline_perf_if.ibf_stalls;
|
assign ibuf_stalls = pipeline_perf_if.ibf_stalls;
|
||||||
int scrb_alu_per_core;
|
int scrb_alu_per_core;
|
||||||
@@ -395,6 +397,10 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
$itor(scheduler_idles) / $itor(cycles) * 100.0);
|
$itor(scheduler_idles) / $itor(cycles) * 100.0);
|
||||||
$display("scheduler stalls: %d cycles (%f%%)", pipeline_perf_if.sched_stalls,
|
$display("scheduler stalls: %d cycles (%f%%)", pipeline_perf_if.sched_stalls,
|
||||||
$itor(scheduler_stalls) / $itor(cycles) * 100.0);
|
$itor(scheduler_stalls) / $itor(cycles) * 100.0);
|
||||||
|
$display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%f%%)",
|
||||||
|
pipeline_perf_if.sched_barrier_stalls,
|
||||||
|
`NUM_WARPS,
|
||||||
|
$itor(scheduler_barrier_stalls) / $itor(cycles) * 100.0);
|
||||||
$display("ibuffer stalls: %d cycles (%f%%)",pipeline_perf_if.ibf_stalls,
|
$display("ibuffer stalls: %d cycles (%f%%)",pipeline_perf_if.ibf_stalls,
|
||||||
$itor(ibuf_stalls) / $itor(cycles) * 100.0);
|
$itor(ibuf_stalls) / $itor(cycles) * 100.0);
|
||||||
// see VX_scoreboard.sv
|
// see VX_scoreboard.sv
|
||||||
|
|||||||
@@ -166,6 +166,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
// don't check req_id == rsp_id, otherwise it limits us to
|
// don't check req_id == rsp_id, otherwise it limits us to
|
||||||
// 1 outstanding request. instead assume that any response coming
|
// 1 outstanding request. instead assume that any response coming
|
||||||
// back contains a valid id
|
// back contains a valid id
|
||||||
|
//
|
||||||
|
// NOTE(hansung): Because every response is broadcast to all cores,
|
||||||
|
// this doesn't work when cores in the cluster use different sets of
|
||||||
|
// IDs. Need a way to keep track of in-use barriers for each core and
|
||||||
|
// validate responses accordingly.
|
||||||
if (gbar_bus_if.rsp_valid) begin
|
if (gbar_bus_if.rsp_valid) begin
|
||||||
barrier_masks_n[gbar_bus_if.rsp_id] = '0;
|
barrier_masks_n[gbar_bus_if.rsp_id] = '0;
|
||||||
// instead of unlocking all warps, only unlock those that
|
// instead of unlocking all warps, only unlock those that
|
||||||
@@ -408,22 +413,28 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_stalls;
|
||||||
|
|
||||||
wire schedule_idle = ~schedule_valid;
|
wire schedule_idle = ~schedule_valid;
|
||||||
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||||
|
wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_stall;
|
||||||
|
`POP_COUNT(schedule_barrier_stall, barrier_stalls);
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_sched_idles <= '0;
|
perf_sched_idles <= '0;
|
||||||
perf_sched_stalls <= '0;
|
perf_sched_stalls <= '0;
|
||||||
|
perf_sched_barrier_stalls <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||||
|
perf_sched_barrier_stalls <= perf_sched_barrier_stalls + `PERF_CTR_BITS'(schedule_barrier_stall);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
||||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||||
|
assign perf_schedule_if.sched_barrier_stalls = perf_sched_barrier_stalls;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
interface VX_pipeline_perf_if ();
|
interface VX_pipeline_perf_if ();
|
||||||
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
||||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||||
|
wire [`PERF_CTR_BITS-1:0] sched_barrier_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
|
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
|
||||||
@@ -29,6 +30,7 @@ interface VX_pipeline_perf_if ();
|
|||||||
|
|
||||||
modport schedule (
|
modport schedule (
|
||||||
output sched_idles,
|
output sched_idles,
|
||||||
|
output sched_barrier_stalls,
|
||||||
output sched_stalls
|
output sched_stalls
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -41,6 +43,7 @@ interface VX_pipeline_perf_if ();
|
|||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
input sched_idles,
|
input sched_idles,
|
||||||
|
input sched_barrier_stalls,
|
||||||
input sched_stalls,
|
input sched_stalls,
|
||||||
input ibf_stalls,
|
input ibf_stalls,
|
||||||
input scb_stalls,
|
input scb_stalls,
|
||||||
|
|||||||
Reference in New Issue
Block a user