diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index e93e5c93..dab5836c 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -350,6 +350,8 @@ module VX_core import VX_gpu_pkg::*; #( assign scheduler_idles = pipeline_perf_if.sched_idles; int scheduler_stalls; assign scheduler_stalls = pipeline_perf_if.sched_stalls; + int scheduler_barrier_stalls; + assign scheduler_barrier_stalls = pipeline_perf_if.sched_barrier_stalls; int ibuf_stalls; assign ibuf_stalls = pipeline_perf_if.ibf_stalls; int scrb_alu_per_core; @@ -395,6 +397,10 @@ module VX_core import VX_gpu_pkg::*; #( $itor(scheduler_idles) / $itor(cycles) * 100.0); $display("scheduler stalls: %d cycles (%f%%)", pipeline_perf_if.sched_stalls, $itor(scheduler_stalls) / $itor(cycles) * 100.0); + $display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%f%%)", + pipeline_perf_if.sched_barrier_stalls, + `NUM_WARPS, + $itor(scheduler_barrier_stalls) / $itor(cycles) * 100.0); $display("ibuffer stalls: %d cycles (%f%%)",pipeline_perf_if.ibf_stalls, $itor(ibuf_stalls) / $itor(cycles) * 100.0); // see VX_scoreboard.sv diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 94ade0d1..c890a2f6 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -166,6 +166,11 @@ module VX_schedule import VX_gpu_pkg::*; #( // don't check req_id == rsp_id, otherwise it limits us to // 1 outstanding request. instead assume that any response coming // back contains a valid id + // + // NOTE(hansung): Because every response is broadcasted to all cores, + // this doesn't work when cores in the cluster use different sets of + // IDs. Need a way to keep track of in-use barriers for each core and + // validate responses accordingly. if (gbar_bus_if.rsp_valid) begin barrier_masks_n[gbar_bus_if.rsp_id] = '0; // instead of unlocking all warps, only unlock those that @@ -408,22 +413,28 @@ module VX_schedule import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_sched_idles; reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; + reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_stalls; wire schedule_idle = ~schedule_valid; wire schedule_stall = schedule_if.valid && ~schedule_if.ready; + wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_stall; + `POP_COUNT(schedule_barrier_stall, barrier_stalls); always @(posedge clk) begin if (reset) begin perf_sched_idles <= '0; perf_sched_stalls <= '0; + perf_sched_barrier_stalls <= '0; end else begin perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle); perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall); + perf_sched_barrier_stalls <= perf_sched_barrier_stalls + `PERF_CTR_BITS'(schedule_barrier_stall); end end assign perf_schedule_if.sched_idles = perf_sched_idles; assign perf_schedule_if.sched_stalls = perf_sched_stalls; + assign perf_schedule_if.sched_barrier_stalls = perf_sched_barrier_stalls; `endif endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 7d421875..661ebcdf 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -16,6 +16,7 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] sched_idles; wire [`PERF_CTR_BITS-1:0] sched_stalls; + wire [`PERF_CTR_BITS-1:0] sched_barrier_stalls; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS]; @@ -29,6 +30,7 @@ interface VX_pipeline_perf_if (); modport schedule ( output sched_idles, + output sched_barrier_stalls, output sched_stalls ); @@ -41,6 +43,7 @@ interface VX_pipeline_perf_if (); modport slave ( input sched_idles, + input sched_barrier_stalls, input sched_stalls, input ibf_stalls, input scb_stalls,