profiling timing optimization

minor update

minor update

minor update
This commit is contained in:
Blaise Tine
2023-12-13 18:04:12 -08:00
parent f5f9e3dfdb
commit c6845a4c8d
11 changed files with 64 additions and 56 deletions

View File

@@ -266,8 +266,8 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_per_cycle_r;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_per_cycle_r;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [1:0] perf_icache_pending_read_cycle;
@@ -283,7 +283,9 @@ module VX_core import VX_gpu_pkg::*; #(
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
@@ -291,15 +293,15 @@ module VX_core import VX_gpu_pkg::*; #(
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
end
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
`BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
`BUFFER(perf_dcache_rd_req_per_cycle_r, perf_dcache_rd_req_per_cycle);
`BUFFER(perf_dcache_wr_req_per_cycle_r, perf_dcache_wr_req_per_cycle);
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle_r - perf_dcache_rsp_per_cycle;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
always @(posedge clk) begin
if (reset) begin
@@ -323,8 +325,8 @@ module VX_core import VX_gpu_pkg::*; #(
perf_dcache_lat <= '0;
end else begin
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle_r);
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle_r);
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
end

View File

@@ -156,13 +156,14 @@ module VX_issue #(
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
if (decode_if.valid && ~decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
end
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
end
end

View File

@@ -383,13 +383,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_sched_stalls <= '0;
perf_fetch_stalls <= '0;
end else begin
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid);
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready);
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(~schedule_valid);
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_stall);
end
end

View File

@@ -196,11 +196,14 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_wctl_stalls <= '0;
end else begin
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall);
end
end
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;

View File

@@ -49,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire [LANE_BITS-1:0] tid;
wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin