scoreboard optimization & profiling
This commit is contained in:
@@ -44,7 +44,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
VX_commit_if commit_if[`ISSUE_WIDTH]();
|
VX_commit_if commit_if[`ISSUE_WIDTH]();
|
||||||
|
|
||||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||||
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
|
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
|
||||||
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
|
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
|
||||||
wire [`ISSUE_WIDTH-1:0] commit_eop;
|
wire [`ISSUE_WIDTH-1:0] commit_eop;
|
||||||
@@ -91,24 +91,22 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
`UNUSED_PIN (sel_out)
|
`UNUSED_PIN (sel_out)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||||
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||||
assign commit_wid[i] = commit_if[i].data.wid;
|
assign commit_wid[i] = commit_if[i].data.wid;
|
||||||
assign commit_eop[i] = commit_if[i].data.eop;
|
assign commit_eop[i] = commit_if[i].data.eop;
|
||||||
end
|
end
|
||||||
|
|
||||||
// CSRs update
|
// CSRs update
|
||||||
|
|
||||||
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
|
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
|
||||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
|
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
|
||||||
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
|
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
|
||||||
|
|
||||||
assign commit_fire_any = (| commit_fire);
|
assign commit_fire_any = (| commit_fire);
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
wire [COMMIT_SIZEW-1:0] pop_count;
|
`POP_COUNT(commit_size[i], commit_tmask[i]);
|
||||||
`POP_COUNT(pop_count, commit_tmask[i]);
|
|
||||||
assign commit_size[i] = pop_count;
|
|
||||||
end
|
end
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
@@ -129,7 +127,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
.OP ("+")
|
.OP ("+")
|
||||||
) commit_size_reduce (
|
) commit_size_reduce (
|
||||||
.data_in (commit_size_r),
|
.data_in (commit_size_r),
|
||||||
.data_out (commit_size_all)
|
.data_out (commit_size_all_r)
|
||||||
);
|
);
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
@@ -139,26 +137,26 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (1'b1),
|
.enable (1'b1),
|
||||||
.data_in ({commit_fire_any_r, commit_size_all}),
|
.data_in ({commit_fire_any_r, commit_size_all_r}),
|
||||||
.data_out ({commit_fire_any_rr, commit_size_all_r})
|
.data_out ({commit_fire_any_rr, commit_size_all_rr})
|
||||||
);
|
);
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] instret;
|
reg [`PERF_CTR_BITS-1:0] instret;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
instret <= '0;
|
instret <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
if (commit_fire_any_rr) begin
|
if (commit_fire_any_rr) begin
|
||||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
|
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign commit_csr_if.instret = instret;
|
assign commit_csr_if.instret = instret;
|
||||||
|
|
||||||
// Committed instructions
|
// Committed instructions
|
||||||
|
|
||||||
|
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||||
.RESETW (`ISSUE_WIDTH)
|
.RESETW (`ISSUE_WIDTH)
|
||||||
@@ -166,23 +164,23 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (1'b1),
|
.enable (1'b1),
|
||||||
.data_in ({(commit_fire & commit_eop), commit_wid}),
|
.data_in ({committed, commit_wid}),
|
||||||
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
|
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
|
||||||
);
|
);
|
||||||
|
|
||||||
// Writeback
|
// Writeback
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||||
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
||||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||||
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
|
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
|
||||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||||
assign writeback_if[i].data.data = commit_if[i].data.data;
|
assign writeback_if[i].data.data = commit_if[i].data.data;
|
||||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||||
assign commit_if[i].ready = 1'b1;
|
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
|
||||||
end
|
end
|
||||||
|
|
||||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||||
|
|||||||
@@ -116,7 +116,11 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
.CORE_ID (CORE_ID)
|
.CORE_ID (CORE_ID)
|
||||||
) schedule (
|
) schedule (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (schedule_reset),
|
.reset (schedule_reset),
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
.perf_schedule_if (pipeline_perf_if.schedule),
|
||||||
|
`endif
|
||||||
|
|
||||||
.base_dcrs (base_dcrs),
|
.base_dcrs (base_dcrs),
|
||||||
|
|
||||||
|
|||||||
@@ -179,14 +179,18 @@ import VX_fpu_pkg::*;
|
|||||||
|
|
||||||
default: begin
|
default: begin
|
||||||
read_addr_valid_r = 0;
|
read_addr_valid_r = 0;
|
||||||
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|
||||||
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
|
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
|
||||||
read_addr_valid_r = 1;
|
read_addr_valid_r = 1;
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
case (base_dcrs.mpm_class)
|
case (base_dcrs.mpm_class)
|
||||||
`VX_DCR_MPM_CLASS_CORE: begin
|
`VX_DCR_MPM_CLASS_CORE: begin
|
||||||
case (read_addr)
|
case (read_addr)
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
|
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||||
|
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0];
|
||||||
|
`VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||||
@@ -204,6 +208,19 @@ import VX_fpu_pkg::*;
|
|||||||
`endif
|
`endif
|
||||||
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
|
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
|
||||||
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||||
|
// PERF: scoreboard per-unit usage counters. As with every other counter in this case, the base CSR returns bits [31:0] and the _H CSR returns the upper bits.
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||||
|
`ifdef EXT_F_ENABLE
|
||||||
|
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||||
|
`else
|
||||||
|
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||||
|
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||||
|
`endif
|
||||||
|
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||||
@@ -214,7 +231,7 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||||
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
@@ -225,6 +242,8 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||||
|
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: dcache
|
// PERF: dcache
|
||||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
|
|||||||
@@ -59,6 +59,10 @@ module VX_issue #(
|
|||||||
) scoreboard (
|
) scoreboard (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (scoreboard_reset),
|
.reset (scoreboard_reset),
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||||
|
.perf_scb_uses (perf_issue_if.scb_uses),
|
||||||
|
`endif
|
||||||
.writeback_if (writeback_if),
|
.writeback_if (writeback_if),
|
||||||
.ibuffer_if (ibuffer_if),
|
.ibuffer_if (ibuffer_if),
|
||||||
.scoreboard_if (scoreboard_if)
|
.scoreboard_if (scoreboard_if)
|
||||||
@@ -152,29 +156,17 @@ module VX_issue #(
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
|
|
||||||
|
|
||||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
|
||||||
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
|
||||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
|
||||||
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
|
||||||
end
|
|
||||||
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_ibf_stalls <= '0;
|
perf_ibf_stalls <= '0;
|
||||||
perf_scb_stalls <= '0;
|
|
||||||
end else begin
|
end else begin
|
||||||
if (decode_if.valid && ~decode_if.ready) begin
|
if (decode_if.valid && ~decode_if.ready) begin
|
||||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
|
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
|
||||||
end
|
end
|
||||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
VX_pipeline_perf_if.schedule perf_schedule_if,
|
||||||
|
`endif
|
||||||
|
|
||||||
// configuration
|
// configuration
|
||||||
input base_dcrs_t base_dcrs,
|
input base_dcrs_t base_dcrs,
|
||||||
|
|
||||||
@@ -376,4 +380,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
end
|
end
|
||||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_sched_stalls <= '0;
|
||||||
|
perf_fetch_stalls <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid);
|
||||||
|
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||||
|
assign perf_schedule_if.fetch_stalls = perf_fetch_stalls;
|
||||||
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -19,6 +19,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||||
|
output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS],
|
||||||
|
`endif
|
||||||
|
|
||||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||||
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
|
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
|
||||||
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
|
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
|
||||||
@@ -26,81 +31,102 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_alu_per_cycle;
|
||||||
|
`ifdef EXT_F_ENABLE
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_fpu_per_cycle;
|
||||||
|
`endif
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_lsu_per_cycle;
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_sfu_per_cycle;
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
||||||
|
// One row of per-issue-slot flags for each execution unit. Rows are indexed by unit id
// (`EX_ALU .. `EX_SFU), so the outer dimension is the unit count, not the bit-width of a unit id.
reg [`NUM_EX_UNITS-1:0][`ISSUE_WIDTH-1:0] scoreboard_uses;
|
||||||
|
wire [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
||||||
|
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
||||||
|
`POP_COUNT(scoreboard_alu_per_cycle, scoreboard_uses[`EX_ALU]);
|
||||||
|
`ifdef EXT_F_ENABLE
|
||||||
|
`POP_COUNT(scoreboard_fpu_per_cycle, scoreboard_uses[`EX_FPU]);
|
||||||
|
`endif
|
||||||
|
`POP_COUNT(scoreboard_lsu_per_cycle, scoreboard_uses[`EX_LSU]);
|
||||||
|
`POP_COUNT(scoreboard_sfu_per_cycle, scoreboard_uses[`EX_SFU]);
|
||||||
|
`endif
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
||||||
reg [3:0] ready_masks, ready_masks_n;
|
|
||||||
VX_ibuffer_if staging_if();
|
VX_ibuffer_if staging_if();
|
||||||
|
|
||||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||||
|
|
||||||
|
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
|
||||||
|
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
|
||||||
|
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
|
||||||
|
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
inuse_regs_n = inuse_regs;
|
scoreboard_uses = '0;
|
||||||
ready_masks_n = ready_masks;
|
if (ibuffer_if[i].valid) begin
|
||||||
if (writeback_fire) begin
|
if (inuse_rd) begin
|
||||||
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
|
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]][i] = 1;
|
||||||
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
|
end
|
||||||
& {(writeback_if[i].data.rd == staging_if.data.rd),
|
if (inuse_rs1) begin
|
||||||
(writeback_if[i].data.rd == staging_if.data.rs1),
|
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]][i] = 1;
|
||||||
(writeback_if[i].data.rd == staging_if.data.rs2),
|
end
|
||||||
(writeback_if[i].data.rd == staging_if.data.rs3)};
|
if (inuse_rs2) begin
|
||||||
end
|
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]][i] = 1;
|
||||||
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
|
end
|
||||||
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
|
if (inuse_rs3) begin
|
||||||
ready_masks_n = '0;
|
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]][i] = 1;
|
||||||
|
end
|
||||||
end
|
end
|
||||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
end
|
||||||
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
|
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
|
`endif
|
||||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
|
|
||||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
|
reg [DATAW-1:0] data_out_r;
|
||||||
end
|
reg valid_out_r;
|
||||||
end
|
|
||||||
|
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||||
|
wire deps_ready = (& ready_masks);
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
inuse_regs <= '0;
|
valid_out_r <= 0;
|
||||||
ready_masks <= '0;
|
inuse_regs <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
inuse_regs <= inuse_regs_n;
|
if (writeback_fire) begin
|
||||||
ready_masks <= ready_masks_n;
|
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
||||||
|
end
|
||||||
|
if (~valid_out_r) begin
|
||||||
|
valid_out_r <= ibuffer_if[i].valid && deps_ready;
|
||||||
|
end else if (staging_if.ready) begin
|
||||||
|
if (staging_if.data.wb) begin
|
||||||
|
inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1;
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type;
|
||||||
|
`endif
|
||||||
|
end
|
||||||
|
valid_out_r <= 0;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if (~valid_out_r) begin
|
||||||
|
data_out_r <= ibuffer_if[i].data;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
// staging buffer
|
assign ibuffer_if[i].ready = ~valid_out_r && deps_ready;
|
||||||
|
assign staging_if.valid = valid_out_r;
|
||||||
`RESET_RELAY (stg_buf_reset, reset);
|
assign staging_if.data = data_out_r;
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
|
||||||
.DATAW (DATAW)
|
|
||||||
) stg_buf (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (stg_buf_reset),
|
|
||||||
.valid_in (ibuffer_if[i].valid),
|
|
||||||
.ready_in (ibuffer_if[i].ready),
|
|
||||||
.data_in (ibuffer_if[i].data),
|
|
||||||
.data_out (staging_if.data),
|
|
||||||
.valid_out (staging_if.valid),
|
|
||||||
.ready_out (staging_if.ready)
|
|
||||||
);
|
|
||||||
|
|
||||||
// output buffer
|
|
||||||
|
|
||||||
wire valid_stg, ready_stg;
|
|
||||||
wire regs_ready = (& ready_masks);
|
|
||||||
assign valid_stg = staging_if.valid && regs_ready;
|
|
||||||
assign staging_if.ready = ready_stg && regs_ready;
|
|
||||||
|
|
||||||
`RESET_RELAY (out_buf_reset, reset);
|
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.SIZE (2),
|
.SIZE (0),
|
||||||
.OUT_REG (2)
|
.OUT_REG (2)
|
||||||
) out_buf (
|
) out_buf (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (out_buf_reset),
|
.reset (reset),
|
||||||
.valid_in (valid_stg),
|
.valid_in (staging_if.valid),
|
||||||
.ready_in (ready_stg),
|
.ready_in (staging_if.ready),
|
||||||
.data_in (staging_if.data),
|
.data_in (staging_if.data),
|
||||||
.data_out (scoreboard_if[i].data),
|
.data_out (scoreboard_if[i].data),
|
||||||
.valid_out (scoreboard_if[i].valid),
|
.valid_out (scoreboard_if[i].valid),
|
||||||
@@ -108,29 +134,29 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
|
|
||||||
`ifdef SIMULATION
|
`ifdef SIMULATION
|
||||||
reg [31:0] timeout_ctr;
|
reg [31:0] timeout_ctr;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
timeout_ctr <= '0;
|
timeout_ctr <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
if (staging_if.valid && ~regs_ready) begin
|
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
|
||||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||||
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||||
~ready_masks, staging_if.data.uuid));
|
~ready_masks, ibuffer_if[i].data.uuid));
|
||||||
`endif
|
`endif
|
||||||
timeout_ctr <= timeout_ctr + 1;
|
timeout_ctr <= timeout_ctr + 1;
|
||||||
end else if (staging_if.valid && staging_if.ready) begin
|
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||||
timeout_ctr <= '0;
|
timeout_ctr <= '0;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||||
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||||
~ready_masks, staging_if.data.uuid));
|
~ready_masks, ibuffer_if[i].data.uuid));
|
||||||
|
|
||||||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
||||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||||
@@ -139,4 +165,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_scb_stalls <= '0;
|
||||||
|
perf_scb_uses[`EX_ALU] <= '0;
|
||||||
|
`ifdef EXT_F_ENABLE
|
||||||
|
perf_scb_uses[`EX_FPU] <= '0;
|
||||||
|
`endif
|
||||||
|
perf_scb_uses[`EX_LSU] <= '0;
|
||||||
|
perf_scb_uses[`EX_SFU] <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
||||||
|
perf_scb_uses[`EX_ALU] <= perf_scb_uses[`EX_ALU] + `PERF_CTR_BITS'(scoreboard_alu_per_cycle);
|
||||||
|
`ifdef EXT_F_ENABLE
|
||||||
|
perf_scb_uses[`EX_FPU] <= perf_scb_uses[`EX_FPU] + `PERF_CTR_BITS'(scoreboard_fpu_per_cycle);
|
||||||
|
`endif
|
||||||
|
perf_scb_uses[`EX_LSU] <= perf_scb_uses[`EX_LSU] + `PERF_CTR_BITS'(scoreboard_lsu_per_cycle);
|
||||||
|
perf_scb_uses[`EX_SFU] <= perf_scb_uses[`EX_SFU] + `PERF_CTR_BITS'(scoreboard_sfu_per_cycle);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -14,8 +14,11 @@
|
|||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
interface VX_pipeline_perf_if ();
|
interface VX_pipeline_perf_if ();
|
||||||
|
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||||
|
wire [`PERF_CTR_BITS-1:0] fetch_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||||
|
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
||||||
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
|
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||||
@@ -24,15 +27,24 @@ interface VX_pipeline_perf_if ();
|
|||||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||||
|
|
||||||
|
modport schedule (
|
||||||
|
output sched_stalls,
|
||||||
|
output fetch_stalls
|
||||||
|
);
|
||||||
|
|
||||||
modport issue (
|
modport issue (
|
||||||
output ibf_stalls,
|
output ibf_stalls,
|
||||||
output scb_stalls,
|
output scb_stalls,
|
||||||
|
output scb_uses,
|
||||||
output dsp_stalls
|
output dsp_stalls
|
||||||
);
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
|
input sched_stalls,
|
||||||
|
input fetch_stalls,
|
||||||
input ibf_stalls,
|
input ibf_stalls,
|
||||||
input scb_stalls,
|
input scb_stalls,
|
||||||
|
input scb_uses,
|
||||||
input dsp_stalls,
|
input dsp_stalls,
|
||||||
input ifetches,
|
input ifetches,
|
||||||
input loads,
|
input loads,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ module VX_stream_xbar #(
|
|||||||
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
|
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
|
||||||
parameter ARBITER = "P",
|
parameter ARBITER = "P",
|
||||||
parameter LOCK_ENABLE = 0,
|
parameter LOCK_ENABLE = 0,
|
||||||
parameter OUT_REG = 0,
|
parameter OUT_REG = 0,
|
||||||
parameter MAX_FANOUT = `MAX_FANOUT,
|
parameter MAX_FANOUT = `MAX_FANOUT,
|
||||||
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
|
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
|
||||||
) (
|
) (
|
||||||
@@ -173,8 +173,8 @@ module VX_stream_xbar #(
|
|||||||
end
|
end
|
||||||
|
|
||||||
// compute inputs collision
|
// compute inputs collision
|
||||||
// we have a collision when there exists a valid transfer with mutiple input candicates
|
// we have a collision when there exists a valid transfer with multiple input candidates
|
||||||
// we caount the unique duplicates each cycle.
|
// we count the unique duplicates each cycle.
|
||||||
|
|
||||||
reg [PERF_CTR_BITS-1:0] collisions_r;
|
reg [PERF_CTR_BITS-1:0] collisions_r;
|
||||||
reg [NUM_INPUTS-1:0] per_cycle_collision;
|
reg [NUM_INPUTS-1:0] per_cycle_collision;
|
||||||
|
|||||||
Reference in New Issue
Block a user