Merge remote-tracking branch 'upstream/master' into vortex2
This commit is contained in:
@@ -45,7 +45,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_commit_if commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
|
||||
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
|
||||
wire [`ISSUE_WIDTH-1:0] commit_eop;
|
||||
@@ -92,24 +92,24 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||
assign commit_wid[i] = commit_if[i].data.wid;
|
||||
assign commit_eop[i] = commit_if[i].data.eop;
|
||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||
assign commit_wid[i] = commit_if[i].data.wid;
|
||||
assign commit_eop[i] = commit_if[i].data.eop;
|
||||
end
|
||||
|
||||
// CSRs update
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
|
||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
|
||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
|
||||
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
|
||||
|
||||
assign commit_fire_any = (| commit_fire);
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
wire [COMMIT_SIZEW-1:0] pop_count;
|
||||
`POP_COUNT(pop_count, commit_tmask[i]);
|
||||
assign commit_size[i] = pop_count;
|
||||
wire [COMMIT_SIZEW-1:0] count;
|
||||
`POP_COUNT(count, commit_tmask[i]);
|
||||
assign commit_size[i] = count;
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
@@ -130,7 +130,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
.OP ("+")
|
||||
) commit_size_reduce (
|
||||
.data_in (commit_size_r),
|
||||
.data_out (commit_size_all)
|
||||
.data_out (commit_size_all_r)
|
||||
);
|
||||
|
||||
VX_pipe_register #(
|
||||
@@ -140,26 +140,26 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({commit_fire_any_r, commit_size_all}),
|
||||
.data_out ({commit_fire_any_rr, commit_size_all_r})
|
||||
.data_in ({commit_fire_any_r, commit_size_all_r}),
|
||||
.data_out ({commit_fire_any_rr, commit_size_all_rr})
|
||||
);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] instret;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instret <= '0;
|
||||
end else begin
|
||||
if (commit_fire_any_rr) begin
|
||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
|
||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign commit_csr_if.instret = instret;
|
||||
|
||||
// Committed instructions
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||
.RESETW (`ISSUE_WIDTH)
|
||||
@@ -167,23 +167,23 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({(commit_fire & commit_eop), commit_wid}),
|
||||
.data_in ({committed, commit_wid}),
|
||||
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
|
||||
);
|
||||
|
||||
// Writeback
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
|
||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
|
||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||
assign writeback_if[i].data.data = commit_if[i].data.data;
|
||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||
assign commit_if[i].ready = 1'b1;
|
||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
|
||||
end
|
||||
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
|
||||
@@ -1,339 +1,341 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
`endif
|
||||
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
VX_mem_bus_if.master smem_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
VX_schedule_if schedule_if();
|
||||
VX_fetch_if fetch_if();
|
||||
VX_decode_if decode_if();
|
||||
VX_sched_csr_if sched_csr_if();
|
||||
VX_decode_sched_if decode_sched_if();
|
||||
VX_commit_sched_if commit_sched_if();
|
||||
VX_commit_csr_if commit_csr_if();
|
||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
||||
`endif
|
||||
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
`ifdef SM_ENABLE
|
||||
cache_perf_t smem_perf;
|
||||
assign mem_perf_tmp_if.smem = smem_perf;
|
||||
`else
|
||||
assign mem_perf_tmp_if.smem = '0;
|
||||
`endif
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (dcr_data_reset, reset);
|
||||
`RESET_RELAY (schedule_reset, reset);
|
||||
`RESET_RELAY (fetch_reset, reset);
|
||||
`RESET_RELAY (decode_reset, reset);
|
||||
`RESET_RELAY (issue_reset, reset);
|
||||
`RESET_RELAY (execute_reset, reset);
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
base_dcrs_t base_dcrs;
|
||||
|
||||
VX_dcr_data dcr_data (
|
||||
.clk (clk),
|
||||
.reset (dcr_data_reset),
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
.base_dcrs (base_dcrs)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (3)
|
||||
|
||||
VX_schedule #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) schedule (
|
||||
.clk (clk),
|
||||
.reset (schedule_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.decode_sched_if(decode_sched_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.schedule_if (schedule_if),
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (gbar_bus_if),
|
||||
`endif
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
VX_fetch #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) fetch (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (fetch_reset),
|
||||
.icache_bus_if (icache_bus_if),
|
||||
.schedule_if (schedule_if),
|
||||
.fetch_if (fetch_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (fetch_if),
|
||||
.decode_if (decode_if),
|
||||
.decode_sched_if(decode_sched_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (pipeline_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
|
||||
VX_execute #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) execute (
|
||||
`SCOPE_IO_BIND (2)
|
||||
|
||||
.clk (clk),
|
||||
.reset (execute_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
.dcache_bus_if (dcache_bus_tmp_if),
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
.sfu_dispatch_if(sfu_dispatch_if),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.sim_ebreak (sim_ebreak)
|
||||
);
|
||||
|
||||
VX_commit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) commit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.sim_wb_value (sim_wb_value)
|
||||
);
|
||||
|
||||
`ifdef SM_ENABLE
|
||||
|
||||
VX_smem_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) smem_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (smem_perf),
|
||||
`endif
|
||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||
.dcache_bus_out_if (dcache_bus_if),
|
||||
.smem_bus_out_if (smem_bus_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
|
||||
wire perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
|
||||
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
|
||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
|
||||
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire;
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
||||
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
||||
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
|
||||
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);
|
||||
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
|
||||
|
||||
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
|
||||
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_icache_pending_reads <= '0;
|
||||
perf_dcache_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
||||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ifetches <= '0;
|
||||
perf_loads <= '0;
|
||||
perf_stores <= '0;
|
||||
perf_icache_lat <= '0;
|
||||
perf_dcache_lat <= '0;
|
||||
end else begin
|
||||
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
||||
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
|
||||
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign pipeline_perf_if.ifetches = perf_ifetches;
|
||||
assign pipeline_perf_if.loads = perf_loads;
|
||||
assign pipeline_perf_if.stores = perf_stores;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
`endif
|
||||
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
VX_mem_bus_if.master smem_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
VX_schedule_if schedule_if();
|
||||
VX_fetch_if fetch_if();
|
||||
VX_decode_if decode_if();
|
||||
VX_sched_csr_if sched_csr_if();
|
||||
VX_decode_sched_if decode_sched_if();
|
||||
VX_commit_sched_if commit_sched_if();
|
||||
VX_commit_csr_if commit_csr_if();
|
||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
||||
`endif
|
||||
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`ifdef PERF_ENABLE
// Core-local perf interfaces: mem_perf_tmp_if is the copy handed to VX_execute,
// pipeline_perf_if carries the pipeline/memory counters collected in this module.
VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();

// Forward the cache/memory counters received on the mem_perf_if port.
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`ifndef SM_ENABLE
// The smem counters are driven by VX_smem_unit (.cache_perf) only when
// SM_ENABLE is defined; tie them off otherwise so the field is never undriven.
assign mem_perf_tmp_if.smem = '0;
`endif
`endif
|
||||
|
||||
`RESET_RELAY (dcr_data_reset, reset);
|
||||
`RESET_RELAY (schedule_reset, reset);
|
||||
`RESET_RELAY (fetch_reset, reset);
|
||||
`RESET_RELAY (decode_reset, reset);
|
||||
`RESET_RELAY (issue_reset, reset);
|
||||
`RESET_RELAY (execute_reset, reset);
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
base_dcrs_t base_dcrs;
|
||||
|
||||
VX_dcr_data dcr_data (
|
||||
.clk (clk),
|
||||
.reset (dcr_data_reset),
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
.base_dcrs (base_dcrs)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (3)
|
||||
|
||||
VX_schedule #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) schedule (
|
||||
.clk (clk),
|
||||
.reset (schedule_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_schedule_if (pipeline_perf_if.schedule),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.decode_sched_if(decode_sched_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.schedule_if (schedule_if),
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (gbar_bus_if),
|
||||
`endif
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
VX_fetch #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) fetch (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (fetch_reset),
|
||||
.icache_bus_if (icache_bus_if),
|
||||
.schedule_if (schedule_if),
|
||||
.fetch_if (fetch_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (fetch_if),
|
||||
.decode_if (decode_if),
|
||||
.decode_sched_if(decode_sched_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (pipeline_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
|
||||
VX_execute #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) execute (
|
||||
`SCOPE_IO_BIND (2)
|
||||
|
||||
.clk (clk),
|
||||
.reset (execute_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
.dcache_bus_if (dcache_bus_tmp_if),
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
.sfu_dispatch_if(sfu_dispatch_if),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.sim_ebreak (sim_ebreak)
|
||||
);
|
||||
|
||||
VX_commit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) commit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.sim_wb_value (sim_wb_value)
|
||||
);
|
||||
|
||||
`ifdef SM_ENABLE
|
||||
|
||||
VX_smem_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) smem_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.smem),
|
||||
`endif
|
||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||
.dcache_bus_out_if (dcache_bus_if),
|
||||
.smem_bus_out_if (smem_bus_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
|
||||
wire [1:0] perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
|
||||
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||
|
||||
// Decode, for each of the DCACHE_NUM_REQS dcache ports, which handshakes
// complete in the current cycle. One bit per port in each *_fire vector.
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
// read request accepted: valid && ready with req_data.rw clear
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
|
||||
// write request accepted: valid && ready with req_data.rw set
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
|
||||
// response accepted: rsp_valid && rsp_ready
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
|
||||
`BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
|
||||
|
||||
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
|
||||
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
|
||||
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
|
||||
|
||||
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
|
||||
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
|
||||
|
||||
// Running count of outstanding icache / dcache reads.
// Each cycle the per-cycle delta (requests fired minus responses fired,
// computed by the assigns just above) is added to the running total.
// NOTE(review): the dcache request count is taken from the BUFFER-ed fire
// vector while the response count is not — confirm the intended alignment.
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_icache_pending_reads <= '0;
|
||||
perf_dcache_pending_reads <= '0;
|
||||
end else begin
|
||||
// $signed + size cast widens the narrow delta with sign extension,
// so a negative delta decrements the running total
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
||||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||
|
||||
// Free-running performance counters, cleared on reset and updated every cycle.
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ifetches <= '0;
|
||||
perf_loads <= '0;
|
||||
perf_stores <= '0;
|
||||
perf_icache_lat <= '0;
|
||||
perf_dcache_lat <= '0;
|
||||
end else begin
|
||||
// +1 for every accepted icache request
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
|
||||
// add the number of dcache read / write requests counted this cycle
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
||||
// latency totals: accumulate the number of reads outstanding each cycle
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
|
||||
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
// Export the locally accumulated counters through pipeline_perf_if.
// Each field is driven by exactly one continuous assignment.
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -129,7 +129,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
||||
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
assign mem_perf_if.l3cache = '0;
|
||||
assign mem_perf_if.smem = '0;
|
||||
assign mem_perf_if.mem = '0;
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
|
||||
@@ -35,7 +35,6 @@ import VX_fpu_pkg::*;
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
VX_sfu_perf_if.slave sfu_perf_if,
|
||||
`endif
|
||||
|
||||
VX_commit_csr_if.slave commit_csr_if,
|
||||
@@ -183,105 +182,115 @@ import VX_fpu_pkg::*;
|
||||
|
||||
default: begin
|
||||
read_addr_valid_r = 0;
|
||||
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|
||||
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
|
||||
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|
||||
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
|
||||
read_addr_valid_r = 1;
|
||||
`ifdef PERF_ENABLE
|
||||
case (base_dcrs.mpm_class)
|
||||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
|
||||
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
|
||||
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
|
||||
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
|
||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||
`else
|
||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||
`endif
|
||||
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
|
||||
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
|
||||
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
|
||||
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
|
||||
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
|
||||
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`VX_DCR_MPM_CLASS_MEM: begin
|
||||
case (read_addr)
|
||||
// PERF: icache
|
||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l2cache
|
||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l3cache
|
||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
@@ -301,8 +310,6 @@ import VX_fpu_pkg::*;
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
|
||||
`UNUSED_VAR (perf_wctl_stalls);
|
||||
`UNUSED_VAR (mem_perf_if.icache);
|
||||
`UNUSED_VAR (mem_perf_if.smem);
|
||||
`endif
|
||||
|
||||
@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
VX_sfu_perf_if.slave sfu_perf_if,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sfu_perf_if (sfu_perf_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
@@ -174,30 +175,38 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r;
|
||||
wire [`ISSUE_WIDTH-1:0] operands_stall;
|
||||
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
|
||||
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
|
||||
|
||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready;
|
||||
assign operands_ex_type[i] = operands_if[i].data.ex_type;
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
perf_stalls_n = perf_stalls_r;
|
||||
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
if (operands_stall[i]) begin
|
||||
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
|
||||
always @(*) begin
|
||||
perf_issue_unit_stalls_per_cycle[i] = '0;
|
||||
if (operands_if[i].valid && ~operands_if[i].ready) begin
|
||||
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls_r <= '0;
|
||||
end else begin
|
||||
perf_stalls_r <= perf_stalls_n;
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) reduce (
|
||||
.data_in (perf_issue_unit_stalls_per_cycle),
|
||||
.data_out (perf_unit_stalls_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls_r[i] <= '0;
|
||||
end else begin
|
||||
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
batch_idx <= '0;
|
||||
end else if (batch_done) begin
|
||||
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
|
||||
end else begin
|
||||
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
@@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
||||
assign block_done[block_idx] = ~valid_p || ready_p;
|
||||
end
|
||||
|
||||
wire [ISSUE_IDX_W-1:0] wsi;
|
||||
wire [ISSUE_ISW_W-1:0] isw;
|
||||
if (BATCH_COUNT != 1) begin
|
||||
if (BLOCK_SIZE != 1) begin
|
||||
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)};
|
||||
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
|
||||
end else begin
|
||||
assign wsi = batch_idx;
|
||||
assign isw = batch_idx;
|
||||
end
|
||||
end else begin
|
||||
assign wsi = block_idx;
|
||||
assign isw = block_idx;
|
||||
end
|
||||
|
||||
`RESET_RELAY(buf_out_reset, reset);
|
||||
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (OUT_DATAW),
|
||||
|
||||
259
hw/rtl/core/VX_fpu_unit.sv
Normal file
259
hw/rtl/core/VX_fpu_unit.sv
Normal file
@@ -0,0 +1,259 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
module VX_fpu_unit import VX_fpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
|
||||
VX_fpu_to_csr_if.master fpu_to_csr_if[`NUM_FPU_BLOCKS],
|
||||
|
||||
VX_commit_if.master commit_if [`ISSUE_WIDTH]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
// Number of FPU blocks: sizes the execute_if / commit_block_if arrays and the per-block generate loop below.
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
|
||||
// Number of lanes handled by each FPU block per request.
localparam NUM_LANES = `NUM_FPU_LANES;
|
||||
// Bits needed to index the (`NUM_THREADS / NUM_LANES) lane-sized slices of a warp.
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
// Width of the pid field carried with each request/response.
// NOTE(review): `UP presumably clamps to a minimum of 1 so the field never has zero width - confirm in VX_platform.vh.
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
// Width of the tag that indexes the `FPUQ_SIZE-entry metadata store (tag_store).
localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
|
||||
// Set when the block count differs from the issue width or the lane count differs from the thread count;
// used below to choose the OUT_REG setting of the dispatch unit, the FPU core and the gather unit.
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) execute_if[BLOCK_SIZE]();
|
||||
|
||||
`RESET_RELAY (dispatch_reset, reset);
|
||||
|
||||
// Front end of the unit: takes the `ISSUE_WIDTH dispatch_if ports and drives the BLOCK_SIZE execute_if instances.
// NOTE(review): presumably this is where a warp instruction is split into NUM_LANES-wide packets (pid/sop/eop) - confirm in VX_dispatch_unit.
// OUT_REG is 1 only when PARTIAL_BW is set, 0 otherwise.
VX_dispatch_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 0)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
.dispatch_if(dispatch_if),
|
||||
.execute_if (execute_if)
|
||||
);
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_block_if[BLOCK_SIZE]();
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
`UNUSED_VAR (execute_if[block_idx].data.tid)
|
||||
`UNUSED_VAR (execute_if[block_idx].data.wb)
|
||||
`UNUSED_VAR (execute_if[block_idx].data.use_PC)
|
||||
`UNUSED_VAR (execute_if[block_idx].data.use_imm)
|
||||
|
||||
// Store request info
|
||||
wire fpu_req_valid, fpu_req_ready;
|
||||
wire fpu_rsp_valid, fpu_rsp_ready;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] fpu_rsp_result;
|
||||
fflags_t fpu_rsp_fflags;
|
||||
wire fpu_rsp_has_fflags;
|
||||
|
||||
wire [`UUID_WIDTH-1:0] fpu_rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] fpu_rsp_wid;
|
||||
wire [NUM_LANES-1:0] fpu_rsp_tmask;
|
||||
wire [`XLEN-1:0] fpu_rsp_PC;
|
||||
wire [`NR_BITS-1:0] fpu_rsp_rd;
|
||||
wire [PID_WIDTH-1:0] fpu_rsp_pid;
|
||||
wire fpu_rsp_sop;
|
||||
wire fpu_rsp_eop;
|
||||
|
||||
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
|
||||
wire mdata_full;
|
||||
|
||||
wire [`INST_FMT_BITS-1:0] fpu_fmt = execute_if[block_idx].data.imm[`INST_FMT_BITS-1:0];
|
||||
wire [`INST_FRM_BITS-1:0] fpu_frm = execute_if[block_idx].data.op_mod[`INST_FRM_BITS-1:0];
|
||||
|
||||
wire execute_fire = execute_if[block_idx].valid && execute_if[block_idx].ready;
|
||||
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
|
||||
|
||||
// Per-request metadata store. The FPU core only carries a TAG_WIDTH tag, so the
// uuid/wid/tmask/PC/rd/pid/sop/eop of each accepted request are saved here and
// looked up again with the tag returned alongside the response.
// - an entry is acquired when a request is accepted (execute_fire)
// - the entry is released when its response is consumed (fpu_rsp_fire)
// - mdata_full gates the request handshake further down
// NOTE(review): fpu_req_tag is not assigned anywhere else in this module, so write_addr is
// presumably an output of VX_index_buffer (the allocated slot) - confirm against its port list.
VX_index_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
|
||||
.SIZE (`FPUQ_SIZE)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.acquire_en (execute_fire),
|
||||
.write_addr (fpu_req_tag),
|
||||
// field order here must match the read_data concatenation below
.write_data ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.rd, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}),
|
||||
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
|
||||
.read_addr (fpu_rsp_tag),
|
||||
.release_en (fpu_rsp_fire),
|
||||
.full (mdata_full),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
|
||||
// resolve dynamic FRM from CSR
// The warp id of the current request is presented on the CSR read port; the CSR rounding mode
// (read_frm) replaces the instruction's own frm field only when the op is not `INST_FPU_MISC
// and the instruction selects `INST_FRM_DYN. In every other case the encoded fpu_frm is used.
|
||||
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
|
||||
// NOTE(review): the macro presumably converts the full warp id into the id space seen by this block - confirm in VX_define.vh
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].read_wid, execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
assign fpu_req_frm = (execute_if[block_idx].data.op_type != `INST_FPU_MISC
|
||||
&& fpu_frm == `INST_FRM_DYN) ? fpu_to_csr_if[block_idx].read_frm : fpu_frm;
|
||||
|
||||
// submit FPU request
// Both directions of the handshake are masked with ~mdata_full, so a request is neither offered
// to the FPU core nor accepted from execute_if while the metadata store has no free entry.
|
||||
|
||||
assign fpu_req_valid = execute_if[block_idx].valid && ~mdata_full;
|
||||
assign execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
|
||||
|
||||
`RESET_RELAY (fpu_reset, reset);
|
||||
|
||||
`ifdef FPU_DPI
|
||||
|
||||
// FPU core variant instantiated when FPU_DPI is defined.
// Request side: op/format/rounding mode, lane mask, three operand vectors and the tag_store tag.
// Response side: per-lane result, fflags (+ has_fflags qualifier) and the tag of the completed request.
// OUT_REG is 1 when PARTIAL_BW is set, 3 otherwise.
VX_fpu_dpi #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAG_WIDTH),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_dpi (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.op_type (execute_if[block_idx].data.op_type),
|
||||
.lane_mask (execute_if[block_idx].data.tmask),
|
||||
.fmt (fpu_fmt),
|
||||
.frm (fpu_req_frm),
|
||||
.dataa (execute_if[block_idx].data.rs1_data),
|
||||
.datab (execute_if[block_idx].data.rs2_data),
|
||||
.datac (execute_if[block_idx].data.rs3_data),
|
||||
.tag_in (fpu_req_tag),
|
||||
.ready_in (fpu_req_ready),
|
||||
|
||||
.valid_out (fpu_rsp_valid),
|
||||
.result (fpu_rsp_result),
|
||||
.has_fflags (fpu_rsp_has_fflags),
|
||||
.fflags (fpu_rsp_fflags),
|
||||
.tag_out (fpu_rsp_tag),
|
||||
.ready_out (fpu_rsp_ready)
|
||||
);
|
||||
|
||||
`elsif FPU_FPNEW
|
||||
|
||||
// FPU core variant instantiated when FPU_FPNEW is defined (and FPU_DPI is not).
// Uses the same parameters and port connections as the other two variants, so the
// surrounding request/response logic is independent of which core is selected.
// OUT_REG is 1 when PARTIAL_BW is set, 3 otherwise.
VX_fpu_fpnew #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAG_WIDTH),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_fpnew (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.op_type (execute_if[block_idx].data.op_type),
|
||||
.lane_mask (execute_if[block_idx].data.tmask),
|
||||
.fmt (fpu_fmt),
|
||||
.frm (fpu_req_frm),
|
||||
.dataa (execute_if[block_idx].data.rs1_data),
|
||||
.datab (execute_if[block_idx].data.rs2_data),
|
||||
.datac (execute_if[block_idx].data.rs3_data),
|
||||
.tag_in (fpu_req_tag),
|
||||
.ready_in (fpu_req_ready),
|
||||
|
||||
.valid_out (fpu_rsp_valid),
|
||||
.result (fpu_rsp_result),
|
||||
.has_fflags (fpu_rsp_has_fflags),
|
||||
.fflags (fpu_rsp_fflags),
|
||||
.tag_out (fpu_rsp_tag),
|
||||
.ready_out (fpu_rsp_ready)
|
||||
);
|
||||
|
||||
`elsif FPU_DSP
|
||||
|
||||
// FPU core variant instantiated when FPU_DSP is defined (and neither FPU_DPI nor FPU_FPNEW is).
// Connections are by name and match the other two variants; only the listing order of
// lane_mask / op_type differs, which has no effect.
// OUT_REG is 1 when PARTIAL_BW is set, 3 otherwise.
VX_fpu_dsp #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAG_WIDTH),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_dsp (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.lane_mask (execute_if[block_idx].data.tmask),
|
||||
.op_type (execute_if[block_idx].data.op_type),
|
||||
.fmt (fpu_fmt),
|
||||
.frm (fpu_req_frm),
|
||||
.dataa (execute_if[block_idx].data.rs1_data),
|
||||
.datab (execute_if[block_idx].data.rs2_data),
|
||||
.datac (execute_if[block_idx].data.rs3_data),
|
||||
.tag_in (fpu_req_tag),
|
||||
.ready_in (fpu_req_ready),
|
||||
|
||||
.valid_out (fpu_rsp_valid),
|
||||
.result (fpu_rsp_result),
|
||||
.has_fflags (fpu_rsp_has_fflags),
|
||||
.fflags (fpu_rsp_fflags),
|
||||
.tag_out (fpu_rsp_tag),
|
||||
.ready_out (fpu_rsp_ready)
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
// handle FPU response
|
||||
|
||||
// Exception flags forwarded to the CSR unit for the current response.
fflags_t fpu_rsp_fflags_q;
|
||||
|
||||
// PID_BITS != 0 means (`NUM_THREADS / NUM_LANES) > 1. In that case the flags of successive
// responses are OR-ed into a register, and the value sent to the CSR unit is that register
// OR-ed with the flags of the response currently on the output.
// NOTE(review): presumably this merges the flags of all packets of one warp instruction so that a
// single write at eop covers them - confirm against VX_dispatch_unit's packet splitting.
// NOTE(review): the register is not keyed by tag or warp; this looks like it relies on packets of
// different instructions not interleaving at the FPU core output - confirm.
if (PID_BITS != 0) begin
|
||||
fflags_t fpu_rsp_fflags_r;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
fpu_rsp_fflags_r <= '0;
|
||||
end else if (fpu_rsp_fire) begin
|
||||
// a consumed eop response clears the accumulator; any other consumed response adds its flags
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
|
||||
end
|
||||
end
|
||||
assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
|
||||
end else begin
|
||||
// PID_BITS == 0: no accumulation, the flags of the response are forwarded as-is
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
|
||||
end
|
||||
|
||||
// CSR flag update: issued only when an eop response is consumed and the core reports has_fflags
assign fpu_to_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
|
||||
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
assign fpu_to_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
|
||||
|
||||
// send response
// The FPU result is combined with the metadata read back from tag_store and handed to this
// block's commit_block_if. fpu_rsp_ready (used for fpu_rsp_fire above) is driven from here.
// NOTE(review): SIZE (0) presumably makes VX_elastic_buffer a combinational pass-through - confirm in VX_elastic_buffer.
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.SIZE (0)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (fpu_rsp_valid),
|
||||
.ready_in (fpu_rsp_ready),
|
||||
// field order here must match the data_out concatenation below
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
|
||||
.data_out ({commit_block_if[block_idx].data.uuid, commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.tmask, commit_block_if[block_idx].data.PC, commit_block_if[block_idx].data.rd, commit_block_if[block_idx].data.data, commit_block_if[block_idx].data.pid, commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop}),
|
||||
.valid_out (commit_block_if[block_idx].valid),
|
||||
.ready_out (commit_block_if[block_idx].ready)
|
||||
);
|
||||
// wb is tied high: every response from this unit is committed with the write-back flag set
assign commit_block_if[block_idx].data.wb = 1'b1;
|
||||
end
|
||||
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
// Back end of the unit: takes the BLOCK_SIZE per-block commit interfaces and drives the
// `ISSUE_WIDTH commit_if ports of the module.
// OUT_REG is 3 when PARTIAL_BW is set, 0 otherwise.
VX_gather_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (PARTIAL_BW ? 3 : 0)
|
||||
) gather_unit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
.commit_in_if (commit_block_if),
|
||||
.commit_out_if (commit_if)
|
||||
);
|
||||
|
||||
endmodule
|
||||
@@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
||||
wire [BLOCK_SIZE-1:0] commit_in_valid;
|
||||
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
|
||||
wire [BLOCK_SIZE-1:0] commit_in_ready;
|
||||
wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi;
|
||||
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
|
||||
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
assign commit_in_valid[i] = commit_in_if[i].valid;
|
||||
@@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
||||
assign commit_in_if[i].ready = commit_in_ready[i];
|
||||
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
|
||||
if (BLOCK_SIZE != 1) begin
|
||||
assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
|
||||
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
|
||||
end else begin
|
||||
assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W];
|
||||
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
|
||||
end
|
||||
end else begin
|
||||
assign commit_in_wsi[i] = BLOCK_SIZE_W'(i);
|
||||
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
|
||||
end
|
||||
end
|
||||
|
||||
@@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
||||
commit_out_data[i] = 'x;
|
||||
end
|
||||
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i];
|
||||
commit_out_data[commit_in_wsi[i]] = commit_in_data[i];
|
||||
commit_out_valid[commit_in_isw[i]] = commit_in_valid[i];
|
||||
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
|
||||
end
|
||||
end
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]];
|
||||
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
|
||||
@@ -14,10 +14,10 @@
|
||||
`include "VX_platform.vh"
|
||||
|
||||
module VX_ipdom_stack #(
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1,
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter ADDRW = `LOG2UP(DEPTH)
|
||||
parameter ADDRW = `LOG2UP(DEPTH)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
@@ -59,6 +59,11 @@ module VX_issue #(
|
||||
) scoreboard (
|
||||
.clk (clk),
|
||||
.reset (scoreboard_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||
.perf_units_uses(perf_issue_if.units_uses),
|
||||
.perf_sfu_uses (perf_issue_if.sfu_uses),
|
||||
`endif
|
||||
.writeback_if (writeback_if),
|
||||
.ibuffer_if (ibuffer_if),
|
||||
.scoreboard_if (scoreboard_if)
|
||||
@@ -80,7 +85,7 @@ module VX_issue #(
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (perf_issue_if.dsp_stalls),
|
||||
`UNUSED_PIN (perf_stalls),
|
||||
`endif
|
||||
.operands_if (operands_if),
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
@@ -152,29 +157,18 @@ module VX_issue #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
|
||||
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
||||
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||
end
|
||||
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
||||
|
||||
wire decode_stall = decode_if.valid && ~decode_if.ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ibf_stalls <= '0;
|
||||
perf_scb_stalls <= '0;
|
||||
end else begin
|
||||
if (decode_if.valid && ~decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
// detect duplicate addresses
|
||||
|
||||
wire lsu_is_dup;
|
||||
`ifdef LSU_DUP
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
if (NUM_LANES > 1) begin
|
||||
wire [NUM_LANES-2:0] addr_matches;
|
||||
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
|
||||
@@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
|
||||
assign mem_req_tag = {
|
||||
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
||||
`ifdef LSU_DUP
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
, lsu_is_dup
|
||||
`endif
|
||||
};
|
||||
@@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
wire [PID_WIDTH-1:0] rsp_pid;
|
||||
wire rsp_is_dup;
|
||||
|
||||
`ifndef LSU_DUP
|
||||
`ifndef LSU_DUP_ENABLE
|
||||
assign rsp_is_dup = 0;
|
||||
`endif
|
||||
|
||||
assign {
|
||||
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
||||
`ifdef LSU_DUP
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
, rsp_is_dup
|
||||
`endif
|
||||
} = mem_rsp_tag;
|
||||
@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (3)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
@@ -220,8 +220,13 @@ module VX_muldiv_unit #(
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef XLEN_64
|
||||
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
|
||||
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
|
||||
`else
|
||||
assign div_in1[i] = execute_if.data.rs1_data[i];
|
||||
assign div_in2[i] = execute_if.data.rs2_data[i];
|
||||
`endif
|
||||
end
|
||||
|
||||
`ifdef IDIV_DPI
|
||||
|
||||
@@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
|
||||
|
||||
localparam STATE_IDLE = 2'd0;
|
||||
localparam STATE_FETCH1 = 2'd1;
|
||||
@@ -38,14 +39,19 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
|
||||
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
|
||||
|
||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
|
||||
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
|
||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
|
||||
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
|
||||
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
|
||||
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
|
||||
|
||||
reg valid_out_r;
|
||||
reg [DATAW-1:0] data_out_r;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
|
||||
|
||||
reg [STATE_BITS-1:0] state, state_n;
|
||||
reg [`NR_BITS-1:0] rs2, rs2_n;
|
||||
@@ -54,11 +60,11 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
reg rs3_ready, rs3_ready_n;
|
||||
reg data_ready, data_ready_n;
|
||||
|
||||
wire ready_out = operands_if[i].ready;
|
||||
|
||||
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
|
||||
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
|
||||
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
|
||||
|
||||
VX_operands_if staging_if();
|
||||
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
|
||||
|
||||
always @(*) begin
|
||||
state_n = state;
|
||||
@@ -79,7 +85,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
|
||||
case (state)
|
||||
STATE_IDLE: begin
|
||||
if (staging_if.valid && staging_if.ready) begin
|
||||
if (valid_out_r && ready_out) begin
|
||||
data_ready_n = 0;
|
||||
end
|
||||
if (scoreboard_if[i].valid && data_ready_n == 0) begin
|
||||
@@ -160,44 +166,93 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
end
|
||||
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
|
||||
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
|
||||
if (writeback_if[i].data.sop) begin
|
||||
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
|
||||
end else begin
|
||||
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
|
||||
end
|
||||
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
|
||||
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
if (reset) begin
|
||||
state <= STATE_IDLE;
|
||||
gpr_rd_rid <= '0;
|
||||
gpr_rd_wis <= '0;
|
||||
cache_eop <= {ISSUE_RATIO{1'b1}};
|
||||
cache_reg <= '0;
|
||||
data_ready <= 0;
|
||||
valid_out_r <= 0;
|
||||
end else begin
|
||||
state <= state_n;
|
||||
rs2 <= rs2_n;
|
||||
rs3 <= rs3_n;
|
||||
rs2_ready <= rs2_ready_n;
|
||||
rs3_ready <= rs3_ready_n;
|
||||
rs1_data <= rs1_data_n;
|
||||
rs2_data <= rs2_data_n;
|
||||
rs3_data <= rs3_data_n;
|
||||
gpr_rd_rid <= gpr_rd_rid_n;
|
||||
gpr_rd_wis <= gpr_rd_wis_n;
|
||||
cache_data <= cache_data_n;
|
||||
cache_reg <= cache_reg_n;
|
||||
cache_tmask <= cache_tmask_n;
|
||||
cache_eop <= cache_eop_n;
|
||||
data_ready <= data_ready_n;
|
||||
data_ready <= data_ready_n;
|
||||
if (~valid_out_r) begin
|
||||
valid_out_r <= scoreboard_if[i].valid && data_ready;
|
||||
end else if (ready_out) begin
|
||||
valid_out_r <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if (~valid_out_r) begin
|
||||
data_out_r <= {scoreboard_if[i].data.uuid,
|
||||
scoreboard_if[i].data.wis,
|
||||
scoreboard_if[i].data.tmask,
|
||||
scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.wb,
|
||||
scoreboard_if[i].data.ex_type,
|
||||
scoreboard_if[i].data.op_type,
|
||||
scoreboard_if[i].data.op_mod,
|
||||
scoreboard_if[i].data.use_PC,
|
||||
scoreboard_if[i].data.use_imm,
|
||||
scoreboard_if[i].data.imm,
|
||||
scoreboard_if[i].data.rd};
|
||||
end
|
||||
|
||||
gpr_rd_rid <= gpr_rd_rid_n;
|
||||
gpr_rd_wis <= gpr_rd_wis_n;
|
||||
rs2_ready <= rs2_ready_n;
|
||||
rs3_ready <= rs3_ready_n;
|
||||
rs2 <= rs2_n;
|
||||
rs3 <= rs3_n;
|
||||
rs1_data <= rs1_data_n;
|
||||
rs2_data <= rs2_data_n;
|
||||
rs3_data <= rs3_data_n;
|
||||
cache_data <= cache_data_n;
|
||||
cache_reg <= cache_reg_n;
|
||||
cache_tmask <= cache_tmask_n;
|
||||
end
|
||||
|
||||
assign operands_if[i].valid = valid_out_r;
|
||||
assign {operands_if[i].data.uuid,
|
||||
operands_if[i].data.wis,
|
||||
operands_if[i].data.tmask,
|
||||
operands_if[i].data.PC,
|
||||
operands_if[i].data.wb,
|
||||
operands_if[i].data.ex_type,
|
||||
operands_if[i].data.op_type,
|
||||
operands_if[i].data.op_mod,
|
||||
operands_if[i].data.use_PC,
|
||||
operands_if[i].data.use_imm,
|
||||
operands_if[i].data.imm,
|
||||
operands_if[i].data.rd} = data_out_r;
|
||||
assign operands_if[i].data.rs1_data = rs1_data;
|
||||
assign operands_if[i].data.rs2_data = rs2_data;
|
||||
assign operands_if[i].data.rs3_data = rs3_data;
|
||||
|
||||
assign scoreboard_if[i].ready = ~valid_out_r && data_ready;
|
||||
|
||||
// GPR banks
|
||||
|
||||
reg [RAM_ADDRW-1:0] gpr_rd_addr;
|
||||
wire [RAM_ADDRW-1:0] gpr_wr_addr;
|
||||
if (ISSUE_WIS != 0) begin
|
||||
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
|
||||
always @(posedge clk) begin
|
||||
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
|
||||
end
|
||||
end else begin
|
||||
assign gpr_wr_addr = writeback_if[i].data.rd;
|
||||
always @(posedge clk) begin
|
||||
gpr_rd_addr <= gpr_rd_rid_n;
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef GPR_RESET
|
||||
reg wr_enabled = 0;
|
||||
always @(posedge clk) begin
|
||||
@@ -205,10 +260,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
wr_enabled <= 1;
|
||||
end
|
||||
end
|
||||
`else
|
||||
wire wr_enabled = 1;
|
||||
`endif
|
||||
|
||||
|
||||
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
|
||||
VX_dp_ram #(
|
||||
.DATAW (`XLEN),
|
||||
@@ -222,81 +275,17 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.read (1'b1),
|
||||
`UNUSED_PIN (wren),
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)),
|
||||
.raddr (gpr_rd_addr),
|
||||
.rdata (gpr_rd_data[j])
|
||||
);
|
||||
end
|
||||
|
||||
// staging buffer
|
||||
|
||||
`RESET_RELAY (stg_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW)
|
||||
) stg_buf (
|
||||
.clk (clk),
|
||||
.reset (stg_buf_reset),
|
||||
.valid_in (scoreboard_if[i].valid),
|
||||
.ready_in (scoreboard_if[i].ready),
|
||||
.data_in ({
|
||||
scoreboard_if[i].data.uuid,
|
||||
scoreboard_if[i].data.wis,
|
||||
scoreboard_if[i].data.tmask,
|
||||
scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.wb,
|
||||
scoreboard_if[i].data.ex_type,
|
||||
scoreboard_if[i].data.op_type,
|
||||
scoreboard_if[i].data.op_mod,
|
||||
scoreboard_if[i].data.use_PC,
|
||||
scoreboard_if[i].data.use_imm,
|
||||
scoreboard_if[i].data.imm,
|
||||
scoreboard_if[i].data.rd}),
|
||||
.data_out ({
|
||||
staging_if.data.uuid,
|
||||
staging_if.data.wis,
|
||||
staging_if.data.tmask,
|
||||
staging_if.data.PC,
|
||||
staging_if.data.wb,
|
||||
staging_if.data.ex_type,
|
||||
staging_if.data.op_type,
|
||||
staging_if.data.op_mod,
|
||||
staging_if.data.use_PC,
|
||||
staging_if.data.use_imm,
|
||||
staging_if.data.imm,
|
||||
staging_if.data.rd}),
|
||||
.valid_out (staging_if.valid),
|
||||
.ready_out (staging_if.ready)
|
||||
);
|
||||
|
||||
assign staging_if.data.rs1_data = rs1_data;
|
||||
assign staging_if.data.rs2_data = rs2_data;
|
||||
assign staging_if.data.rs3_data = rs3_data;
|
||||
|
||||
// output buffer
|
||||
|
||||
wire valid_stg, ready_stg;
|
||||
assign valid_stg = staging_if.valid && data_ready;
|
||||
assign staging_if.ready = ready_stg && data_ready;
|
||||
|
||||
`RESET_RELAY (out_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
|
||||
.SIZE (2),
|
||||
.OUT_REG (2)
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (out_buf_reset),
|
||||
.valid_in (valid_stg),
|
||||
.ready_in (ready_stg),
|
||||
.data_in (staging_if.data),
|
||||
.data_out (operands_if[i].data),
|
||||
.valid_out (operands_if[i].valid),
|
||||
.ready_out (operands_if[i].ready)
|
||||
);
|
||||
end
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_pipeline_perf_if.schedule perf_schedule_if,
|
||||
`endif
|
||||
|
||||
// configuration
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
||||
@@ -304,13 +308,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
|
||||
reg [`UUID_WIDTH-1:0] instr_uuid;
|
||||
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
|
||||
`ifdef SV_DPI
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
|
||||
end else if (schedule_fire) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
|
||||
end
|
||||
end
|
||||
end
|
||||
`else
|
||||
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
|
||||
always @(*) begin
|
||||
instr_uuid = `UUID_WIDTH'(w_uuid);
|
||||
end
|
||||
`endif
|
||||
`else
|
||||
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
|
||||
`endif
|
||||
@@ -349,7 +360,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
.empty (no_pending_instr)
|
||||
);
|
||||
|
||||
`BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1);
|
||||
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
|
||||
|
||||
// export CSRs
|
||||
assign sched_csr_if.cycles = cycles;
|
||||
@@ -376,4 +387,25 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
end
|
||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||
|
||||
wire schedule_idle = ~schedule_valid;
|
||||
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sched_idles <= '0;
|
||||
perf_sched_stalls <= '0;
|
||||
end else begin
|
||||
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -19,6 +19,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
|
||||
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
|
||||
@@ -26,114 +32,201 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
|
||||
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
|
||||
|
||||
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) perf_units_reduce (
|
||||
.data_in (perf_issue_units_per_cycle),
|
||||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) perf_sfu_reduce (
|
||||
.data_in (perf_issue_sfu_per_cycle),
|
||||
.data_out (perf_sfu_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
|
||||
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scb_stalls <= '0;
|
||||
end else begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_units_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sfu_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
reg [3:0] ready_masks, ready_masks_n;
|
||||
VX_ibuffer_if staging_if();
|
||||
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
||||
|
||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||
|
||||
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
|
||||
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
|
||||
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
|
||||
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
||||
reg [`SFU_WIDTH-1:0] sfu_type;
|
||||
always @(*) begin
|
||||
inuse_regs_n = inuse_regs;
|
||||
ready_masks_n = ready_masks;
|
||||
if (writeback_fire) begin
|
||||
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
|
||||
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
|
||||
& {(writeback_if[i].data.rd == staging_if.data.rd),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs1),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs2),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs3)};
|
||||
end
|
||||
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
|
||||
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
|
||||
ready_masks_n = '0;
|
||||
case (scoreboard_if[i].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
|
||||
default: sfu_type = `SFU_WCTL;
|
||||
endcase
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
perf_issue_units_per_cycle[i] = '0;
|
||||
perf_issue_sfu_per_cycle[i] = '0;
|
||||
if (ibuffer_if[i].valid) begin
|
||||
if (inuse_rd) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs1) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs2) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs3) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
|
||||
end
|
||||
end
|
||||
end
|
||||
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||
`endif
|
||||
|
||||
reg [DATAW-1:0] data_out_r;
|
||||
reg valid_out_r;
|
||||
wire ready_out;
|
||||
|
||||
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||
wire deps_ready = (& ready_masks);
|
||||
|
||||
wire valid_in = ibuffer_if[i].valid && deps_ready;
|
||||
wire ready_in = ~valid_out_r && deps_ready;
|
||||
wire [DATAW-1:0] data_in = ibuffer_if[i].data;
|
||||
|
||||
assign ready_out = scoreboard_if[i].ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
inuse_regs <= '0;
|
||||
ready_masks <= '0;
|
||||
end else begin
|
||||
inuse_regs <= inuse_regs_n;
|
||||
ready_masks <= ready_masks_n;
|
||||
valid_out_r <= 0;
|
||||
inuse_regs <= '0;
|
||||
end else begin
|
||||
if (writeback_fire) begin
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
||||
end
|
||||
if (~valid_out_r) begin
|
||||
valid_out_r <= valid_in;
|
||||
end else if (ready_out) begin
|
||||
if (scoreboard_if[i].data.wb) begin
|
||||
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
|
||||
`ifdef PERF_ENABLE
|
||||
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
|
||||
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
|
||||
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
valid_out_r <= 0;
|
||||
end
|
||||
end
|
||||
if (~valid_out_r) begin
|
||||
data_out_r <= data_in;
|
||||
end
|
||||
end
|
||||
|
||||
// staging buffer
|
||||
assign ibuffer_if[i].ready = ready_in;
|
||||
assign scoreboard_if[i].valid = valid_out_r;
|
||||
assign scoreboard_if[i].data = data_out_r;
|
||||
|
||||
`RESET_RELAY (stg_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW)
|
||||
) stg_buf (
|
||||
.clk (clk),
|
||||
.reset (stg_buf_reset),
|
||||
.valid_in (ibuffer_if[i].valid),
|
||||
.ready_in (ibuffer_if[i].ready),
|
||||
.data_in (ibuffer_if[i].data),
|
||||
.data_out (staging_if.data),
|
||||
.valid_out (staging_if.valid),
|
||||
.ready_out (staging_if.ready)
|
||||
);
|
||||
`ifdef SIMULATION
|
||||
reg [31:0] timeout_ctr;
|
||||
|
||||
// output buffer
|
||||
|
||||
wire valid_stg, ready_stg;
|
||||
wire regs_ready = (& ready_masks);
|
||||
assign valid_stg = staging_if.valid && regs_ready;
|
||||
assign staging_if.ready = ready_stg && regs_ready;
|
||||
|
||||
`RESET_RELAY (out_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
.OUT_REG (2)
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (out_buf_reset),
|
||||
.valid_in (valid_stg),
|
||||
.ready_in (ready_stg),
|
||||
.data_in (staging_if.data),
|
||||
.data_out (scoreboard_if[i].data),
|
||||
.valid_out (scoreboard_if[i].valid),
|
||||
.ready_out (scoreboard_if[i].ready)
|
||||
);
|
||||
|
||||
reg [31:0] timeout_ctr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
timeout_ctr <= '0;
|
||||
end else begin
|
||||
if (staging_if.valid && ~regs_ready) begin
|
||||
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
||||
~ready_masks, staging_if.data.uuid));
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||
~ready_masks, ibuffer_if[i].data.uuid));
|
||||
`endif
|
||||
timeout_ctr <= timeout_ctr + 1;
|
||||
end else if (staging_if.valid && staging_if.ready) begin
|
||||
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
timeout_ctr <= '0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
||||
~ready_masks, staging_if.data.uuid));
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||
~ready_masks, ibuffer_if[i].data.uuid));
|
||||
|
||||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
|
||||
end
|
||||
`endif
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_ARB_SIZE = 1 + 1;
|
||||
localparam RSP_ARB_IDX_WCTL = 0;
|
||||
localparam RSP_ARB_IDX_CSR = 1;
|
||||
localparam RSP_ARB_IDX_CSRS = 1;
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
||||
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_sfu_perf_if sfu_perf_if();
|
||||
`endif
|
||||
|
||||
// Warp control block
|
||||
VX_execute_if #(
|
||||
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sfu_perf_if (sfu_perf_if),
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -141,21 +137,21 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.commit_if (csr_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data;
|
||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR];
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
|
||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
|
||||
|
||||
// can accept new request?
|
||||
|
||||
reg sfu_req_ready;
|
||||
always @(*) begin
|
||||
case (execute_if[0].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||
default: sfu_req_ready = wctl_execute_if.ready;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
assign execute_if[0].ready = sfu_req_ready;
|
||||
|
||||
// response arbitration
|
||||
@@ -170,7 +166,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.NUM_INPUTS (RSP_ARB_SIZE),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (3)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
@@ -186,7 +182,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
VX_gather_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (3)
|
||||
.OUT_REG (1)
|
||||
) gather_unit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
@@ -194,16 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.commit_out_if (commit_if)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_wctl_stalls <= '0;
|
||||
end else begin
|
||||
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
|
||||
end
|
||||
end
|
||||
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -14,9 +14,7 @@
|
||||
`ifndef VX_TRACE_VH
|
||||
`define VX_TRACE_VH
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
|
||||
`include "VX_define.vh"
|
||||
`ifdef SIMULATION
|
||||
|
||||
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
||||
case (ex_type)
|
||||
|
||||
@@ -29,7 +29,6 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam LANE_BITS = `CLOG2(NUM_LANES);
|
||||
localparam LANE_WIDTH = `UP(LANE_BITS);
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||
@@ -50,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
|
||||
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
|
||||
|
||||
wire [LANE_WIDTH-1:0] tid;
|
||||
wire [`UP(LANE_BITS)-1:0] tid;
|
||||
if (LANE_BITS != 0) begin
|
||||
assign tid = execute_if.data.tid[0 +: LANE_BITS];
|
||||
end else begin
|
||||
|
||||
Reference in New Issue
Block a user