Implement WU architecture support
This commit is contained in:
@@ -15,7 +15,8 @@
|
||||
|
||||
module VX_execute import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter TENSOR_FP16 = 0
|
||||
parameter TENSOR_FP16 = 0,
|
||||
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
@@ -37,6 +38,11 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -46,7 +52,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
|
||||
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],
|
||||
VX_branch_ctl_if.master branch_ctl_if [2 * `NUM_ALU_BLOCKS],
|
||||
|
||||
VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],
|
||||
@@ -56,19 +62,33 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
VX_warp_ctl_if.master warp_ctl_if,
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_dispatch_if.slave tensor_alu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.slave tensor_lsu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.slave tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
|
||||
output wire tensor_csr_unlock_valid,
|
||||
output wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
|
||||
output wire tensor_tmc_valid,
|
||||
output wire [`NW_WIDTH-1:0] tensor_tmc_wid,
|
||||
output wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if.master tensor_regfile_if,
|
||||
VX_tc_bus_if.master tensor_smem_A_if,
|
||||
output logic tensor_tmem_C_wen,
|
||||
output logic tensor_tmem_C_ren,
|
||||
output logic [8:0] tensor_tmem_C_waddr,
|
||||
output logic [8:0] tensor_tmem_C_raddr,
|
||||
output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
|
||||
output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
|
||||
input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
|
||||
VX_tc_bus_if.master tensor_smem_B_if,
|
||||
VX_tc_rf_if.master tensor_regfile_if[NUM_TENSOR_CORES],
|
||||
VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES],
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
|
||||
VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES],
|
||||
`endif
|
||||
`endif
|
||||
|
||||
@@ -83,23 +103,196 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
// Warp-control merge: combines the messages arriving on scalar_warp_ctl_if
// and tensor_warp_ctl_if onto the single warp_ctl_if port of this module.
// tensor_warp_ctl_if is driven by tensor_ctrl_unit; scalar_warp_ctl_if is
// bound to another unit's .warp_ctl_if port when EXT_T_ENABLE is defined.
// No ready/back-pressure signal is referenced anywhere in this logic: one
// message is forwarded per cycle and a message that loses a same-cycle
// collision is held in a one-entry pending register (see the clocked block).
VX_warp_ctl_if scalar_warp_ctl_if();
|
||||
VX_warp_ctl_if tensor_warp_ctl_if();
|
||||
|
||||
// Width of one flattened message: wid followed by the tmc, wspawn, split,
// sjoin and barrier payloads, in that order (MSB first).
localparam WARP_CTL_DATAW = `NW_WIDTH + $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||
|
||||
// *_data_in  : live message from the producer, flattened.
// *_data_r   : pending (parked) message, meaningful only while *_valid_r is set.
// warp_ctl_rr: tie-break bit used when both sides have a candidate.
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_in;
|
||||
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_in;
|
||||
reg [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_r;
|
||||
reg [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_r;
|
||||
reg scalar_warp_ctl_valid_r;
|
||||
reg tensor_warp_ctl_valid_r;
|
||||
reg warp_ctl_rr;
|
||||
|
||||
// Field order here must match the unpacking concatenation on warp_ctl_if below.
assign scalar_warp_ctl_data_in = {scalar_warp_ctl_if.wid, scalar_warp_ctl_if.tmc, scalar_warp_ctl_if.wspawn, scalar_warp_ctl_if.split, scalar_warp_ctl_if.sjoin, scalar_warp_ctl_if.barrier};
|
||||
assign tensor_warp_ctl_data_in = {tensor_warp_ctl_if.wid, tensor_warp_ctl_if.tmc, tensor_warp_ctl_if.wspawn, tensor_warp_ctl_if.split, tensor_warp_ctl_if.sjoin, tensor_warp_ctl_if.barrier};
|
||||
|
||||
// A side is a candidate if it has a parked message or a live one this cycle.
wire scalar_warp_ctl_candidate_valid = scalar_warp_ctl_valid_r || scalar_warp_ctl_if.valid;
|
||||
wire tensor_warp_ctl_candidate_valid = tensor_warp_ctl_valid_r || tensor_warp_ctl_if.valid;
|
||||
// Tensor wins when it is the only candidate, or when both are candidates and
// warp_ctl_rr is set; otherwise the scalar side is selected.
wire select_tensor_warp_ctl = tensor_warp_ctl_candidate_valid && (!scalar_warp_ctl_candidate_valid || warp_ctl_rr);
|
||||
// Per side, the parked message is emitted before the live one.
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_out = scalar_warp_ctl_valid_r ? scalar_warp_ctl_data_r : scalar_warp_ctl_data_in;
|
||||
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_out = tensor_warp_ctl_valid_r ? tensor_warp_ctl_data_r : tensor_warp_ctl_data_in;
|
||||
wire [WARP_CTL_DATAW-1:0] selected_warp_ctl_data = select_tensor_warp_ctl ? tensor_warp_ctl_data_out : scalar_warp_ctl_data_out;
|
||||
|
||||
// What leaves this cycle: *_pending = the parked message of the selected side,
// *_input = the live message of the selected side (only when nothing is parked).
wire consume_scalar_warp_ctl_pending = !select_tensor_warp_ctl && scalar_warp_ctl_valid_r;
|
||||
wire consume_scalar_warp_ctl_input = !select_tensor_warp_ctl && !scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid;
|
||||
wire consume_tensor_warp_ctl_pending = select_tensor_warp_ctl && tensor_warp_ctl_valid_r;
|
||||
wire consume_tensor_warp_ctl_input = select_tensor_warp_ctl && !tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid;
|
||||
|
||||
// Merged output: valid whenever either side has a candidate; payload is the
// selected side's message unpacked in the same field order used for packing.
assign warp_ctl_if.valid = scalar_warp_ctl_candidate_valid || tensor_warp_ctl_candidate_valid;
|
||||
assign {warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.sjoin, warp_ctl_if.barrier} = selected_warp_ctl_data;
|
||||
|
||||
// Clocked state of the warp-control merge: the tie-break bit and the one-entry
// pending slot of each side. The *_data_r registers are not reset; they are
// only read while the matching *_valid_r bit is set.
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
scalar_warp_ctl_valid_r <= 1'b0;
|
||||
tensor_warp_ctl_valid_r <= 1'b0;
|
||||
warp_ctl_rr <= 1'b0;
|
||||
end else begin
|
||||
// On contention, flip the tie-break so the side that was NOT selected
// this cycle is selected the next time both sides are candidates.
if (scalar_warp_ctl_candidate_valid && tensor_warp_ctl_candidate_valid) begin
|
||||
warp_ctl_rr <= !select_tensor_warp_ctl;
|
||||
end
|
||||
|
||||
// Scalar pending slot.
if (scalar_warp_ctl_valid_r) begin
|
||||
// Slot occupied: when its message is forwarded, refill it with the
// live input if one is present this cycle, otherwise mark it empty.
// When it is not forwarded the slot holds its contents unchanged.
if (consume_scalar_warp_ctl_pending) begin
|
||||
scalar_warp_ctl_valid_r <= scalar_warp_ctl_if.valid;
|
||||
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
|
||||
end
|
||||
// Slot empty: park the live input if it was not forwarded this cycle.
end else if (scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_input) begin
|
||||
scalar_warp_ctl_valid_r <= 1'b1;
|
||||
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
|
||||
end
|
||||
|
||||
// Tensor pending slot: same policy as the scalar slot above.
if (tensor_warp_ctl_valid_r) begin
|
||||
if (consume_tensor_warp_ctl_pending) begin
|
||||
tensor_warp_ctl_valid_r <= tensor_warp_ctl_if.valid;
|
||||
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
|
||||
end
|
||||
end else if (tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_input) begin
|
||||
tensor_warp_ctl_valid_r <= 1'b1;
|
||||
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// Overflow checks: a live input arriving while the same side's slot is still
// occupied and not being drained has nowhere to go and would be dropped by the
// logic above. NOTE(review): RUNTIME_ASSERT is presumably simulation-only;
// confirm the producers can never hit this case in hardware.
`RUNTIME_ASSERT(
|
||||
!(scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_pending),
|
||||
("%t: *** core%0d-scalar-warp-ctl-merge-overflow", $time, CORE_ID)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!(tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_pending),
|
||||
("%t: *** core%0d-tensor-warp-ctl-merge-overflow", $time, CORE_ID)
|
||||
)
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (alu_reset, reset);
|
||||
`RESET_RELAY (lsu_reset, reset);
|
||||
`RESET_RELAY (sfu_reset, reset);
|
||||
|
||||
VX_commit_if alu_scalar_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_alu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) alu_unit (
|
||||
.clk (clk),
|
||||
.reset (alu_reset),
|
||||
.dispatch_if (alu_dispatch_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.commit_if (alu_commit_if)
|
||||
.branch_ctl_if (branch_ctl_if[0 +: `NUM_ALU_BLOCKS]),
|
||||
.commit_if (alu_scalar_commit_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_commit_if alu_tensor_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
`RESET_RELAY (tensor_alu_reset, reset);
|
||||
|
||||
VX_alu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tensor_alu_unit (
|
||||
.clk (clk),
|
||||
.reset (tensor_alu_reset),
|
||||
.dispatch_if (tensor_alu_dispatch_if),
|
||||
.branch_ctl_if (branch_ctl_if[`NUM_ALU_BLOCKS +: `NUM_ALU_BLOCKS]),
|
||||
.commit_if (alu_tensor_commit_if)
|
||||
);
|
||||
|
||||
localparam ALU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_domain_commit
|
||||
// Two-input stream arbiter for issue slot i: merges the commit streams of the
// scalar ALU (input 0, LSB of each concatenation) and the tensor-domain ALU
// (input 1) into alu_commit_if[i].
// NOTE(review): ARBITER "R" is presumably round-robin and OUT_REG(1) presumably
// registers the output; confirm against the VX_stream_arb definition.
// NOTE(review): ALU_COMMIT_DATAW is a hand-written field sum; it must equal the
// width of VX_commit_if.data. Confirm against the interface definition.
// The arbiter is reset by the raw `reset`, not by a relayed reset.
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (ALU_COMMIT_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
) alu_commit_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({alu_tensor_commit_if[i].valid, alu_scalar_commit_if[i].valid}),
|
||||
.ready_in ({alu_tensor_commit_if[i].ready, alu_scalar_commit_if[i].ready}),
|
||||
.data_in ({alu_tensor_commit_if[i].data, alu_scalar_commit_if[i].data}),
|
||||
.data_out (alu_commit_if[i].data),
|
||||
.valid_out (alu_commit_if[i].valid),
|
||||
.ready_out (alu_commit_if[i].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
|
||||
if (alu_scalar_commit_if[i].valid
|
||||
&& ((alu_scalar_commit_if[i].data.PC == 32'h80000010) || (alu_scalar_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-scalar-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_scalar_commit_if[i].valid, alu_scalar_commit_if[i].ready,
|
||||
alu_scalar_commit_if[i].data.wid, alu_scalar_commit_if[i].data.PC,
|
||||
alu_scalar_commit_if[i].data.wb, alu_scalar_commit_if[i].data.rd,
|
||||
alu_scalar_commit_if[i].data.sop, alu_scalar_commit_if[i].data.eop,
|
||||
alu_scalar_commit_if[i].data.uuid));
|
||||
end
|
||||
if (alu_tensor_commit_if[i].valid
|
||||
&& ((alu_tensor_commit_if[i].data.PC == 32'h80000010) || (alu_tensor_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-tensor-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_tensor_commit_if[i].valid, alu_tensor_commit_if[i].ready,
|
||||
alu_tensor_commit_if[i].data.wid, alu_tensor_commit_if[i].data.PC,
|
||||
alu_tensor_commit_if[i].data.wb, alu_tensor_commit_if[i].data.rd,
|
||||
alu_tensor_commit_if[i].data.sop, alu_tensor_commit_if[i].data.eop,
|
||||
alu_tensor_commit_if[i].data.uuid));
|
||||
end
|
||||
if (alu_commit_if[i].valid
|
||||
&& ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
|
||||
alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
|
||||
alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
|
||||
alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
|
||||
alu_commit_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
`else
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_commit_passthru
|
||||
assign alu_commit_if[i].valid = alu_scalar_commit_if[i].valid;
|
||||
assign alu_commit_if[i].data = alu_scalar_commit_if[i].data;
|
||||
assign alu_scalar_commit_if[i].ready = alu_commit_if[i].ready;
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
|
||||
if (alu_commit_if[i].valid
|
||||
&& ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
|
||||
alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
|
||||
alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
|
||||
alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
|
||||
alu_commit_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
`endif
|
||||
|
||||
`SCOPE_IO_SWITCH (1)
|
||||
|
||||
VX_commit_if lsu_scalar_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) scalar_lsu_bus_if[DCACHE_NUM_REQS]();
|
||||
|
||||
VX_lsu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) lsu_unit (
|
||||
@@ -107,11 +300,184 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (lsu_reset),
|
||||
.downstream_mem_busy (downstream_mem_busy),
|
||||
.cache_bus_if (dcache_bus_if),
|
||||
.cache_bus_if (scalar_lsu_bus_if),
|
||||
.dispatch_if (lsu_dispatch_if),
|
||||
.commit_if (lsu_commit_if)
|
||||
.commit_if (lsu_scalar_commit_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_commit_if lsu_tensor_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) tensor_lsu_bus_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`RESET_RELAY (tensor_lsu_reset, reset);
|
||||
|
||||
VX_lsu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tensor_lsu_unit (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (tensor_lsu_reset),
|
||||
.downstream_mem_busy (downstream_mem_busy),
|
||||
.cache_bus_if (tensor_lsu_bus_if),
|
||||
.dispatch_if (tensor_lsu_dispatch_if),
|
||||
.commit_if (lsu_tensor_commit_if)
|
||||
);
|
||||
|
||||
localparam LSU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_domain_commit
|
||||
// Two-input stream arbiter for issue slot i: merges the commit streams of
// lsu_unit (input 0, LSB of each concatenation) and tensor_lsu_unit (input 1)
// into lsu_commit_if[i].
// NOTE(review): ARBITER "R" is presumably round-robin and OUT_REG(1) presumably
// registers the output; confirm against the VX_stream_arb definition.
// NOTE(review): LSU_COMMIT_DATAW is a hand-written field sum; it must equal the
// width of VX_commit_if.data. Confirm against the interface definition.
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (LSU_COMMIT_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
) lsu_commit_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({lsu_tensor_commit_if[i].valid, lsu_scalar_commit_if[i].valid}),
|
||||
.ready_in ({lsu_tensor_commit_if[i].ready, lsu_scalar_commit_if[i].ready}),
|
||||
.data_in ({lsu_tensor_commit_if[i].data, lsu_scalar_commit_if[i].data}),
|
||||
.data_out (lsu_commit_if[i].data),
|
||||
.valid_out (lsu_commit_if[i].valid),
|
||||
.ready_out (lsu_commit_if[i].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
end
|
||||
|
||||
wire scalar_lsu_req_any;
|
||||
wire tensor_lsu_req_any;
|
||||
wire [DCACHE_NUM_REQS-1:0] scalar_lsu_req_valids;
|
||||
wire [DCACHE_NUM_REQS-1:0] tensor_lsu_req_valids;
|
||||
wire [DCACHE_NUM_REQS-1:0] lsu_req_fires;
|
||||
wire [DCACHE_NUM_REQS-1:0] lsu_rd_req_fires;
|
||||
wire [DCACHE_NUM_REQS-1:0] lsu_rsp_fires;
|
||||
reg lsu_domain_rr;
|
||||
reg lsu_active_domain;
|
||||
reg [15:0] lsu_pending_reads;
|
||||
logic lsu_select_tensor;
|
||||
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rd_req_fire_count;
|
||||
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rsp_fire_count;
|
||||
|
||||
// Per-lane mux/demux that shares dcache_bus_if[i] between the scalar and the
// tensor LSU. Requests are steered by the combinational lsu_select_tensor;
// responses are steered by the registered lsu_active_domain.
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_domain_mem
|
||||
// Collect per-lane request valids so the arbiter can test "any lane requesting".
assign scalar_lsu_req_valids[i] = scalar_lsu_bus_if[i].req_valid;
|
||||
assign tensor_lsu_req_valids[i] = tensor_lsu_bus_if[i].req_valid;
|
||||
|
||||
// Request path: forward the selected side; only that side sees req_ready.
assign dcache_bus_if[i].req_valid = lsu_select_tensor ? tensor_lsu_bus_if[i].req_valid : scalar_lsu_bus_if[i].req_valid;
|
||||
assign dcache_bus_if[i].req_data = lsu_select_tensor ? tensor_lsu_bus_if[i].req_data : scalar_lsu_bus_if[i].req_data;
|
||||
assign scalar_lsu_bus_if[i].req_ready = !lsu_select_tensor && dcache_bus_if[i].req_ready;
|
||||
assign tensor_lsu_bus_if[i].req_ready = lsu_select_tensor && dcache_bus_if[i].req_ready;
|
||||
|
||||
// Response path: rsp_data is broadcast to both sides, rsp_valid is gated so
// only the active domain sees it, and rsp_ready comes from the active domain.
assign scalar_lsu_bus_if[i].rsp_valid = !lsu_active_domain && dcache_bus_if[i].rsp_valid;
|
||||
assign scalar_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
|
||||
assign tensor_lsu_bus_if[i].rsp_valid = lsu_active_domain && dcache_bus_if[i].rsp_valid;
|
||||
assign tensor_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
|
||||
assign dcache_bus_if[i].rsp_ready = lsu_active_domain ? tensor_lsu_bus_if[i].rsp_ready : scalar_lsu_bus_if[i].rsp_ready;
|
||||
|
||||
// Handshake events on the shared port. lsu_rd_req_fires excludes requests
// whose req_data.rw bit is set (presumably rw=1 means write; confirm).
assign lsu_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready;
|
||||
assign lsu_rd_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && !dcache_bus_if[i].req_data.rw;
|
||||
assign lsu_rsp_fires[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
assign scalar_lsu_req_any = |scalar_lsu_req_valids;
|
||||
assign tensor_lsu_req_any = |tensor_lsu_req_valids;
|
||||
|
||||
// Combinational part of the LSU domain arbiter: picks which domain drives the
// shared dcache request port this cycle and counts this cycle's read-request
// and response handshakes across all lanes.
always @(*) begin
|
||||
// While reads are outstanding the port stays with the domain that issued
// them, because responses are routed purely by lsu_active_domain.
// NOTE(review): the other domain is never selected while
// lsu_pending_reads != 0, so a domain that keeps reads in flight
// back-to-back can hold the port; confirm this cannot starve the other side.
if (lsu_pending_reads != 0) begin
|
||||
lsu_select_tensor = lsu_active_domain;
|
||||
// Nothing outstanding and both domains request: use the tie-break bit.
end else if (scalar_lsu_req_any && tensor_lsu_req_any) begin
|
||||
lsu_select_tensor = lsu_domain_rr;
|
||||
// Otherwise select tensor only if it is requesting (scalar by default).
end else begin
|
||||
lsu_select_tensor = tensor_lsu_req_any;
|
||||
end
|
||||
|
||||
// Population count of read-request fires and response fires over the lanes.
lsu_rd_req_fire_count = '0;
|
||||
lsu_rsp_fire_count = '0;
|
||||
for (integer i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
lsu_rd_req_fire_count = lsu_rd_req_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rd_req_fires[i]);
|
||||
lsu_rsp_fire_count = lsu_rsp_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rsp_fires[i]);
|
||||
end
|
||||
end
|
||||
|
||||
// Clocked state of the LSU domain arbiter:
//   lsu_domain_rr     - tie-break bit used when both domains request,
//   lsu_active_domain - domain that receives dcache responses (1 = tensor),
//   lsu_pending_reads - read requests issued minus responses received.
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
lsu_domain_rr <= 1'b0;
|
||||
lsu_active_domain <= 1'b0;
|
||||
lsu_pending_reads <= '0;
|
||||
end else begin
|
||||
// A request accepted while no reads are outstanding: point the tie-break
// at the other domain, and if at least one accepted lane is a read,
// record the selected domain as the owner of the coming responses.
if (lsu_pending_reads == 0 && (|lsu_req_fires)) begin
|
||||
lsu_domain_rr <= ~lsu_select_tensor;
|
||||
if (lsu_rd_req_fire_count != 0) begin
|
||||
lsu_active_domain <= lsu_select_tensor;
|
||||
end
|
||||
end
|
||||
// NOTE(review): 16-bit counter with no saturation or underflow guard;
// this relies on one response per accepted read lane (confirm).
lsu_pending_reads <= lsu_pending_reads + 16'(lsu_rd_req_fire_count) - 16'(lsu_rsp_fire_count);
|
||||
end
|
||||
end
|
||||
|
||||
// Checks that no response handshake happens while the counter reads zero,
// i.e. a response that cannot be attributed to a tracked read.
`RUNTIME_ASSERT(
|
||||
!(lsu_pending_reads == 0 && (|lsu_rsp_fires)),
|
||||
("%t: *** core%0d-lsu-domain-arb-unmatched-response", $time, CORE_ID)
|
||||
)
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls_r;
|
||||
|
||||
// Performance events for the shared LSU/dcache port. Every event is a single
// bit per cycle, so the counters below count CYCLES in which the event holds,
// not individual lane requests.
//   *_req_fire_any : at least one lane handshake while that domain is selected.
//   *_merge_stall  : that domain has a request but the other domain is selected.
//   mem_merge_stall: both domains have a request in the same cycle, regardless
//                    of which one is selected.
wire scalar_lsu_req_fire_any = (|lsu_req_fires) && !lsu_select_tensor;
|
||||
wire tensor_lsu_req_fire_any = (|lsu_req_fires) && lsu_select_tensor;
|
||||
wire scalar_lsu_merge_stall = scalar_lsu_req_any && lsu_select_tensor;
|
||||
wire tensor_lsu_merge_stall = tensor_lsu_req_any && !lsu_select_tensor;
|
||||
wire mem_merge_stall = scalar_lsu_req_any && tensor_lsu_req_any;
|
||||
|
||||
// Free-running event counters, cleared on reset; each adds 0 or 1 per cycle.
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scalar_lsu_reqs_r <= '0;
|
||||
perf_tensor_lsu_reqs_r <= '0;
|
||||
perf_scalar_lsu_stalls_r <= '0;
|
||||
perf_tensor_lsu_stalls_r <= '0;
|
||||
perf_mem_merge_stalls_r <= '0;
|
||||
end else begin
|
||||
perf_scalar_lsu_reqs_r <= perf_scalar_lsu_reqs_r + `PERF_CTR_BITS'(scalar_lsu_req_fire_any);
|
||||
perf_tensor_lsu_reqs_r <= perf_tensor_lsu_reqs_r + `PERF_CTR_BITS'(tensor_lsu_req_fire_any);
|
||||
perf_scalar_lsu_stalls_r <= perf_scalar_lsu_stalls_r + `PERF_CTR_BITS'(scalar_lsu_merge_stall);
|
||||
perf_tensor_lsu_stalls_r <= perf_tensor_lsu_stalls_r + `PERF_CTR_BITS'(tensor_lsu_merge_stall);
|
||||
perf_mem_merge_stalls_r <= perf_mem_merge_stalls_r + `PERF_CTR_BITS'(mem_merge_stall);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_scalar_lsu_reqs = perf_scalar_lsu_reqs_r;
|
||||
assign perf_tensor_lsu_reqs = perf_tensor_lsu_reqs_r;
|
||||
assign perf_scalar_lsu_stalls = perf_scalar_lsu_stalls_r;
|
||||
assign perf_tensor_lsu_stalls = perf_tensor_lsu_stalls_r;
|
||||
assign perf_mem_merge_stalls = perf_mem_merge_stalls_r;
|
||||
`endif
|
||||
`else
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_commit_passthru
|
||||
assign lsu_commit_if[i].valid = lsu_scalar_commit_if[i].valid;
|
||||
assign lsu_commit_if[i].data = lsu_scalar_commit_if[i].data;
|
||||
assign lsu_scalar_commit_if[i].ready = lsu_commit_if[i].ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_mem_passthru
|
||||
`ASSIGN_VX_MEM_BUS_IF(dcache_bus_if[i], scalar_lsu_bus_if[i]);
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_scalar_lsu_reqs = '0;
|
||||
assign perf_tensor_lsu_reqs = '0;
|
||||
assign perf_scalar_lsu_stalls = '0;
|
||||
assign perf_tensor_lsu_stalls = '0;
|
||||
assign perf_mem_merge_stalls = '0;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`RESET_RELAY (fpu_reset, reset);
|
||||
|
||||
@@ -147,7 +513,11 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
`ifdef EXT_T_ENABLE
|
||||
.warp_ctl_if (scalar_warp_ctl_if),
|
||||
`else
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
`endif
|
||||
.commit_if (sfu_commit_if),
|
||||
|
||||
.acc_read_in (acc_read_in),
|
||||
@@ -156,8 +526,27 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
);
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_commit_if tensor_core_commit_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if tensor_ctrl_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_tensor_ctrl_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tensor_ctrl_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.dispatch_if (tensor_ctrl_dispatch_if),
|
||||
.commit_if (tensor_ctrl_commit_if),
|
||||
.warp_ctl_if (tensor_warp_ctl_if),
|
||||
.csr_unlock_valid (tensor_csr_unlock_valid),
|
||||
.csr_unlock_wid (tensor_csr_unlock_wid),
|
||||
.tmc_valid (tensor_tmc_valid),
|
||||
.tmc_wid (tensor_tmc_wid),
|
||||
.tmc_tmask (tensor_tmc_tmask)
|
||||
);
|
||||
|
||||
VX_tensor_core #(
|
||||
.FP16 (TENSOR_FP16)
|
||||
.FP16 (TENSOR_FP16),
|
||||
.NUM_TENSOR_CORES (NUM_TENSOR_CORES)
|
||||
) tensor_core (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
@@ -166,17 +555,44 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`ifdef EXT_T_ASYNC
|
||||
.regfile_if(tensor_regfile_if),
|
||||
.smem_A_if(tensor_smem_A_if),
|
||||
.tmem_C_wen(tensor_tmem_C_wen),
|
||||
.tmem_A_ren(tensor_tmem_A_ren),
|
||||
.tmem_A_rready(tensor_tmem_A_rready),
|
||||
.tmem_A_raddr(tensor_tmem_A_raddr),
|
||||
.tmem_A_rdata(tensor_tmem_A_rdata),
|
||||
.tmem_C_ren(tensor_tmem_C_ren),
|
||||
.tmem_C_waddr(tensor_tmem_C_waddr),
|
||||
.tmem_C_rready(tensor_tmem_C_rready),
|
||||
.tmem_C_raddr(tensor_tmem_C_raddr),
|
||||
.tmem_C_rdata(tensor_tmem_C_rdata),
|
||||
.tmem_C_wen(tensor_tmem_C_wen),
|
||||
.tmem_C_wready(tensor_tmem_C_wready),
|
||||
.tmem_C_waddr(tensor_tmem_C_waddr),
|
||||
.tmem_C_wdata(tensor_tmem_C_wdata),
|
||||
.tmem_C_mask(tensor_tmem_C_mask),
|
||||
.tmem_C_rdata(tensor_tmem_C_rdata),
|
||||
.smem_B_if(tensor_smem_B_if),
|
||||
`endif
|
||||
.commit_if(tensor_commit_if)
|
||||
.commit_if(tensor_core_commit_if)
|
||||
);
|
||||
|
||||
localparam TENSOR_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_commit_arb
|
||||
// Two-input stream arbiter for issue slot i: merges the commit streams of
// tensor_core (input 0, LSB of each concatenation) and tensor_ctrl_unit
// (input 1) into the module's tensor_commit_if[i] port.
// NOTE(review): ARBITER "R" is presumably round-robin and OUT_REG(1) presumably
// registers the output; confirm against the VX_stream_arb definition.
// NOTE(review): TENSOR_COMMIT_DATAW is a hand-written field sum; it must equal
// the width of VX_commit_if.data. Confirm against the interface definition.
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (TENSOR_COMMIT_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
) tensor_commit_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({tensor_ctrl_commit_if[i].valid, tensor_core_commit_if[i].valid}),
|
||||
.ready_in ({tensor_ctrl_commit_if[i].ready, tensor_core_commit_if[i].ready}),
|
||||
.data_in ({tensor_ctrl_commit_if[i].data, tensor_core_commit_if[i].data}),
|
||||
.data_out (tensor_commit_if[i].data),
|
||||
.valid_out (tensor_commit_if[i].valid),
|
||||
.ready_out (tensor_commit_if[i].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
end
|
||||
`endif
|
||||
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
|
||||
Reference in New Issue
Block a user