Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

172
hw/rtl/core/VX_alu_unit.sv Normal file
View File

@@ -0,0 +1,172 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// ALU execution unit.
//
// Connects the `ISSUE_WIDTH dispatch interfaces to `NUM_ALU_BLOCKS execution
// blocks through VX_dispatch_unit. Each block contains a VX_int_unit and, when
// EXT_M_ENABLE is defined, a VX_muldiv_unit; their results are merged by a
// VX_stream_arb and handed to VX_gather_unit, which drives the `ISSUE_WIDTH
// commit interfaces.
//
// Parameters:
//   CORE_ID - forwarded to the sub-units; otherwise unused in this module.
module VX_alu_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS]
);
`UNUSED_PARAM (CORE_ID)
// Number of parallel execution blocks and lanes per block.
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
localparam NUM_LANES = `NUM_ALU_LANES;
// NOTE(review): PID presumably indexes the NUM_LANES-wide slice of a warp's
// threads when NUM_LANES < `NUM_THREADS - confirm against VX_dispatch_unit.
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// Payload width given to the response arbiter. NOTE(review): this has to equal
// the packed width of VX_commit_if.data for NUM_LANES lanes; the interface
// definition is not visible here - confirm when changing either one.
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
// One arbiter input for the integer unit plus one more when `EXT_M_ENABLED is 1.
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
// Set when the block count differs from the issue width or the lane count
// differs from the thread count; it selects the OUT_REG settings used below.
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
// Per-block execute interfaces driven by the dispatch unit.
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 1 : 0)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
);
// Per-block commit interfaces consumed by the gather unit.
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE]();
// One integer unit (and optional mul/div unit) per execution block.
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
// High when the current instruction must go to the mul/div unit.
wire is_muldiv_op;
// Integer path: sees the same payload as the block, but valid is masked
// whenever the instruction is a mul/div operation.
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) int_execute_if();
assign int_execute_if.valid = execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = execute_if[block_idx].data;
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) int_commit_if();
`RESET_RELAY (int_reset, reset);
VX_int_unit #(
.CORE_ID (CORE_ID),
.BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES)
) int_unit (
.clk (clk),
.reset (int_reset),
.execute_if (int_execute_if),
.branch_ctl_if (branch_ctl_if[block_idx]),
.commit_if (int_commit_if)
);
`ifdef EXT_M_ENABLE
// Mul/div path: op_mod decides the routing; valid is the complement of the
// integer path's gating so exactly one sub-unit sees a valid request.
assign is_muldiv_op = `INST_ALU_IS_M(execute_if[block_idx].data.op_mod);
`RESET_RELAY (mdv_reset, reset);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) mdv_execute_if();
assign mdv_execute_if.valid = execute_if[block_idx].valid && is_muldiv_op;
assign mdv_execute_if.data = execute_if[block_idx].data;
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) mdv_commit_if();
VX_muldiv_unit #(
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) mdv_unit (
.clk (clk),
.reset (mdv_reset),
.execute_if (mdv_execute_if),
.commit_if (mdv_commit_if)
);
// Back-pressure comes from whichever sub-unit the instruction targets.
assign execute_if[block_idx].ready = is_muldiv_op ? mdv_execute_if.ready : int_execute_if.ready;
`else
// Without the M extension every instruction goes to the integer unit.
assign is_muldiv_op = 0;
assign execute_if[block_idx].ready = int_execute_if.ready;
`endif
// send response
// Merges the sub-unit commit streams into this block's commit interface.
// In the concatenations the integer unit occupies the least-significant slot.
// Unlike the other sub-modules, this arbiter takes the raw `reset` rather than
// a relayed copy.
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_REG (PARTIAL_BW ? 1 : 3)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.valid,
`endif
int_commit_if.valid
}),
.ready_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.ready,
`endif
int_commit_if.ready
}),
.data_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.data,
`endif
int_commit_if.data
}),
.data_out (commit_block_if[block_idx].data),
.valid_out (commit_block_if[block_idx].valid),
.ready_out (commit_block_if[block_idx].ready),
`UNUSED_PIN (sel_out)
);
end
`RESET_RELAY (commit_reset, reset);
// Maps the BLOCK_SIZE per-block commit streams onto the ISSUE_WIDTH commit
// outputs of this module.
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 3 : 0)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.commit_in_if (commit_block_if),
.commit_out_if (commit_if)
);
endmodule

226
hw/rtl/core/VX_commit.sv Normal file
View File

@@ -0,0 +1,226 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Commit stage.
//
// For every issue slot, arbitrates between the commit streams of the execution
// units (LSU, ALU, optional FPU, SFU), then:
//   - forwards results that request a register write to writeback_if,
//   - reports end-of-packet commits and their warp id to commit_sched_if,
//   - accumulates the retired count exposed on commit_csr_if.instret.
//
// Parameters:
//   CORE_ID - only used in the DBG_TRACE_CORE_PIPELINE trace messages.
module VX_commit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave alu_commit_if [`ISSUE_WIDTH],
VX_commit_if.slave lsu_commit_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if [`ISSUE_WIDTH],
`endif
VX_commit_if.slave sfu_commit_if [`ISSUE_WIDTH],
// outputs
VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
VX_commit_csr_if.master commit_csr_if,
VX_commit_sched_if.master commit_sched_if,
// simulation helper signals
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
);
`UNUSED_PARAM (CORE_ID)
// Payload width given to the commit arbiter. NOTE(review): expected to equal
// the packed width of VX_commit_if.data at `NUM_THREADS lanes; the interface
// definition is not visible here - confirm when changing either one.
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
// Bits needed to hold a per-slot count in the range 0..`NUM_THREADS.
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
// Width of the sum of the `ISSUE_WIDTH per-slot counts.
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
// commit arbitration
VX_commit_if commit_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
`RESET_RELAY (arb_reset, reset);
// One arbiter per issue slot. In the concatenations the LSU occupies the
// least-significant slot, followed by ALU, optional FPU, and SFU.
// NOTE(review): ARBITER "R" presumably selects round-robin - confirm in
// VX_stream_arb.
VX_stream_arb #(
.NUM_INPUTS (`NUM_EX_UNITS),
.DATAW (DATAW),
.ARBITER ("R"),
.OUT_REG (1)
) commit_arb (
.clk (clk),
.reset (arb_reset),
.valid_in ({
sfu_commit_if[i].valid,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].valid,
`endif
alu_commit_if[i].valid,
lsu_commit_if[i].valid
}),
.ready_in ({
sfu_commit_if[i].ready,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].ready,
`endif
alu_commit_if[i].ready,
lsu_commit_if[i].ready
}),
.data_in ({
sfu_commit_if[i].data,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].data,
`endif
alu_commit_if[i].data,
lsu_commit_if[i].data
}),
.data_out (commit_if[i].data),
.valid_out (commit_if[i].valid),
.ready_out (commit_if[i].ready),
`UNUSED_PIN (sel_out)
);
// Per-slot handshake and the fields used by the bookkeeping below. The thread
// mask is zeroed when the slot does not fire so it contributes nothing to the
// retired count.
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop;
end
// CSRs update
// The retired count is computed over two register stages: per-slot popcounts
// are registered, summed across slots, registered again, then accumulated.
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// Number of active threads committed by slot i this cycle.
wire [COMMIT_SIZEW-1:0] pop_count;
`POP_COUNT(pop_count, commit_tmask[i]);
assign commit_size[i] = pop_count;
end
// Stage 1: register the per-slot counts; RESETW(1) covers the fire flag, which
// is the most-significant bit of the concatenation.
VX_pipe_register #(
.DATAW (1 + `ISSUE_WIDTH * COMMIT_SIZEW),
.RESETW (1)
) commit_size_reg1 (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({commit_fire_any, commit_size}),
.data_out ({commit_fire_any_r, commit_size_r})
);
// Sum the registered per-slot counts.
VX_reduce #(
.DATAW_IN (COMMIT_SIZEW),
.DATAW_OUT (COMMIT_ALL_SIZEW),
.N (`ISSUE_WIDTH),
.OP ("+")
) commit_size_reduce (
.data_in (commit_size_r),
.data_out (commit_size_all)
);
// Stage 2: register the total together with the delayed fire flag.
VX_pipe_register #(
.DATAW (1 + COMMIT_ALL_SIZEW),
.RESETW (1)
) commit_size_reg2 (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({commit_fire_any_r, commit_size_all}),
.data_out ({commit_fire_any_rr, commit_size_all_r})
);
// Retired counter. It advances by the number of active threads in the
// committed packets (the popcount of their thread masks), not by one per
// instruction.
reg [`PERF_CTR_BITS-1:0] instret;
always @(posedge clk) begin
if (reset) begin
instret <= '0;
end else begin
if (commit_fire_any_rr) begin
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
end
end
end
assign commit_csr_if.instret = instret;
// Committed instructions
// One-cycle-delayed notification to the scheduler: a slot is flagged as
// committed when it fires with eop set; RESETW covers the committed flags.
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH)
) committed_pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({(commit_fire & commit_eop), commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
);
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// Only commits with the wb flag set produce a writeback request.
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC;
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd;
assign writeback_if[i].data.data = commit_if[i].data.data;
assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop;
// The arbiter output is always accepted; no writeback ready signal is
// consulted here.
assign commit_if[i].ready = 1'b1;
end
// simulation helper signal to get RISC-V tests Pass/Fail status
// Shadow copy of written register values. Only issue slot 0 and element 0 of
// its data are captured, and the array has no reset.
reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;
always @(posedge clk) begin
if (writeback_if[0].valid) begin
sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
end
end
assign sim_wb_value = sim_wb_value_r;
`ifdef DBG_TRACE_CORE_PIPELINE
// Debug trace: logs every accepted commit at the arbiter inputs, one message
// per execution unit and issue slot.
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (alu_commit_if[i].valid && alu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.tmask, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, alu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", alu_commit_if[i].data.uuid));
end
if (lsu_commit_if[i].valid && lsu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, lsu_commit_if[i].data.wid, lsu_commit_if[i].data.PC, lsu_commit_if[i].data.tmask, lsu_commit_if[i].data.wb, lsu_commit_if[i].data.rd, lsu_commit_if[i].data.sop, lsu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, lsu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", lsu_commit_if[i].data.uuid));
end
`ifdef EXT_F_ENABLE
if (fpu_commit_if[i].valid && fpu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, fpu_commit_if[i].data.wid, fpu_commit_if[i].data.PC, fpu_commit_if[i].data.tmask, fpu_commit_if[i].data.wb, fpu_commit_if[i].data.rd, fpu_commit_if[i].data.sop, fpu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, fpu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", fpu_commit_if[i].data.uuid));
end
`endif
if (sfu_commit_if[i].valid && sfu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=SFU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, sfu_commit_if[i].data.wid, sfu_commit_if[i].data.PC, sfu_commit_if[i].data.tmask, sfu_commit_if[i].data.wb, sfu_commit_if[i].data.rd, sfu_commit_if[i].data.sop, sfu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, sfu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", sfu_commit_if[i].data.uuid));
end
end
end
`endif
endmodule

336
hw/rtl/core/VX_core.sv Normal file
View File

@@ -0,0 +1,336 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh"
`endif
// Vortex core.
//
// Instantiates and wires together the pipeline stages (schedule, fetch,
// decode, issue, execute, commit), the DCR register block, and - when
// SM_ENABLE is defined - the shared-memory unit that sits between the execute
// stage's data-cache ports and the external data-cache bus. When PERF_ENABLE
// is defined it also maintains instruction-fetch / load / store counters and
// latency accumulators that are exported through pipeline_perf_if.
//
// Parameters:
//   CORE_ID - forwarded to every pipeline stage.
module VX_core import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    // Clock
    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    VX_mem_perf_if.slave    mem_perf_if,
`endif

    VX_dcr_bus_if.slave     dcr_bus_if,

    VX_mem_bus_if.master    dcache_bus_if [DCACHE_NUM_REQS],

    VX_mem_bus_if.master    icache_bus_if,

`ifdef GBAR_ENABLE
    VX_gbar_bus_if.master   gbar_bus_if,
`endif

    // simulation helper signals
    output wire             sim_ebreak,
    output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,

    // Status
    output wire             busy
);
    // Inter-stage interfaces.
    VX_schedule_if      schedule_if();
    VX_fetch_if         fetch_if();
    VX_decode_if        decode_if();
    VX_sched_csr_if     sched_csr_if();
    VX_decode_sched_if  decode_sched_if();
    VX_commit_sched_if  commit_sched_if();
    VX_commit_csr_if    commit_csr_if();
    VX_branch_ctl_if    branch_ctl_if[`NUM_ALU_BLOCKS]();
    VX_warp_ctl_if      warp_ctl_if();

    // Per-execution-unit dispatch (issue -> execute) and commit
    // (execute -> commit) interfaces, one per issue slot.
    VX_dispatch_if      alu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        alu_commit_if[`ISSUE_WIDTH]();

    VX_dispatch_if      lsu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        lsu_commit_if[`ISSUE_WIDTH]();

`ifdef EXT_F_ENABLE
    VX_dispatch_if      fpu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        fpu_commit_if[`ISSUE_WIDTH]();
`endif

    VX_dispatch_if      sfu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        sfu_commit_if[`ISSUE_WIDTH]();

    VX_writeback_if     writeback_if[`ISSUE_WIDTH]();

    // Data-cache bus as seen by the execute stage, before the optional
    // shared-memory unit.
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_TAG_WIDTH)
    ) dcache_bus_tmp_if[DCACHE_NUM_REQS]();

`ifdef PERF_ENABLE
    VX_pipeline_perf_if pipeline_perf_if();

    // Local copy of the incoming memory perf counters in which the smem field
    // is replaced by this core's own shared-memory counters (or zero).
    VX_mem_perf_if mem_perf_tmp_if();
    assign mem_perf_tmp_if.icache  = mem_perf_if.icache;
    assign mem_perf_tmp_if.dcache  = mem_perf_if.dcache;
    assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
    assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
`ifdef SM_ENABLE
    cache_perf_t smem_perf;
    assign mem_perf_tmp_if.smem = smem_perf;
`else
    assign mem_perf_tmp_if.smem = '0;
`endif
    assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif

    // One relayed reset per sub-module.
    `RESET_RELAY (dcr_data_reset, reset);
    `RESET_RELAY (schedule_reset, reset);
    `RESET_RELAY (fetch_reset, reset);
    `RESET_RELAY (decode_reset, reset);
    `RESET_RELAY (issue_reset, reset);
    `RESET_RELAY (execute_reset, reset);
    `RESET_RELAY (commit_reset, reset);

    base_dcrs_t base_dcrs;

    VX_dcr_data dcr_data (
        .clk        (clk),
        .reset      (dcr_data_reset),
        .dcr_bus_if (dcr_bus_if),
        .base_dcrs  (base_dcrs)
    );

    `SCOPE_IO_SWITCH (3)

    VX_schedule #(
        .CORE_ID (CORE_ID)
    ) schedule (
        .clk            (clk),
        .reset          (schedule_reset),

        .base_dcrs      (base_dcrs),

        .warp_ctl_if    (warp_ctl_if),
        .branch_ctl_if  (branch_ctl_if),
        .decode_sched_if(decode_sched_if),
        .commit_sched_if(commit_sched_if),

        .schedule_if    (schedule_if),
    `ifdef GBAR_ENABLE
        .gbar_bus_if    (gbar_bus_if),
    `endif
        .sched_csr_if   (sched_csr_if),

        .busy           (busy)
    );

    VX_fetch #(
        .CORE_ID (CORE_ID)
    ) fetch (
        `SCOPE_IO_BIND  (0)
        .clk            (clk),
        .reset          (fetch_reset),
        .icache_bus_if  (icache_bus_if),
        .schedule_if    (schedule_if),
        .fetch_if       (fetch_if)
    );

    VX_decode #(
        .CORE_ID (CORE_ID)
    ) decode (
        .clk            (clk),
        .reset          (decode_reset),
        .fetch_if       (fetch_if),
        .decode_if      (decode_if),
        .decode_sched_if(decode_sched_if)
    );

    VX_issue #(
        .CORE_ID (CORE_ID)
    ) issue (
        `SCOPE_IO_BIND  (1)

        .clk            (clk),
        .reset          (issue_reset),

    `ifdef PERF_ENABLE
        .perf_issue_if  (pipeline_perf_if.issue),
    `endif

        .decode_if      (decode_if),
        .writeback_if   (writeback_if),

        .alu_dispatch_if(alu_dispatch_if),
        .lsu_dispatch_if(lsu_dispatch_if),
    `ifdef EXT_F_ENABLE
        .fpu_dispatch_if(fpu_dispatch_if),
    `endif
        .sfu_dispatch_if(sfu_dispatch_if)
    );

    VX_execute #(
        .CORE_ID (CORE_ID)
    ) execute (
        `SCOPE_IO_BIND  (2)

        .clk            (clk),
        .reset          (execute_reset),

        .base_dcrs      (base_dcrs),

    `ifdef PERF_ENABLE
        .mem_perf_if    (mem_perf_tmp_if),
        .pipeline_perf_if(pipeline_perf_if),
    `endif

        .dcache_bus_if  (dcache_bus_tmp_if),

    `ifdef EXT_F_ENABLE
        .fpu_dispatch_if(fpu_dispatch_if),
        .fpu_commit_if  (fpu_commit_if),
    `endif

        .commit_csr_if  (commit_csr_if),
        .sched_csr_if   (sched_csr_if),

        .alu_dispatch_if(alu_dispatch_if),
        .lsu_dispatch_if(lsu_dispatch_if),
        .sfu_dispatch_if(sfu_dispatch_if),

        .warp_ctl_if    (warp_ctl_if),
        .branch_ctl_if  (branch_ctl_if),

        .alu_commit_if  (alu_commit_if),
        .lsu_commit_if  (lsu_commit_if),
        .sfu_commit_if  (sfu_commit_if),

        .sim_ebreak     (sim_ebreak)
    );

    VX_commit #(
        .CORE_ID (CORE_ID)
    ) commit (
        .clk            (clk),
        .reset          (commit_reset),

        .alu_commit_if  (alu_commit_if),
        .lsu_commit_if  (lsu_commit_if),
    `ifdef EXT_F_ENABLE
        .fpu_commit_if  (fpu_commit_if),
    `endif
        .sfu_commit_if  (sfu_commit_if),

        .writeback_if   (writeback_if),

        .commit_csr_if  (commit_csr_if),
        .commit_sched_if(commit_sched_if),

        .sim_wb_value   (sim_wb_value)
    );

`ifdef SM_ENABLE

    // Shared-memory unit between the execute stage and the external dcache bus.
    VX_smem_unit #(
        .CORE_ID (CORE_ID)
    ) smem_unit (
        .clk                (clk),
        .reset              (reset),
    `ifdef PERF_ENABLE
        .cache_perf         (smem_perf),
    `endif
        .dcache_bus_in_if   (dcache_bus_tmp_if),
        .dcache_bus_out_if  (dcache_bus_if)
    );

`else

    // No shared memory: connect the execute-stage dcache ports straight to the
    // external dcache bus.
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
    end

`endif

`ifdef PERF_ENABLE

    // Number of dcache read requests / write requests / responses accepted in
    // the current cycle, summed over all request ports.
    wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
    wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
    wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;

    // Signed per-cycle change in the number of outstanding reads
    // (requests accepted minus responses accepted).
    //
    // The icache delta takes the values -1, 0 and +1 and therefore needs two
    // bits. As a single bit, +1 and -1 both collapse to 1'b1, which $signed()
    // below interprets as -1, so every accepted request would decrement the
    // pending counter instead of incrementing it. The dcache delta already
    // carries one extra bit over its operands for the same reason.
    wire [1:0] perf_icache_pending_read_cycle;
    wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;

    reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;

    reg [`PERF_CTR_BITS-1:0] perf_ifetches;
    reg [`PERF_CTR_BITS-1:0] perf_loads;
    reg [`PERF_CTR_BITS-1:0] perf_stores;

    wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
    wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;

    wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire;

    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
        assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
        assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
    end

    `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
    `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);
    `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);

    // The subtraction is evaluated at the width of the left-hand side, so the
    // two's-complement result (including -1) is preserved.
    assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
    assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;

    // Running count of outstanding reads; the delta is sign-extended to the
    // counter width before being added.
    always @(posedge clk) begin
        if (reset) begin
            perf_icache_pending_reads <= '0;
            perf_dcache_pending_reads <= '0;
        end else begin
            perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
            perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
        end
    end

    reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;

    // Event counters, plus latency accumulators that add the current number of
    // outstanding reads every cycle.
    always @(posedge clk) begin
        if (reset) begin
            perf_ifetches   <= '0;
            perf_loads      <= '0;
            perf_stores     <= '0;
            perf_icache_lat <= '0;
            perf_dcache_lat <= '0;
        end else begin
            perf_ifetches   <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
            perf_loads      <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
            perf_stores     <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
            perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
            perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
        end
    end

    // Each perf field is driven exactly once.
    assign pipeline_perf_if.ifetches       = perf_ifetches;
    assign pipeline_perf_if.loads          = perf_loads;
    assign pipeline_perf_if.stores         = perf_stores;
    assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
    assign pipeline_perf_if.load_latency   = perf_dcache_lat;

`endif

endmodule

168
hw/rtl/core/VX_core_top.sv Normal file
View File

@@ -0,0 +1,168 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh"
`endif
// Flat-port wrapper around VX_core.
//
// Exposes the DCR write bus, the data-cache and instruction-cache buses and
// (when GBAR_ENABLE is defined) the global-barrier bus as plain wire ports,
// and maps them onto the interface instances that VX_core expects.
//
// Parameters:
//   CORE_ID - forwarded to VX_core.
module VX_core_top import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
// Clock
input wire clk,
input wire reset,
// DCR write port
input wire dcr_write_valid,
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_write_addr,
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_write_data,
// Data-cache request/response ports, one element per request port
output wire [DCACHE_NUM_REQS-1:0] dcache_req_valid,
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
input wire [DCACHE_NUM_REQS-1:0] dcache_rsp_valid,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_rsp_data,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_rsp_tag,
output wire [DCACHE_NUM_REQS-1:0] dcache_rsp_ready,
// Instruction-cache request/response port
output wire icache_req_valid,
output wire icache_req_rw,
output wire [ICACHE_WORD_SIZE-1:0] icache_req_byteen,
output wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr,
output wire [ICACHE_WORD_SIZE*8-1:0] icache_req_data,
output wire [ICACHE_TAG_WIDTH-1:0] icache_req_tag,
input wire icache_req_ready,
input wire icache_rsp_valid,
input wire [ICACHE_WORD_SIZE*8-1:0] icache_rsp_data,
input wire [ICACHE_TAG_WIDTH-1:0] icache_rsp_tag,
output wire icache_rsp_ready,
`ifdef GBAR_ENABLE
// Global-barrier request/response port
output wire gbar_req_valid,
output wire [`NB_WIDTH-1:0] gbar_req_id,
output wire [`NC_WIDTH-1:0] gbar_req_size_m1,
output wire [`NC_WIDTH-1:0] gbar_req_core_id,
input wire gbar_req_ready,
input wire gbar_rsp_valid,
input wire [`NB_WIDTH-1:0] gbar_rsp_id,
`endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
`ifdef GBAR_ENABLE
// Global-barrier interface <-> flat ports.
VX_gbar_bus_if gbar_bus_if();
assign gbar_req_valid = gbar_bus_if.req_valid;
assign gbar_req_id = gbar_bus_if.req_id;
assign gbar_req_size_m1 = gbar_bus_if.req_size_m1;
assign gbar_req_core_id = gbar_bus_if.req_core_id;
assign gbar_bus_if.req_ready = gbar_req_ready;
assign gbar_bus_if.rsp_valid = gbar_rsp_valid;
assign gbar_bus_if.rsp_id = gbar_rsp_id;
`endif
// DCR write interface driven from the flat ports.
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_write_valid;
assign dcr_bus_if.write_addr = dcr_write_addr;
assign dcr_bus_if.write_data = dcr_write_data;
// Data-cache interfaces <-> flat ports, one interface per request port.
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) dcache_bus_if[DCACHE_NUM_REQS]();
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign dcache_req_valid[i] = dcache_bus_if[i].req_valid;
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
assign dcache_bus_if[i].rsp_valid = dcache_rsp_valid[i];
assign dcache_bus_if[i].rsp_data.tag = dcache_rsp_tag[i];
assign dcache_bus_if[i].rsp_data.data = dcache_rsp_data[i];
assign dcache_rsp_ready[i] = dcache_bus_if[i].rsp_ready;
end
// Instruction-cache interface <-> flat ports.
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_WORD_SIZE),
.TAG_WIDTH (ICACHE_TAG_WIDTH)
) icache_bus_if();
assign icache_req_valid = icache_bus_if.req_valid;
assign icache_req_rw = icache_bus_if.req_data.rw;
assign icache_req_byteen = icache_bus_if.req_data.byteen;
assign icache_req_addr = icache_bus_if.req_data.addr;
assign icache_req_data = icache_bus_if.req_data.data;
assign icache_req_tag = icache_bus_if.req_data.tag;
assign icache_bus_if.req_ready = icache_req_ready;
assign icache_bus_if.rsp_valid = icache_rsp_valid;
assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
assign icache_bus_if.rsp_data.data = icache_rsp_data;
assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE
// Memory perf interface handed to the core; nothing in this wrapper drives
// its fields.
VX_mem_perf_if mem_perf_if();
`endif
`ifdef SCOPE
// Scope inputs are tied to zero and the scope output is left unused here.
wire [0:0] scope_reset_w = 1'b0;
wire [0:0] scope_bus_in_w = 1'b0;
wire [0:0] scope_bus_out_w;
`UNUSED_VAR (scope_bus_out_w)
`endif
VX_core #(
.CORE_ID (CORE_ID)
) core (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
`endif
.dcr_bus_if (dcr_bus_if),
.dcache_bus_if (dcache_bus_if),
.icache_bus_if (icache_bus_if),
`ifdef GBAR_ENABLE
.gbar_bus_if (gbar_bus_if),
`endif
.sim_ebreak (sim_ebreak),
.sim_wb_value (sim_wb_value),
.busy (busy)
);
endmodule

306
hw/rtl/core/VX_csr_data.sv Normal file
View File

@@ -0,0 +1,306 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh"
`endif
module VX_csr_data
import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE
import VX_fpu_pkg::*;
`endif
#(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif
VX_commit_csr_if.slave commit_csr_if,
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS],
`endif
input wire [`PERF_CTR_BITS-1:0] cycles,
input wire [`NUM_WARPS-1:0] active_warps,
input wire [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks,
input wire read_enable,
input wire [`UUID_WIDTH-1:0] read_uuid,
input wire [`NW_WIDTH-1:0] read_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] read_addr,
output wire [31:0] read_data_ro,
output wire [31:0] read_data_rw,
input wire write_enable,
input wire [`UUID_WIDTH-1:0] write_uuid,
input wire [`NW_WIDTH-1:0] write_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] write_addr,
input wire [31:0] write_data
);
`UNUSED_VAR (reset)
`UNUSED_VAR (write_wid)
`UNUSED_VAR (write_data)
// CSRs Write /////////////////////////////////////////////////////////////
`ifdef EXT_F_ENABLE
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_write_enable[i] = fpu_to_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_to_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_to_csr_if[i].write_fflags;
end
always @(*) begin
fcsr_n = fcsr;
for (integer i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
if (fpu_write_enable[i]) begin
fcsr_n[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0] = fcsr[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0]
| fpu_write_fflags[i];
end
end
if (write_enable) begin
case (write_addr)
`VX_CSR_FFLAGS: fcsr_n[write_wid][`FP_FLAGS_BITS-1:0] = write_data[`FP_FLAGS_BITS-1:0];
`VX_CSR_FRM: fcsr_n[write_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS] = write_data[`INST_FRM_BITS-1:0];
`VX_CSR_FCSR: fcsr_n[write_wid] = write_data[`FP_FLAGS_BITS+`INST_FRM_BITS-1:0];
default:;
endcase
end
end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_to_csr_if[i].read_frm = fcsr[fpu_to_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end
always @(posedge clk) begin
if (reset) begin
fcsr <= '0;
end else begin
fcsr <= fcsr_n;
end
end
`endif
always @(posedge clk) begin
if (write_enable) begin
case (write_addr)
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS,
`VX_CSR_FRM,
`VX_CSR_FCSR,
`endif
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
`VX_CSR_MNSTATUS,
`VX_CSR_MEDELEG,
`VX_CSR_MIDELEG,
`VX_CSR_MIE,
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0: /* do nothing!*/;
default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
end
endcase
end
end
// CSRs read //////////////////////////////////////////////////////////////
// Combinational CSR read decode.
//   read_data_ro_r    : value of read-only CSRs (defaults to 0)
//   read_data_rw_r    : value of read/write CSRs (defaults to 0); only the
//                       floating-point CSRs drive it, and only when
//                       EXT_F_ENABLE is defined
//   read_addr_valid_r : defaults to 1; cleared in the default branch when
//                       read_addr is not a known CSR and is outside both
//                       performance-counter windows
reg [31:0] read_data_ro_r;
reg [31:0] read_data_rw_r;
reg read_addr_valid_r;
always @(*) begin
read_data_ro_r = '0;
read_data_rw_r = '0;
read_addr_valid_r = 1;
case (read_addr)
// machine information CSRs (compile-time constants)
`VX_CSR_MVENDORID : read_data_ro_r = 32'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = 32'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = 32'(`IMPLEMENTATION_ID);
// NOTE(review): the target is 32 bits wide while the shift amount is XLEN-2;
// with XLEN=64 the shifted field presumably falls outside the 32-bit result - confirm.
`VX_CSR_MISA : read_data_ro_r = (((`CLOG2(`XLEN)-4) << (`XLEN-2)) | `MISA_STD);
`ifdef EXT_F_ENABLE
// floating-point CSRs are slices of the per-warp fcsr entry selected by read_wid
`VX_CSR_FFLAGS : read_data_rw_r = 32'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = 32'(fcsr[read_wid]);
`endif
// warp/core identification and configuration
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]);
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = 32'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = 32'(`NUM_CORES * `NUM_CLUSTERS);
// counters wider than 32 bits are exposed as a low word and a "_H" word
// holding bits [PERF_CTR_BITS-1:32], resized to 32 bits by the cast
`VX_CSR_MCYCLE : read_data_ro_r = 32'(cycles[31:0]);
`VX_CSR_MCYCLE_H : read_data_ro_r = 32'(cycles[`PERF_CTR_BITS-1:32]);
// reserved counter slots read back as don't-care
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`VX_CSR_MINSTRET : read_data_ro_r = 32'(commit_csr_if.instret[31:0]);
`VX_CSR_MINSTRET_H : read_data_ro_r = 32'(commit_csr_if.instret[`PERF_CTR_BITS-1:32]);
// the following CSRs are accepted as valid addresses but always read as zero
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
`VX_CSR_MNSTATUS,
`VX_CSR_MEDELEG,
`VX_CSR_MIDELEG,
`VX_CSR_MIE,
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_r = 32'(0);
default: begin
// unknown address: invalid unless it lies in one of the two 32-entry
// performance-counter windows starting at MPM_BASE / MPM_BASE_H
read_addr_valid_r = 0;
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
read_addr_valid_r = 1;
// without PERF_ENABLE the window is still a valid address and reads the default 0
`ifdef PERF_ENABLE
// base_dcrs.mpm_class selects which counter group the window exposes;
// addresses with no matching counter keep the default value 0
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
`else
// no FPU configured: the FPU stall counters read as zero
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0;
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
`endif
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
// PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
// PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
default:;
endcase
`endif
end
end
endcase
end
// drive the module read outputs from the combinational decode above
assign read_data_ro = read_data_ro_r;
assign read_data_rw = read_data_rw_r;
`UNUSED_VAR (base_dcrs)
// simulation check: an enabled read must target an address the decode accepted
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
// sfu_perf_if.wctl_stalls is not mapped to any CSR above; it is captured here
// only so it can be marked as unused
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
`UNUSED_VAR (perf_wctl_stalls);
`UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.smem);
`endif
endmodule

181
hw/rtl/core/VX_csr_unit.sv Normal file
View File

@@ -0,0 +1,181 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_csr_unit import VX_gpu_pkg::*; #(
    parameter CORE_ID   = 0,
    parameter NUM_LANES = 1
) (
    input wire                  clk,
    input wire                  reset,

    input base_dcrs_t           base_dcrs,

`ifdef PERF_ENABLE
    VX_mem_perf_if.slave        mem_perf_if,
    VX_pipeline_perf_if.slave   pipeline_perf_if,
    VX_sfu_perf_if.slave        sfu_perf_if,
`endif

`ifdef EXT_F_ENABLE
    VX_fpu_to_csr_if.slave      fpu_to_csr_if [`NUM_FPU_BLOCKS],
`endif

    VX_commit_csr_if.slave      commit_csr_if,
    VX_sched_csr_if.slave       sched_csr_if,
    VX_execute_if.slave         execute_if,
    VX_commit_if.master         commit_if
);
    // CSR execution unit: takes a CSR instruction from execute_if, reads and
    // optionally writes the CSR storage (VX_csr_data), and returns the read
    // value per lane on commit_if through a two-entry elastic buffer.

    `UNUSED_PARAM (CORE_ID)

    localparam PID_BITS  = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH = `UP(PID_BITS);
    // width of the response record: uuid, wid, tmask, PC, rd, wb, lane data, pid, sop, eop
    localparam DATAW     = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1;

    `UNUSED_VAR (execute_if.data.rs3_data)

    ///////////////////////////////////////////////////////////////////////////
    // Request handshake: the request is held back until the scheduler reports
    // that the issuing warp has no other instruction in flight.

    wire rsp_in_ready;

    assign sched_csr_if.alm_empty_wid = execute_if.data.wid;

    wire warp_drained = sched_csr_if.alm_empty;
    wire req_valid    = execute_if.valid && warp_drained;

    assign execute_if.ready = rsp_in_ready && warp_drained;

    ///////////////////////////////////////////////////////////////////////////
    // Instruction fields: the immediate carries the CSR address in its low
    // bits, followed by the zero-extended register-immediate operand.

    wire [`VX_CSR_ADDR_BITS-1:0] req_addr = execute_if.data.imm[`VX_CSR_ADDR_BITS-1:0];
    wire [`NRI_BITS-1:0]         req_uimm = execute_if.data.imm[`VX_CSR_ADDR_BITS +: `NRI_BITS];

    // low 32 bits of rs1 for every lane (only lane 0 is consumed below)
    wire [NUM_LANES-1:0][31:0] rs1_lo;
    `UNUSED_VAR (rs1_lo)
    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign rs1_lo[lane] = execute_if.data.rs1_data[lane][31:0];
    end

    // source operand: immediate form or lane 0 of rs1
    wire [31:0] req_operand = execute_if.data.use_imm ? 32'(req_uimm) : rs1_lo[0];

    // CSRRW always writes; set/clear forms write only with a non-zero operand
    wire is_csrrw  = (execute_if.data.op_type == `INST_SFU_CSRRW);
    wire wr_enable = (is_csrrw || (| req_operand));

    reg         rd_enable;
    reg  [31:0] wr_data;
    wire [31:0] rd_data_ro, rd_data_rw;

    ///////////////////////////////////////////////////////////////////////////
    // CSR storage

    VX_csr_data #(
        .CORE_ID (CORE_ID)
    ) csr_data (
        .clk            (clk),
        .reset          (reset),

        .base_dcrs      (base_dcrs),

    `ifdef PERF_ENABLE
        .mem_perf_if    (mem_perf_if),
        .pipeline_perf_if(pipeline_perf_if),
        .sfu_perf_if    (sfu_perf_if),
    `endif

        .commit_csr_if  (commit_csr_if),

        .cycles         (sched_csr_if.cycles),
        .active_warps   (sched_csr_if.active_warps),
        .thread_masks   (sched_csr_if.thread_masks),

    `ifdef EXT_F_ENABLE
        .fpu_to_csr_if  (fpu_to_csr_if),
    `endif

        .read_enable    (req_valid && rd_enable),
        .read_uuid      (execute_if.data.uuid),
        .read_wid       (execute_if.data.wid),
        .read_addr      (req_addr),
        .read_data_ro   (rd_data_ro),
        .read_data_rw   (rd_data_rw),

        .write_enable   (req_valid && wr_enable),
        .write_uuid     (execute_if.data.uuid),
        .write_wid      (execute_if.data.wid),
        .write_addr     (req_addr),
        .write_data     (wr_data)
    );

    ///////////////////////////////////////////////////////////////////////////
    // Per-lane thread identifiers, computed locally rather than read from storage

    wire [NUM_LANES-1:0][31:0] lane_wtid, lane_gtid;

    if (PID_BITS != 0) begin
        // several packets per warp: offset the lane index by the packet id
        for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
            assign lane_wtid[lane] = 32'(execute_if.data.pid * NUM_LANES + lane);
        end
    end else begin
        for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
            assign lane_wtid[lane] = 32'(lane);
        end
    end

    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign lane_gtid[lane] = (32'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (32'(execute_if.data.wid) << `NT_BITS) + lane_wtid[lane];
    end

    ///////////////////////////////////////////////////////////////////////////
    // Read mux: every address except the two thread-id CSRs is served by the
    // storage block (same value broadcast to all lanes) and enables its read port.

    reg [NUM_LANES-1:0][31:0] lane_rd_data;

    always @(*) begin
        lane_rd_data = {NUM_LANES{rd_data_ro | rd_data_rw}};
        rd_enable    = 1;
        case (req_addr)
        `VX_CSR_THREAD_ID : begin
            lane_rd_data = lane_wtid;
            rd_enable    = 0;
        end
        `VX_CSR_MHARTID : begin
            lane_rd_data = lane_gtid;
            rd_enable    = 0;
        end
        default:;
        endcase
    end

    ///////////////////////////////////////////////////////////////////////////
    // Write value: replace (CSRRW), set bits (CSRRS), otherwise clear bits (CSRRC)

    always @(*) begin
        wr_data = rd_data_rw & ~req_operand;
        case (execute_if.data.op_type)
        `INST_SFU_CSRRW: wr_data = req_operand;
        `INST_SFU_CSRRS: wr_data = rd_data_rw | req_operand;
        default:;
        endcase
    end

    ///////////////////////////////////////////////////////////////////////////
    // Release the warp once the last packet of the request has been accepted

    assign sched_csr_if.unlock_wid  = execute_if.data.wid;
    assign sched_csr_if.unlock_warp = req_valid && rsp_in_ready && execute_if.data.eop;

    ///////////////////////////////////////////////////////////////////////////
    // Response buffer towards commit

    wire [NUM_LANES-1:0][31:0] rsp_lane_data;

    VX_elastic_buffer #(
        .DATAW (DATAW),
        .SIZE  (2)
    ) rsp_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (req_valid),
        .ready_in  (rsp_in_ready),
        .data_in   ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, lane_rd_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, rsp_lane_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
        .valid_out (commit_if.valid),
        .ready_out (commit_if.ready)
    );

    // resize each 32-bit lane result to the register width
    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign commit_if.data.data[lane] = `XLEN'(rsp_lane_data[lane]);
    end

endmodule

View File

@@ -0,0 +1,57 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_dcr_data import VX_gpu_pkg::*; (
    input wire              clk,
    input wire              reset,

    // DCR write bus
    VX_dcr_bus_if.slave     dcr_bus_if,

    // current value of the base DCRs
    output base_dcrs_t      base_dcrs
);
    // Holds the base device configuration registers. Each accepted bus write
    // updates the field selected by the write address; other addresses are ignored.

    // the register file is not cleared by reset
    `UNUSED_VAR (reset)

    base_dcrs_t dcrs_r;

    assign base_dcrs = dcrs_r;

    always @(posedge clk) begin
        if (dcr_bus_if.write_valid) begin
            case (dcr_bus_if.write_addr)
            // startup address, low word
            `VX_DCR_BASE_STARTUP_ADDR0 : dcrs_r.startup_addr[31:0] <= dcr_bus_if.write_data;
        `ifdef XLEN_64
            // startup address, high word (64-bit builds only)
            `VX_DCR_BASE_STARTUP_ADDR1 : dcrs_r.startup_addr[63:32] <= dcr_bus_if.write_data;
        `endif
            // performance counter class, taken from the low byte of the write data
            `VX_DCR_BASE_MPM_CLASS : dcrs_r.mpm_class <= dcr_bus_if.write_data[7:0];
            default:;
            endcase
        end
    end

`ifdef DBG_TRACE_CORE_PIPELINE
    // log every DCR write when pipeline tracing is compiled in
    always @(posedge clk) begin
        if (dcr_bus_if.write_valid) begin
            `TRACE(1, ("%d: base-dcr: state=", $time));
            trace_base_dcr(1, dcr_bus_if.write_addr);
            `TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data));
        end
    end
`endif

endmodule

552
hw/rtl/core/VX_decode.sv Normal file
View File

@@ -0,0 +1,552 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
// Operand bookkeeping helpers for the decoder.
//
// USED_IREG(x) marks operand x (rd, rs1, rs2 or rs3) as an integer register:
// it copies the instruction field x into x_r and sets use_x. When the F
// extension is enabled the register index gains a leading 0 bit.
// USED_FREG(x) does the same for a floating-point register (leading 1 bit)
// and only exists when the F extension is enabled.
//
// Each macro expands to a single begin/end block so that both assignments
// stay together when the macro is used as the body of an if/else branch.
// Invoke it as a statement followed by ';' inside a procedural block.
`ifdef EXT_F_ENABLE
`define USED_IREG(x) \
    begin \
        x``_r = {1'b0, ``x}; \
        use_``x = 1; \
    end
`define USED_FREG(x) \
    begin \
        x``_r = {1'b1, ``x}; \
        use_``x = 1; \
    end
`else
`define USED_IREG(x) \
    begin \
        x``_r = ``x; \
        use_``x = 1; \
    end
`endif
// Instruction decoder.
// Combinationally decodes fetch_if.data.instr into an execution unit type,
// an operation type/modifier, register operands and an immediate, then
// forwards the result to decode_if through a VX_elastic_buffer with SIZE=0.
// On every accepted fetch it also reports the warp id and the is_wstall flag
// on decode_sched_if.
module VX_decode #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_fetch_if.slave fetch_if,
// outputs
VX_decode_if.master decode_if,
VX_decode_sched_if.master decode_sched_if
);
// width of the record passed through req_buf (must match the data_in concatenation)
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1;
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
// decoded outputs, driven by the main always block below
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
reg [`INST_MOD_BITS-1:0] op_mod;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`XLEN-1:0] imm;
reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm;
reg is_wstall;
// raw instruction fields
wire [31:0] instr = fetch_if.data.instr;
wire [6:0] opcode = instr[6:0];
wire [1:0] func2 = instr[26:25];
wire [2:0] func3 = instr[14:12];
wire [4:0] func5 = instr[31:27];
wire [6:0] func7 = instr[31:25];
wire [11:0] u_12 = instr[31:20];
wire [4:0] rd = instr[11:7];
wire [4:0] rs1 = instr[19:15];
wire [4:0] rs2 = instr[24:20];
wire [4:0] rs3 = instr[31:27];
`UNUSED_VAR (func2)
`UNUSED_VAR (func5)
`UNUSED_VAR (rs3)
`UNUSED_VAR (use_rd)
`UNUSED_VAR (use_rs1)
`UNUSED_VAR (use_rs2)
`UNUSED_VAR (use_rs3)
// true for func3 = 3'h1 or 3'h5, the values r_type maps to SLL and SRL/SRA;
// for those the immediate is only the shift amount, zero-extended
wire is_itype_sh = func3[0] && ~func3[1];
wire [19:0] ui_imm = instr[31:12];
`ifdef XLEN_64
// 6-bit shift amount for 64-bit shifts, 5-bit for the word (*W) variants
wire [11:0] i_imm = is_itype_sh ? {6'b0, instr[25:20]} : u_12;
wire [11:0] iw_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`else
wire [11:0] i_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`endif
// store, branch and jump immediates reassembled from their instruction bit fields
wire [11:0] s_imm = {func7, rd};
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
// ALU operation selected by func3; opcode[5] gates SUB so that the
// immediate form (opcode[5] clear) always decodes func3=0 as ADD
reg [`INST_ALU_BITS-1:0] r_type;
always @(*) begin
case (func3)
3'h0: r_type = (opcode[5] && func7[5]) ? `INST_ALU_SUB : `INST_ALU_ADD;
3'h1: r_type = `INST_ALU_SLL;
3'h2: r_type = `INST_ALU_SLT;
3'h3: r_type = `INST_ALU_SLTU;
3'h4: r_type = `INST_ALU_XOR;
3'h5: r_type = func7[5] ? `INST_ALU_SRA : `INST_ALU_SRL;
3'h6: r_type = `INST_ALU_OR;
3'h7: r_type = `INST_ALU_AND;
endcase
end
// branch comparison selected by func3; unlisted encodings (3'h2, 3'h3) yield 'x
reg [`INST_BR_BITS-1:0] b_type;
always @(*) begin
case (func3)
3'h0: b_type = `INST_BR_EQ;
3'h1: b_type = `INST_BR_NE;
3'h4: b_type = `INST_BR_LT;
3'h5: b_type = `INST_BR_GE;
3'h6: b_type = `INST_BR_LTU;
3'h7: b_type = `INST_BR_GEU;
default: b_type = 'x;
endcase
end
// system operation selected by the 12-bit immediate field; unlisted values yield 'x
reg [`INST_BR_BITS-1:0] s_type;
always @(*) begin
case (u_12)
12'h000: s_type = `INST_OP_BITS'(`INST_BR_ECALL);
12'h001: s_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h002: s_type = `INST_OP_BITS'(`INST_BR_URET);
12'h102: s_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h302: s_type = `INST_OP_BITS'(`INST_BR_MRET);
default: s_type = 'x;
endcase
end
`ifdef EXT_M_ENABLE
// multiply/divide operation selected by func3
reg [`INST_M_BITS-1:0] m_type;
always @(*) begin
case (func3)
3'h0: m_type = `INST_M_MUL;
3'h1: m_type = `INST_M_MULH;
3'h2: m_type = `INST_M_MULHSU;
3'h3: m_type = `INST_M_MULHU;
3'h4: m_type = `INST_M_DIV;
3'h5: m_type = `INST_M_DIVU;
3'h6: m_type = `INST_M_REM;
3'h7: m_type = `INST_M_REMU;
endcase
end
`endif
// Main decode. Every output is first given a default (no operands used,
// op_type and imm left as 'x), then overridden by the branch matching the
// opcode. Opcodes and sub-encodings with no matching branch keep the defaults.
// op_mod bits set below: [0] on the JAL/JALR/branch/system (non-CSR) paths,
// [1] on the multiply/divide paths, [2] on the XLEN_64 word (*W) paths.
always @(*) begin
ex_type = '0;
op_type = 'x;
op_mod = '0;
rd_r = '0;
rs1_r = '0;
rs2_r = '0;
rs3_r = '0;
imm = 'x;
use_imm = 0;
use_PC = 0;
use_rd = 0;
use_rs1 = 0;
use_rs2 = 0;
use_rs3 = 0;
is_wstall = 0;
case (opcode)
// register-immediate ALU operations
`INST_I: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type);
use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){i_imm[11]}}, i_imm};
`USED_IREG (rd);
`USED_IREG (rs1);
end
// register-register ALU operations; func7[0] selects multiply/divide when enabled
`INST_R: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
if (func7[0]) begin
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
`endif
begin
op_type = `INST_OP_BITS'(r_type);
end
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`ifdef XLEN_64
`INST_I_W: begin
// ADDIW, SLLIW, SRLIW, SRAIW
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type);
op_mod[2] = 1;
use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){iw_imm[11]}}, iw_imm};
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_R_W: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
if (func7[0]) begin
// MULW, DIVW, DIVUW, REMW, REMUW
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
`endif
begin
// ADDW, SUBW, SLLW, SRLW, SRAW
op_type = `INST_OP_BITS'(r_type);
end
op_mod[2] = 1;
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`endif
// upper-immediate: 20-bit field placed above 12 zero bits, sign-extended to XLEN
`INST_LUI: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
use_rd = 1;
use_imm = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd);
end
`INST_AUIPC: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
use_rd = 1;
use_imm = 1;
use_PC = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd);
end
// control transfers are sent to the ALU with op_mod[0] set and raise is_wstall
`INST_JAL: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JAL);
op_mod[0] = 1;
use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = {{(`XLEN-21){jal_imm[20]}}, jal_imm};
`USED_IREG (rd);
end
`INST_JALR: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JALR);
op_mod[0] = 1;
use_rd = 1;
use_imm = 1;
is_wstall = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_B: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(b_type);
op_mod[0] = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = {{(`XLEN-13){b_imm[12]}}, b_imm};
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`INST_FENCE: begin
ex_type = `EX_LSU;
op_type = `INST_LSU_FENCE;
end
// system opcode: a non-zero func3[1:0] is a CSR access handled by the SFU,
// otherwise the operation comes from s_type and goes to the ALU
`INST_SYS : begin
if (func3[1:0] != 0) begin
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0]));
use_rd = 1;
is_wstall = 1;
// func3[2] selects the immediate form; imm then packs the CSR address in
// its low bits with the rs1 field (used as an immediate) right above it
use_imm = func3[2];
imm[`VX_CSR_ADDR_BITS-1:0] = u_12; // addr
`USED_IREG (rd);
if (func3[2]) begin
imm[`VX_CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm
end else begin
`USED_IREG (rs1);
end
end else begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(s_type);
op_mod[0] = 1;
use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = `XLEN'd4;
`USED_IREG (rd);
end
end
// loads: opcode[2] distinguishes the floating-point load when the F extension is enabled
`ifdef EXT_F_ENABLE
`INST_FL,
`endif
`INST_L: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b0, func3});
use_rd = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
use_imm = 1;
`ifdef EXT_F_ENABLE
if (opcode[2]) begin
`USED_FREG (rd);
end else
`endif
`USED_IREG (rd);
`USED_IREG (rs1);
end
// stores: opcode[2] distinguishes the floating-point store when the F extension is enabled
`ifdef EXT_F_ENABLE
`INST_FS,
`endif
`INST_S: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b1, func3});
imm = {{(`XLEN-12){s_imm[11]}}, s_imm};
use_imm = 1;
`USED_IREG (rs1);
`ifdef EXT_F_ENABLE
if (opcode[2]) begin
`USED_FREG (rs2);
end else
`endif
`USED_IREG (rs2);
end
`ifdef EXT_F_ENABLE
// fused multiply-add family: op_type is 2'b11 followed by opcode[3:2]
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_mod = `INST_MOD_BITS'(func3);
imm[0] = func2[0]; // destination is double?
use_rd = 1;
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
`USED_FREG (rs3);
end
// remaining floating-point operations, selected by func5; op_mod defaults
// to func3 and is overridden by the INST_FPU_MISC cases below
`INST_FCI: begin
ex_type = `EX_FPU;
op_mod = `INST_MOD_BITS'(func3);
`ifdef FLEN_64
imm[0] = func2[0]; // destination is double?
`endif
use_rd = 1;
case (func5)
5'b00000, // FADD
5'b00001, // FSUB
5'b00010, // FMUL
5'b00011: begin // FDIV
op_type = `INST_OP_BITS'(func5[1:0]);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b00100: begin
// NCP: FSGNJ=0, FSGNJN=1, FSGNJX=2
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = `INST_MOD_BITS'(func3[1:0]);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b00101: begin
// NCP: FMIN=6, FMAX=7
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = func3[0] ? 7 : 6;
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
`ifdef FLEN_64
5'b01000: begin
// CVT.S.D, CVT.D.S
op_type = `INST_OP_BITS'(`INST_FPU_F2F);
`USED_FREG (rd);
`USED_FREG (rs1);
end
`endif
5'b01011: begin
// SQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
`USED_FREG (rd);
`USED_FREG (rs1);
end
5'b10100: begin
// CMP
op_type = `INST_OP_BITS'(`INST_FPU_CMP);
`USED_IREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b11000: begin
// CVT.W.X, CVT.WU.X
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_F2U) : `INST_OP_BITS'(`INST_FPU_F2I);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_IREG (rd);
`USED_FREG (rs1);
end
5'b11010: begin
// CVT.X.W, CVT.X.WU
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_U2F) : `INST_OP_BITS'(`INST_FPU_I2F);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_FREG (rd);
`USED_IREG (rs1);
end
5'b11100: begin
if (func3[0]) begin
// NCP: FCLASS=3
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 3;
end else begin
// NCP: FMV.X.W=4
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 4;
end
`USED_IREG (rd);
`USED_FREG (rs1);
end
5'b11110: begin
// NCP: FMV.W.X=5
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 5;
`USED_FREG (rd);
`USED_IREG (rs1);
end
default:;
endcase
end
`endif
// first extension opcode: func7 = 0 selects the SFU operations below by
// func3; all of them raise is_wstall
`INST_EXT1: begin
case (func7)
7'h00: begin
ex_type = `EX_SFU;
is_wstall = 1;
case (func3)
3'h0: begin // TMC
op_type = `INST_OP_BITS'(`INST_SFU_TMC);
`USED_IREG (rs1);
end
3'h1: begin // WSPAWN
op_type = `INST_OP_BITS'(`INST_SFU_WSPAWN);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_SFU_SPLIT);
use_rd = 1;
`USED_IREG (rs1);
`USED_IREG (rd);
end
3'h3: begin // JOIN
op_type = `INST_OP_BITS'(`INST_SFU_JOIN);
`USED_IREG (rs1);
end
3'h4: begin // BAR
op_type = `INST_OP_BITS'(`INST_SFU_BAR);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h5: begin // PRED
op_type = `INST_OP_BITS'(`INST_SFU_PRED);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
default:;
endcase
end
default:;
endcase
end
// second extension opcode: only func3 = 1 with func2 = 0 is decoded
`INST_EXT2: begin
case (func3)
3'h1: begin
case (func2)
2'h0: begin // CMOV
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CMOV);
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`USED_IREG (rs3);
end
default:;
endcase
end
default:;
endcase
end
default:;
endcase
end
// disable write to integer register r0
wire wb = use_rd && (rd_r != 0);
// hand the decoded record to the next stage; the field order of data_in and
// data_out must stay aligned
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (0)
) req_buf (
.clk (clk),
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, use_PC, imm, use_imm, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.use_PC, decode_if.data.imm, decode_if.data.use_imm, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
);
///////////////////////////////////////////////////////////////////////////
// notify the scheduler whenever a fetched instruction is accepted, passing
// along its warp id and the is_wstall flag computed above
wire fetch_fire = fetch_if.valid && fetch_if.ready;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
// instruction-buffer pop indication is passed straight back to the fetch stage
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`ifdef DBG_TRACE_CORE_PIPELINE
// log each decoded instruction as it leaves the stage
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%d: core%0d-decode: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, CORE_ID, decode_if.data.wid, decode_if.data.PC, instr));
trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.rd, decode_if.data.rs2, decode_if.data.use_imm, decode_if.data.imm);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=0x%0h, opds=%b%b%b%b, use_pc=%b, use_imm=%b (#%0d)\n",
decode_if.data.op_mod, decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, decode_if.data.imm, use_rd, use_rs1, use_rs2, use_rs3, decode_if.data.use_PC, decode_if.data.use_imm, decode_if.data.uuid));
end
end
`endif
endmodule

227
hw/rtl/core/VX_dispatch.sv Normal file
View File

@@ -0,0 +1,227 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
`endif
// inputs
VX_operands_if.slave operands_if [`ISSUE_WIDTH],
// outputs
VX_dispatch_if.master alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
wire [`ISSUE_WIDTH-1:0][`NT_WIDTH-1:0] last_active_tid;
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign tids[i] = `NT_WIDTH'(i);
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_find_first #(
.N (`NUM_THREADS),
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if[i].data.tmask),
.data_in (tids),
.data_out (last_active_tid[i]),
`UNUSED_PIN (valid_out)
);
end
// ALU dispatch
VX_operands_if alu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU);
assign alu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (alu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) alu_buffer (
.clk (clk),
.reset (alu_reset),
.valid_in (alu_operands_if[i].valid),
.ready_in (alu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(alu_operands_if[i].data, last_active_tid[i])),
.data_out (alu_dispatch_if[i].data),
.valid_out (alu_dispatch_if[i].valid),
.ready_out (alu_dispatch_if[i].ready)
);
end
// LSU dispatch
VX_operands_if lsu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU);
assign lsu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (lsu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) lsu_buffer (
.clk (clk),
.reset (lsu_reset),
.valid_in (lsu_operands_if[i].valid),
.ready_in (lsu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(lsu_operands_if[i].data, last_active_tid[i])),
.data_out (lsu_dispatch_if[i].data),
.valid_out (lsu_dispatch_if[i].valid),
.ready_out (lsu_dispatch_if[i].ready)
);
end
// FPU dispatch
`ifdef EXT_F_ENABLE
VX_operands_if fpu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU);
assign fpu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (fpu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) fpu_buffer (
.clk (clk),
.reset (fpu_reset),
.valid_in (fpu_operands_if[i].valid),
.ready_in (fpu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(fpu_operands_if[i].data, last_active_tid[i])),
.data_out (fpu_dispatch_if[i].data),
.valid_out (fpu_dispatch_if[i].valid),
.ready_out (fpu_dispatch_if[i].ready)
);
end
`endif
// SFU dispatch
VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign sfu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_SFU);
assign sfu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (sfu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) sfu_buffer (
.clk (clk),
.reset (sfu_reset),
.valid_in (sfu_operands_if[i].valid),
.ready_in (sfu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(sfu_operands_if[i].data, last_active_tid[i])),
.data_out (sfu_dispatch_if[i].data),
.valid_out (sfu_dispatch_if[i].valid),
.ready_out (sfu_dispatch_if[i].ready)
);
end
// can take next request?
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU))
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
`ifdef EXT_F_ENABLE
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
`endif
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
end
`ifdef PERF_ENABLE
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r;
wire [`ISSUE_WIDTH-1:0] operands_stall;
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready;
assign operands_ex_type[i] = operands_if[i].data.ex_type;
end
always @(*) begin
perf_stalls_n = perf_stalls_r;
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
if (operands_stall[i]) begin
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
end
end
end
always @(posedge clk) begin
if (reset) begin
perf_stalls_r <= '0;
end else begin
perf_stalls_r <= perf_stalls_n;
end
end
for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
assign perf_stalls[i] = perf_stalls_r[i];
end
`endif
`ifdef DBG_TRACE_CORE_PIPELINE
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
trace_ex_type(1, operands_if[i].data.ex_type);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
`TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, operands_if[i].data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, operands_if[i].data.rs3_data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
end
end
end
`endif
endmodule

View File

@@ -0,0 +1,256 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Dispatch unit.
//
// Takes one dispatch stream per issue slot (`ISSUE_WIDTH inputs) and drives
// BLOCK_SIZE execute streams. Issue slots are served in groups ("batches") of
// BLOCK_SIZE slots selected by batch_idx. When NUM_LANES is smaller than
// `NUM_THREADS, each dispatched instruction is split into NUM_PACKETS packets of
// NUM_LANES threads and sent one packet at a time, each tagged with its packet
// id (pid) and start/end-of-packet flags (sop/eop).
//
// Parameters:
//   BLOCK_SIZE - number of execute_if outputs.
//   NUM_LANES  - number of threads carried by each output packet.
//   OUT_REG    - forwarded to the output elastic buffer through the
//                OUT_REG_TO_EB_SIZE / OUT_REG_TO_EB_REG macros.
//   MAX_FANOUT - only used to derive FANOUT_ENABLE, which inserts a register
//                stage on the packet-selection path.
module VX_dispatch_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1,
parameter OUT_REG = 0,
parameter MAX_FANOUT = `MAX_FANOUT
) (
input wire clk,
input wire reset,
// inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// outputs
VX_execute_if.master execute_if [BLOCK_SIZE]
);
// `NUM_THREADS must be an exact multiple of NUM_LANES.
`STATIC_ASSERT ((`NUM_THREADS == NUM_LANES * (`NUM_THREADS / NUM_LANES)), ("invalid parameter"))
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
// number of NUM_LANES-wide packets needed to cover all threads
localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
localparam PID_BITS = `CLOG2(NUM_PACKETS);
localparam PID_WIDTH = `UP(PID_BITS);
// number of groups of BLOCK_SIZE issue slots
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
// widths of the flattened input (dispatch) and output (execute) records
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT/2));
// bit offset of the thread mask inside the input record (the UUID and WIS
// fields sit above it)
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
// the three register-operand arrays occupy the lowest bits of the input record
localparam DATA_REGS_OFF = 0;
// flatten the interface array into packed vectors so it can be indexed with a
// run-time issue index
wire [`ISSUE_WIDTH-1:0] dispatch_valid;
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
// per-block state exported by the generate loop below
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0] block_pid;
wire [BLOCK_SIZE-1:0] block_sop;
wire [BLOCK_SIZE-1:0] block_eop;
wire [BLOCK_SIZE-1:0] block_done;
// the current batch is finished once every block reports done
wire batch_done = (& block_done);
// index of the batch of issue slots currently being served; advances on
// batch_done and is tied to zero when there is a single batch
logic [BATCH_COUNT_W-1:0] batch_idx;
if (BATCH_COUNT != 1) begin
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else if (batch_done) begin
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
end
end
end else begin
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
// issue slot served by this block in the current batch
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
// handshake with this block's output buffer
wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin
// ---- packet-splitting path: several packets per instruction ----
// sent_mask_p: one bit per packet, set once that packet has been sent
reg [NUM_PACKETS-1:0] sent_mask_p;
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
wire dispatch_valid_r;
// is_first_p: set until the first packet of the instruction fires
reg is_first_p;
wire fire_p = valid_p && ready_p;
wire is_last_p = (start_p == end_p);
wire fire_eop = fire_p && is_last_p;
// progress tracking: cleared on batch_done when there are several
// batches, otherwise when the last packet fires
always @(posedge clk) begin
if (reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
if ((BATCH_COUNT != 1) ? batch_done : fire_eop) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else if (fire_p) begin
sent_mask_p[start_p] <= 1;
is_first_p <= 0;
end
end
end
// slice thread mask and register operands into per-packet views
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
// operand layout in the input record: rs1 highest, then rs2, rs3 lowest
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar j = 0; j < NUM_LANES; ++j) begin
// thread k is lane j of packet i
localparam k = i * NUM_LANES + j;
assign per_packet_tmask[i][j] = dispatch_tmask[k];
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
assign per_packet_regs[i][1][j] = dispatch_rs2_data[k];
assign per_packet_regs[i][2][j] = dispatch_rs3_data[k];
end
end
// a packet needs to be sent only if at least one of its threads is active
wire [NUM_PACKETS-1:0] packet_valids;
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
assign packet_valids[i] = (| per_packet_tmask[i]);
assign packet_ids[i] = PID_WIDTH'(i);
end
// next packet to send: chosen among active packets not yet sent
// NOTE(review): presumably the lowest-index one (REVERSE=0) - confirm against VX_find_first
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (0)
) find_first (
.valid_in (packet_valids & ~sent_mask_p),
.data_in (packet_ids),
.data_out (start_p_n),
`UNUSED_PIN (valid_out)
);
// final packet of the instruction: chosen among all active packets
// NOTE(review): presumably the highest-index one (REVERSE=1) - confirm against VX_find_first
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (1)
) find_last (
.valid_in (packet_valids),
.data_in (packet_ids),
.data_out (end_p),
`UNUSED_PIN (valid_out)
);
// optional register stage (DEPTH=1 only when FANOUT_ENABLE) on the valid bit
// and the selected packet id; its reset is also asserted whenever a packet fires
VX_pipe_register #(
.DATAW (1 + PID_WIDTH),
.RESETW (1),
.DEPTH (FANOUT_ENABLE ? 1 : 0)
) pipe_reg (
.clk (clk),
.reset (reset || fire_p), // should flush on fire
.enable (1'b1),
.data_in ({dispatch_valid[issue_idx], start_p_n}),
.data_out ({dispatch_valid_r, start_p})
);
// payload of the selected packet
wire [NUM_LANES-1:0] tmask_p = per_packet_tmask[start_p];
wire [2:0][NUM_LANES-1:0][`XLEN-1:0] regs_p = per_packet_regs[start_p];
// with several batches, stop issuing once every bit of sent_mask_p is set
wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p));
assign valid_p = dispatch_valid_r && block_enable;
assign block_tmask[block_idx] = tmask_p;
assign block_regs[block_idx] = regs_p;
assign block_pid[block_idx] = start_p;
assign block_sop[block_idx] = is_first_p;
assign block_eop[block_idx] = is_last_p;
// with the extra register stage, readiness is additionally qualified by
// the registered valid bit
if (FANOUT_ENABLE) begin
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
end else begin
assign block_ready[block_idx] = ready_p && block_enable;
end
// done when the slot holds nothing valid or its last packet fires
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
end else begin
// ---- pass-through path: one packet carries all threads ----
assign valid_p = dispatch_valid[issue_idx];
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_pid[block_idx] = '0;
assign block_sop[block_idx] = 1'b1;
assign block_eop[block_idx] = 1'b1;
assign block_ready[block_idx] = ready_p;
assign block_done[block_idx] = ~valid_p || ready_p;
end
// issue-slot index handed to wis_to_wid, built from batch_idx and block_idx
wire [ISSUE_IDX_W-1:0] wsi;
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin
assign wsi = batch_idx;
end
end else begin
assign wsi = block_idx;
end
`RESET_RELAY(buf_out_reset, reset);
// warp id computed from the WIS field of the input record and the slot index
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);
// output buffer: the record is the input record with WIS replaced by the warp
// id, thread mask and operands replaced by the selected packet's slices, and
// pid/sop/eop appended at the bottom
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)),
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
) buf_out (
.clk (clk),
.reset (buf_out_reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
block_wid,
block_tmask[block_idx],
dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
block_regs[block_idx][0],
block_regs[block_idx][1],
block_regs[block_idx][2],
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
.data_out (execute_if[block_idx].data),
.valid_out (execute_if[block_idx].valid),
.ready_out (execute_if[block_idx].ready)
);
end
// input back-pressure: only slots of the current batch can be ready, and a slot
// is released only when its block is ready on the last packet (eop)
reg [`ISSUE_WIDTH-1:0] ready_in;
always @(*) begin
ready_in = 0;
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
end
end
assign dispatch_ready = ready_in;
endmodule

137
hw/rtl/core/VX_execute.sv Normal file
View File

@@ -0,0 +1,137 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Execute stage wrapper.
//
// Instantiates the ALU, LSU and SFU units, plus the FPU unit when
// EXT_F_ENABLE is defined, and wires each one to its own dispatch/commit
// interface arrays. Every unit is driven by its own relayed copy of reset.
// Also derives sim_ebreak, a simulation-only flag raised when issue slot 0
// accepts an EBREAK/ECALL branch-class ALU instruction with wis == 0.
module VX_execute import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    input wire              clk,
    input wire              reset,

    input base_dcrs_t       base_dcrs,

    // Dcache interface
    VX_mem_bus_if.master    dcache_bus_if [DCACHE_NUM_REQS],

    // commit interface
    VX_commit_csr_if.slave  commit_csr_if,

    // fetch interface
    VX_sched_csr_if.slave   sched_csr_if,

`ifdef PERF_ENABLE
    VX_mem_perf_if.slave    mem_perf_if,
    VX_pipeline_perf_if.slave pipeline_perf_if,
`endif

`ifdef EXT_F_ENABLE
    VX_dispatch_if.slave    fpu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     fpu_commit_if [`ISSUE_WIDTH],
`endif

    VX_dispatch_if.slave    alu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     alu_commit_if [`ISSUE_WIDTH],
    VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],

    VX_dispatch_if.slave    lsu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     lsu_commit_if [`ISSUE_WIDTH],

    VX_dispatch_if.slave    sfu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     sfu_commit_if [`ISSUE_WIDTH],
    VX_warp_ctl_if.master   warp_ctl_if,

    // simulation helper signals
    output wire             sim_ebreak
);

`ifdef EXT_F_ENABLE
    // link between the FPU blocks and the SFU unit
    VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
`endif

    // per-unit reset copies
    `RESET_RELAY (alu_reset, reset);
    `RESET_RELAY (lsu_reset, reset);
    `RESET_RELAY (sfu_reset, reset);

    // ---------------------------------------------------------------- ALU

    VX_alu_unit #(
        .CORE_ID (CORE_ID)
    ) alu_unit (
        .clk            (clk),
        .reset          (alu_reset),
        .dispatch_if    (alu_dispatch_if),
        .commit_if      (alu_commit_if),
        .branch_ctl_if  (branch_ctl_if)
    );

    // ---------------------------------------------------------------- LSU

    `SCOPE_IO_SWITCH (1)

    VX_lsu_unit #(
        .CORE_ID (CORE_ID)
    ) lsu_unit (
        `SCOPE_IO_BIND  (0)
        .clk            (clk),
        .reset          (lsu_reset),
        .dispatch_if    (lsu_dispatch_if),
        .commit_if      (lsu_commit_if),
        .cache_bus_if   (dcache_bus_if)
    );

    // ---------------------------------------------------------------- FPU

`ifdef EXT_F_ENABLE
    `RESET_RELAY (fpu_reset, reset);

    VX_fpu_unit #(
        .CORE_ID (CORE_ID)
    ) fpu_unit (
        .clk            (clk),
        .reset          (fpu_reset),
        .dispatch_if    (fpu_dispatch_if),
        .commit_if      (fpu_commit_if),
        .fpu_to_csr_if  (fpu_to_csr_if)
    );
`endif

    // ---------------------------------------------------------------- SFU

    VX_sfu_unit #(
        .CORE_ID (CORE_ID)
    ) sfu_unit (
        .clk            (clk),
        .reset          (sfu_reset),
    `ifdef PERF_ENABLE
        .mem_perf_if    (mem_perf_if),
        .pipeline_perf_if (pipeline_perf_if),
    `endif
    `ifdef EXT_F_ENABLE
        .fpu_to_csr_if  (fpu_to_csr_if),
    `endif
        .base_dcrs      (base_dcrs),
        .dispatch_if    (sfu_dispatch_if),
        .commit_if      (sfu_commit_if),
        .commit_csr_if  (commit_csr_if),
        .sched_csr_if   (sched_csr_if),
        .warp_ctl_if    (warp_ctl_if)
    );

    // simulation helper signal to get RISC-V tests Pass/Fail status:
    // asserted when issue slot 0 hands an EBREAK or ECALL to the ALU for wis 0
    wire [`INST_BR_BITS-1:0] slot0_br_op = `INST_BR_BITS'(alu_dispatch_if[0].data.op_type);
    wire slot0_fire    = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready;
    wire slot0_is_trap = (slot0_br_op == `INST_BR_EBREAK)
                      || (slot0_br_op == `INST_BR_ECALL);

    assign sim_ebreak = slot0_fire
                     && (alu_dispatch_if[0].data.wis == 0)
                     && `INST_ALU_IS_BR(alu_dispatch_if[0].data.op_mod)
                     && slot0_is_trap;

endmodule

184
hw/rtl/core/VX_fetch.sv Normal file
View File

@@ -0,0 +1,184 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Fetch stage.
//
// Turns scheduler requests into instruction-cache requests and turns the
// cache responses back into fetch_if records. The PC and thread mask of each
// outstanding request are parked in a small RAM indexed by warp id, and the
// warp id itself travels in the request tag together with the UUID.
// A per-issue-slot pending counter blocks new requests for a slot once
// `IBUF_SIZE of them are outstanding.
module VX_fetch import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    input wire              clk,
    input wire              reset,

    // Icache interface
    VX_mem_bus_if.master    icache_bus_if,

    // inputs
    VX_schedule_if.slave    schedule_if,

    // outputs
    VX_fetch_if.master      fetch_if
);
    `UNUSED_PARAM (CORE_ID)
    `UNUSED_VAR (reset)

    localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);

    // request side (input of the request buffer)
    wire                         icache_req_valid;
    wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
    wire [ICACHE_TAG_WIDTH-1:0]  icache_req_tag;
    wire                         icache_req_ready;

    wire [`UUID_WIDTH-1:0]       rsp_uuid;
    wire [`NW_WIDTH-1:0]         req_tag, rsp_tag;

    wire icache_req_fire = icache_req_valid && icache_req_ready;

    // issue slot the scheduled warp belongs to
    wire [ISW_WIDTH-1:0] sched_isw = wid_to_isw(schedule_if.data.wid);

    // the warp id is the low part of the cache tag, the UUID the high part
    assign req_tag = schedule_if.data.wid;
    assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;

    // PC / thread mask of the in-flight request of each warp:
    // written when a request is accepted, read back with the response's warp id
    wire [`XLEN-1:0]        rsp_PC;
    wire [`NUM_THREADS-1:0] rsp_tmask;

    VX_dp_ram #(
        .DATAW  (`XLEN + `NUM_THREADS),
        .SIZE   (`NUM_WARPS),
        .LUTRAM (1)
    ) tag_store (
        .clk   (clk),
        .read  (1'b1),
        .write (icache_req_fire),
        `UNUSED_PIN (wren),
        .waddr (req_tag),
        .wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
        .raddr (rsp_tag),
        .rdata ({rsp_PC, rsp_tmask})
    );

    // Keep the instruction buffers from filling up.
    // A full ibuffer combined with an LSU stall on a pending dcache request can
    // deadlock the core; this is most likely when the icache and dcache are
    // disabled and both request streams share the same bus.
    wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;

    for (genvar slot = 0; slot < `ISSUE_WIDTH; ++slot) begin
        VX_pending_size #(
            .SIZE (`IBUF_SIZE)
        ) pending_reads (
            .clk   (clk),
            .reset (reset),
            .incr  (icache_req_fire && (sched_isw == slot)),
            .decr  (fetch_if.ibuf_pop[slot]),
            .full  (pending_ibuf_full[slot]),
            `UNUSED_PIN (size),
            `UNUSED_PIN (empty)
        );
    end

    `RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
        ("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))

    // Icache Request

    // the targeted slot still has room for one more outstanding fetch
    wire ibuf_has_room = ~pending_ibuf_full[sched_isw];

    assign icache_req_valid  = schedule_if.valid && ibuf_has_room;
    assign schedule_if.ready = icache_req_ready && ibuf_has_room;

    assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
    assign icache_req_tag  = {schedule_if.data.uuid, req_tag};

    VX_elastic_buffer #(
        .DATAW   (ICACHE_ADDR_WIDTH + ICACHE_TAG_WIDTH),
        .SIZE    (2),
        .OUT_REG (1) // external bus should be registered
    ) req_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (icache_req_valid),
        .ready_in  (icache_req_ready),
        .data_in   ({icache_req_addr, icache_req_tag}),
        .data_out  ({icache_bus_if.req_data.addr, icache_bus_if.req_data.tag}),
        .valid_out (icache_bus_if.req_valid),
        .ready_out (icache_bus_if.req_ready)
    );

    // requests are always full-word reads
    assign icache_bus_if.req_data.rw     = 0;
    assign icache_bus_if.req_data.byteen = 4'b1111;
    assign icache_bus_if.req_data.data   = '0;

    // Icache Response

    assign fetch_if.valid      = icache_bus_if.rsp_valid;
    assign fetch_if.data.uuid  = rsp_uuid;
    assign fetch_if.data.wid   = rsp_tag;
    assign fetch_if.data.tmask = rsp_tmask;
    assign fetch_if.data.PC    = rsp_PC;
    assign fetch_if.data.instr = icache_bus_if.rsp_data.data;

    assign icache_bus_if.rsp_ready = fetch_if.ready;

`ifdef DBG_SCOPE_FETCH
    if (CORE_ID == 0) begin
    `ifdef SCOPE
        wire schedule_fire   = schedule_if.valid && schedule_if.ready;
        wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
        VX_scope_tap #(
            .SCOPE_ID (1),
            .TRIGGERW (4),
            .PROBEW   (3*`UUID_WIDTH + 108)
        ) scope_tap (
            .clk(clk),
            .reset(scope_reset),
            .start(1'b0),
            .stop(1'b0),
            .triggers({
                reset,
                schedule_fire,
                icache_req_fire,
                icache_rsp_fire
            }),
            .probes({
                schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
                icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
                icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
            }),
            .bus_in(scope_bus_in),
            .bus_out(scope_bus_out)
        );
    `endif
    `ifdef CHIPSCOPE
        ila_fetch ila_fetch_inst (
            .clk    (clk),
            .probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
            .probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
            .probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
        );
    `endif
    end
`else
    `SCOPE_IO_UNUSED()
`endif

`ifdef DBG_TRACE_CORE_ICACHE
    // log every accepted request and every delivered response
    wire schedule_fire = schedule_if.valid && schedule_if.ready;
    wire fetch_fire    = fetch_if.valid && fetch_if.ready;
    always @(posedge clk) begin
        if (schedule_fire) begin
            `TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid));
        end
        if (fetch_fire) begin
            `TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
        end
    end
`endif

endmodule

View File

@@ -0,0 +1,129 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Gather unit.
//
// Routes the commit streams of BLOCK_SIZE execution blocks onto the
// `ISSUE_WIDTH commit outputs. The destination slot of each input is derived
// from the upper bits of its warp id field and/or its block index. Each output
// is buffered, and the NUM_LANES-wide packet it carries is expanded into a
// `NUM_THREADS-wide record by placing the lanes at the position given by the
// packet id (pid).
module VX_gather_unit import VX_gpu_pkg::*; #(
    parameter BLOCK_SIZE = 1,
    parameter NUM_LANES  = 1,
    parameter OUT_REG    = 0
) (
    input wire              clk,
    input wire              reset,

    // inputs
    VX_commit_if.slave      commit_in_if [BLOCK_SIZE],

    // outputs
    VX_commit_if.master     commit_out_if [`ISSUE_WIDTH]
);
    localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
    localparam PID_BITS     = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH    = `UP(PID_BITS);
    // width of one flattened NUM_LANES-wide commit record
    localparam DATAW        = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
    // bit offset of the warp id field inside that record
    localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);

    // packed views of the input interfaces
    wire [BLOCK_SIZE-1:0]                  src_valid;
    wire [BLOCK_SIZE-1:0][DATAW-1:0]       src_data;
    wire [BLOCK_SIZE-1:0]                  src_ready;
    // destination issue slot of each input
    wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] src_slot;

    for (genvar b = 0; b < BLOCK_SIZE; ++b) begin
        assign src_valid[b]          = commit_in_if[b].valid;
        assign src_data[b]           = commit_in_if[b].data;
        assign commit_in_if[b].ready = src_ready[b];

        if (BLOCK_SIZE == `ISSUE_WIDTH) begin
            // one block per slot: the slot is the block index
            assign src_slot[b] = BLOCK_SIZE_W'(b);
        end else if (BLOCK_SIZE != 1) begin
            // upper slot bits come from the record, lower bits are the block index
            assign src_slot[b] = {src_data[b][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(b)};
        end else begin
            // single block: the whole slot index comes from the record
            assign src_slot[b] = src_data[b][DATA_WIS_OFF +: ISSUE_IDX_W];
        end
    end

    // scatter the inputs onto their destination slots
    reg  [`ISSUE_WIDTH-1:0]            dst_valid;
    reg  [`ISSUE_WIDTH-1:0][DATAW-1:0] dst_data;
    wire [`ISSUE_WIDTH-1:0]            dst_ready;

    always @(*) begin
        dst_valid = '0;
        for (integer s = 0; s < `ISSUE_WIDTH; ++s) begin
            dst_data[s] = 'x;
        end
        for (integer b = 0; b < BLOCK_SIZE; ++b) begin
            dst_valid[src_slot[b]] = src_valid[b];
            dst_data[src_slot[b]]  = src_data[b];
        end
    end

    // each input sees the ready of the slot it targets
    for (genvar b = 0; b < BLOCK_SIZE; ++b) begin
        assign src_ready[b] = dst_ready[src_slot[b]];
    end

    for (genvar s = 0; s < `ISSUE_WIDTH; ++s) begin
        // buffered NUM_LANES-wide record for this slot
        VX_commit_if #(
            .NUM_LANES (NUM_LANES)
        ) commit_tmp_if();

        `RESET_RELAY(commit_out_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (`OUT_REG_TO_EB_SIZE(OUT_REG)),
            .OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
        ) out_buf (
            .clk       (clk),
            .reset     (commit_out_reset),
            .valid_in  (dst_valid[s]),
            .ready_in  (dst_ready[s]),
            .data_in   (dst_data[s]),
            .data_out  (commit_tmp_if.data),
            .valid_out (commit_tmp_if.valid),
            .ready_out (commit_tmp_if.ready)
        );

        // expand the packet to the full thread width
        logic [`NUM_THREADS-1:0]            wide_tmask;
        logic [`NUM_THREADS-1:0][`XLEN-1:0] wide_data;

        if (PID_BITS != 0) begin
            // several packets per warp: lane l of packet pid is thread pid * NUM_LANES + l,
            // all other threads are masked off and their data left undefined
            always @(*) begin
                wide_tmask = '0;
                wide_data  = 'x;
                for (integer l = 0; l < NUM_LANES; ++l) begin
                    wide_tmask[commit_tmp_if.data.pid * NUM_LANES + l] = commit_tmp_if.data.tmask[l];
                    wide_data[commit_tmp_if.data.pid * NUM_LANES + l]  = commit_tmp_if.data.data[l];
                end
            end
        end else begin
            // a single packet already covers every thread
            assign wide_tmask = commit_tmp_if.data.tmask;
            assign wide_data  = commit_tmp_if.data.data;
        end

        assign commit_out_if[s].valid = commit_tmp_if.valid;
        assign commit_tmp_if.ready    = commit_out_if[s].ready;

        assign commit_out_if[s].data = {
            commit_tmp_if.data.uuid,
            commit_tmp_if.data.wid,
            wide_tmask,
            commit_tmp_if.data.PC,
            commit_tmp_if.data.wb,
            commit_tmp_if.data.rd,
            wide_data,
            1'b0, // PID
            commit_tmp_if.data.sop,
            commit_tmp_if.data.eop
        };
    end

endmodule

73
hw/rtl/core/VX_ibuffer.sv Normal file
View File

@@ -0,0 +1,73 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Instruction buffer.
//
// Fans the single decode stream out to `ISSUE_WIDTH buffered streams, one per
// issue slot. The slot is derived from the warp id with wid_to_isw, and the
// warp id is replaced in the stored record by wid_to_wis(wid).
// decode_if.ibuf_pop[i] pulses whenever slot i hands an instruction downstream.
module VX_ibuffer import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    input wire              clk,
    input wire              reset,

    // inputs
    VX_decode_if.slave      decode_if,

    // outputs
    VX_ibuffer_if.master    ibuffer_if [`ISSUE_WIDTH]
);
    `UNUSED_PARAM (CORE_ID)

    localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
    // width of one buffered instruction record
    localparam DATAW     = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);

    // input-side ready of every slot buffer
    wire [`ISSUE_WIDTH-1:0] slot_ready;

    // destination slot and slot-local warp index of the decoded instruction
    wire [ISW_WIDTH-1:0]   dst_isw = wid_to_isw(decode_if.data.wid);
    wire [ISSUE_WIS_W-1:0] dst_wis = wid_to_wis(decode_if.data.wid);

    // decode is back-pressured by the buffer of the slot it targets
    assign decode_if.ready = slot_ready[dst_isw];

    // record stored in the buffers (identical for every slot)
    wire [DATAW-1:0] instr_record = {
        decode_if.data.uuid,
        dst_wis,
        decode_if.data.tmask,
        decode_if.data.ex_type,
        decode_if.data.op_type,
        decode_if.data.op_mod,
        decode_if.data.wb,
        decode_if.data.use_PC,
        decode_if.data.use_imm,
        decode_if.data.PC,
        decode_if.data.imm,
        decode_if.data.rd,
        decode_if.data.rs1,
        decode_if.data.rs2,
        decode_if.data.rs3
    };

    for (genvar slot = 0; slot < `ISSUE_WIDTH; ++slot) begin
        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (`IBUF_SIZE),
            .OUT_REG (1)
        ) instr_buf (
            .clk       (clk),
            .reset     (reset),
            .valid_in  (decode_if.valid && (dst_isw == slot)),
            .ready_in  (slot_ready[slot]),
            .data_in   (instr_record),
            .data_out  (ibuffer_if[slot].data),
            .valid_out (ibuffer_if[slot].valid),
            .ready_out (ibuffer_if[slot].ready)
        );

        // report every instruction leaving this slot's buffer
        assign decode_if.ibuf_pop[slot] = ibuffer_if[slot].valid && ibuffer_if[slot].ready;
    end

endmodule

191
hw/rtl/core/VX_int_unit.sv Normal file
View File

@@ -0,0 +1,191 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Integer ALU and branch resolution for one execution block.
//
// Computes, for each of NUM_LANES lanes, the add / subtract-compare / shift-right /
// logic-and-shift-left result (plus the 32-bit "W" variants when XLEN_64 is
// defined), buffers the selected result together with the instruction metadata,
// and drives both the commit interface and a registered branch-control output.
//
// Parameters:
//   CORE_ID   - only used in the trace message.
//   BLOCK_IDX - passed to ASSIGN_BLOCKED_WID when forming the branch warp id.
//   NUM_LANES - number of lanes processed in parallel.
module VX_int_unit #(
parameter CORE_ID = 0,
parameter BLOCK_IDX = 0,
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if,
VX_branch_ctl_if.master branch_ctl_if
);
`UNUSED_PARAM (CORE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// number of shift-amount bits for full-width shifts
localparam SHIFT_IMM_BITS = `CLOG2(`XLEN);
`UNUSED_VAR (execute_if.data.rs3_data)
// full-width per-lane results
wire [NUM_LANES-1:0][`XLEN-1:0] add_result;
wire [NUM_LANES-1:0][`XLEN:0] sub_result; // +1 bit for branch compare
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result;
reg [NUM_LANES-1:0][`XLEN-1:0] msc_result;
// 32-bit ("W") per-lane results, sign-extended to XLEN
wire [NUM_LANES-1:0][`XLEN-1:0] add_result_w;
wire [NUM_LANES-1:0][`XLEN-1:0] sub_result_w;
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result_w;
reg [NUM_LANES-1:0][`XLEN-1:0] msc_result_w;
// selected result before (alu_result) and after (alu_result_r) the response buffer
reg [NUM_LANES-1:0][`XLEN-1:0] alu_result;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_result_r;
// "W" operations are only decoded when XLEN_64 is defined
`ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod);
`else
wire is_alu_w = 0;
`endif
`UNUSED_VAR (execute_if.data.op_mod)
// op_type is viewed both as an ALU opcode and as a branch opcode
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(execute_if.data.op_type);
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(execute_if.data.op_type);
wire is_br_op = `INST_ALU_IS_BR(execute_if.data.op_mod);
wire is_sub_op = `INST_ALU_IS_SUB(alu_op);
wire is_signed = `INST_ALU_SIGNED(alu_op);
// 2-bit class that selects which result group is used in the case statement below
wire [1:0] op_class = is_br_op ? `INST_BR_CLASS(alu_op) : `INST_ALU_CLASS(alu_op);
// operand selection: rs1 or PC for the first input, rs2 or immediate for the second
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.use_PC ? {NUM_LANES{execute_if.data.PC}} : alu_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.use_imm ? {NUM_LANES{execute_if.data.imm}} : alu_in2;
// branches always compare against rs2, even when use_imm is set
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.use_imm && ~is_br_op) ? {NUM_LANES{execute_if.data.imm}} : alu_in2;
// adder
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
assign add_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] + alu_in2_imm[i][31:0]));
end
// subtractor / comparator: operands are widened by one bit (the sign bit for
// signed operations, zero otherwise) so that the top bit of the difference
// carries the comparison outcome
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] sub_in1 = {is_signed & alu_in1[i][`XLEN-1], alu_in1[i]};
wire [`XLEN:0] sub_in2 = {is_signed & alu_in2_br[i][`XLEN-1], alu_in2_br[i]};
assign sub_result[i] = sub_in1 - sub_in2;
assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0]));
end
// right shifter: an arithmetic shift on the one-bit-widened value; the extra
// top bit is the sign bit for signed operations and zero otherwise
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
assign shr_result[i] = `XLEN'($signed(shr_in1) >>> alu_in2_imm[i][SHIFT_IMM_BITS-1:0]);
wire [32:0] shr_in1_w = {is_signed && alu_in1[i][31], alu_in1[i][31:0]};
wire [31:0] shr_res_w = 32'($signed(shr_in1_w) >>> alu_in2_imm[i][4:0]);
assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
end
// logic operations and left shift, selected by the two low opcode bits
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
case (alu_op[1:0])
2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND
2'b01: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; // OR
2'b10: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; // XOR
2'b11: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL
endcase
end
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0]));
end
// final result selection
for (genvar i = 0; i < NUM_LANES; ++i) begin
// compare result packed in the two low bits: bit 0 is the top bit of the
// difference, bit 1 is set for branches when the difference is zero
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
// plain subtraction returns the difference, everything else the compare result
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
always @(*) begin
case ({is_alu_w, op_class})
3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR*
3'b010: alu_result[i] = shr_result[i]; // SRL, SRA, SRLI, SRAI
3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI
3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW
3'b101: alu_result[i] = sub_result_w[i]; // SUBW
3'b110: alu_result[i] = shr_result_w[i]; // SRLW, SRAW, SRLIW, SRAIW
3'b111: alu_result[i] = msc_result_w[i]; // SLLW
endcase
end
end
// branch
wire [`XLEN-1:0] PC_r, imm_r;
wire [`INST_BR_BITS-1:0] br_op_r;
wire [LANE_WIDTH-1:0] tid, tid_r;
wire is_br_op_r;
// lane whose result decides the branch: the low LANE_BITS bits of the
// instruction's tid field (zero when there is a single lane)
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
assign tid = 0;
end
// response buffer: carries the commit fields plus the branch-related fields
// (PC, immediate, branch flag/opcode, deciding lane) to the output side
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (execute_if.valid),
.ready_in (execute_if.ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
`UNUSED_VAR (br_op_r)
// decode of the buffered branch opcode
wire is_br_neg = `INST_BR_IS_NEG(br_op_r);
wire is_br_less = `INST_BR_IS_LESS(br_op_r);
wire is_br_static = `INST_BR_IS_STATIC(br_op_r);
// result of the deciding lane; its two low bits hold the packed compare outcome
wire [`XLEN-1:0] br_result = alu_result_r[tid_r];
wire is_less = br_result[0];
wire is_equal = br_result[1];
// a branch is reported only when the last packet (eop) of a branch instruction is committed
wire br_enable = is_br_op_r && commit_if.valid && commit_if.ready && commit_if.data.eop;
// taken when the selected condition (less / equal, optionally negated) holds,
// or unconditionally for static branches
wire br_taken = ((is_br_less ? is_less : is_equal) ^ is_br_neg) | is_br_static;
// static branches take the ALU result as destination, the others PC + immediate
wire [`XLEN-1:0] br_dest = is_br_static ? br_result : (PC_r + imm_r);
wire [`NW_WIDTH-1:0] br_wid;
`ASSIGN_BLOCKED_WID (br_wid, commit_if.data.wid, BLOCK_IDX, `NUM_ALU_BLOCKS)
// register the branch outcome before driving branch_ctl_if
VX_pipe_register #(
.DATAW (1 + `NW_WIDTH + 1 + `XLEN)
) branch_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);
// committed data: static branches write PC + 4, everything else the ALU result
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? (PC_r + 4) : alu_result_r[i];
end
assign commit_if.data.PC = PC_r;
`ifdef DBG_TRACE_CORE_PIPELINE
// log every branch decision
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
`TRACE(1, ("%d: core%0d-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, CORE_ID, branch_ctl_if.wid, commit_if.data.PC, branch_ctl_if.taken, branch_ctl_if.dest, commit_if.data.uuid));
end
end
`endif
endmodule

View File

@@ -0,0 +1,108 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
// Two-visit stack of value pairs.
//
// Each push stores the pair {q1, q0} in a new entry. Every entry is popped
// twice before it is released:
//   - 1st pop: returns q1 on d with d_set = 1; the entry stays on the stack.
//   - 2nd pop: returns q0 on d with d_set = 0; the entry is released.
// The per-entry flag slot_set[] records whether the first pop already happened.
//
// Parameters:
//   WIDTH   - width of each of the two stored values.
//   DEPTH   - number of entries.
//   OUT_REG - forwarded to the RAM (OUT_REG) and to the d_set pipe register
//             (DEPTH); presumably the read latency in cycles - TODO confirm
//             against VX_dp_ram / VX_pipe_register.
//   ADDRW   - pointer width (derived; not meant to be overridden).
//
// push and pop in the same cycle is not supported (asserted).
module VX_ipdom_stack #(
    parameter WIDTH   = 1,
    parameter DEPTH   = 1,
    parameter OUT_REG = 0,
    parameter ADDRW   = `LOG2UP(DEPTH)
) (
    input  wire             clk,
    input  wire             reset,
    input  wire [WIDTH-1:0] q0,
    input  wire [WIDTH-1:0] q1,
    output wire [WIDTH-1:0] d,
    output wire             d_set,
    input  wire             push,
    input  wire             pop,
    output wire             empty,
    output wire             full
);
    // per-entry "already popped once" flag (not reset; written on every push)
    reg slot_set [DEPTH-1:0];

    // rd_ptr addresses the top entry, wr_ptr the next free entry
    reg [ADDRW-1:0] rd_ptr, wr_ptr;

    reg empty_r, full_r;

    wire [WIDTH-1:0] d0, d1;

    // 1 when the top entry has already been popped once, i.e. this pop releases it
    wire d_set_n = slot_set[rd_ptr];

    always @(posedge clk) begin
        if (reset) begin
            rd_ptr  <= '0;
            wr_ptr  <= '0;
            empty_r <= 1;
            full_r  <= 0;
        end else begin
            `ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
            `ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
            `ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!"));
            if (push) begin
                rd_ptr  <= wr_ptr;
                wr_ptr  <= wr_ptr + ADDRW'(1);
                empty_r <= 0;
                full_r  <= (ADDRW'(DEPTH-1) == wr_ptr);
            end else if (pop) begin
                // pointers only move on the second pop of an entry
                wr_ptr  <= wr_ptr - ADDRW'(d_set_n);
                rd_ptr  <= rd_ptr - ADDRW'(d_set_n);
                empty_r <= (rd_ptr == 0) && (d_set_n == 1);
                // the first pop of an entry does not free it, so a full stack
                // must stay full until an entry is actually released
                full_r  <= full_r && ~d_set_n;
            end
        end
    end

    // pair storage: written at wr_ptr on push, continuously read at rd_ptr
    VX_dp_ram #(
        .DATAW   (WIDTH * 2),
        .SIZE    (DEPTH),
        .OUT_REG (OUT_REG ? 1 : 0),
        .LUTRAM  (OUT_REG ? 0 : 1)
    ) store (
        .clk   (clk),
        .read  (1'b1),
        .write (push),
        `UNUSED_PIN (wren),
        .waddr (wr_ptr),
        .wdata ({q1, q0}),
        .raddr (rd_ptr),
        .rdata ({d1, d0})
    );

    // a fresh entry starts un-visited; the first pop marks it visited
    always @(posedge clk) begin
        if (push) begin
            slot_set[wr_ptr] <= 0;
        end else if (pop) begin
            slot_set[rd_ptr] <= 1;
        end
    end

    // delay the visited flag through the same OUT_REG setting as the RAM
    // (presumably so it lines up with the read data - TODO confirm)
    wire d_set_r;

    VX_pipe_register #(
        .DATAW (1),
        .DEPTH (OUT_REG)
    ) pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (1'b1),
        .data_in  (d_set_n),
        .data_out (d_set_r)
    );

    // un-visited entry -> q1 value with d_set = 1; visited entry -> q0 value with d_set = 0
    assign d     = d_set_r ? d0 : d1;
    assign d_set = ~d_set_r;
    assign empty = empty_r;
    assign full  = full_r;

endmodule

180
hw/rtl/core/VX_issue.sv Normal file
View File

@@ -0,0 +1,180 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
// Issue stage wrapper.
//
// Chains four sub-blocks, one lane per ISSUE_WIDTH slot:
//   decode_if -> VX_ibuffer -> VX_scoreboard -> VX_operands -> VX_dispatch
// and exposes the per-unit dispatch interfaces (ALU, LSU, optional FPU, SFU).
// writeback_if is fed to both the scoreboard and the operands block.
// Each sub-block receives its own relayed copy of reset.
//
// Optional sections (compile-time):
//   DBG_SCOPE_ISSUE - scope tap / ILA probes for issue slot 0 of core 0.
//   PERF_ENABLE     - ibuffer and scoreboard stall counters.
module VX_issue #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    VX_pipeline_perf_if.issue perf_issue_if,
`endif

    VX_decode_if.slave      decode_if,
    VX_writeback_if.slave   writeback_if [`ISSUE_WIDTH],

    VX_dispatch_if.master   alu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.master   lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
    VX_dispatch_if.master   fpu_dispatch_if [`ISSUE_WIDTH],
`endif
    VX_dispatch_if.master   sfu_dispatch_if [`ISSUE_WIDTH]
);
    // inter-stage links
    VX_ibuffer_if  ibuffer_if [`ISSUE_WIDTH]();
    VX_ibuffer_if  scoreboard_if [`ISSUE_WIDTH]();
    VX_operands_if operands_if [`ISSUE_WIDTH]();

    // one reset copy per sub-block
    `RESET_RELAY (ibuf_reset, reset);
    `RESET_RELAY (scoreboard_reset, reset);
    `RESET_RELAY (operands_reset, reset);
    `RESET_RELAY (dispatch_reset, reset);

    VX_ibuffer #(
        .CORE_ID (CORE_ID)
    ) ibuffer (
        .clk        (clk),
        .reset      (ibuf_reset),
        .decode_if  (decode_if),
        .ibuffer_if (ibuffer_if)
    );

    VX_scoreboard #(
        .CORE_ID (CORE_ID)
    ) scoreboard (
        .clk           (clk),
        .reset         (scoreboard_reset),
        .writeback_if  (writeback_if),
        .ibuffer_if    (ibuffer_if),
        .scoreboard_if (scoreboard_if)
    );

    VX_operands #(
        .CORE_ID (CORE_ID)
    ) operands (
        .clk           (clk),
        .reset         (operands_reset),
        .writeback_if  (writeback_if),
        .scoreboard_if (scoreboard_if),
        .operands_if   (operands_if)
    );

    VX_dispatch #(
        .CORE_ID (CORE_ID)
    ) dispatch (
        .clk            (clk),
        .reset          (dispatch_reset),
    `ifdef PERF_ENABLE
        .perf_stalls    (perf_issue_if.dsp_stalls),
    `endif
        .operands_if    (operands_if),
        .alu_dispatch_if(alu_dispatch_if),
        .lsu_dispatch_if(lsu_dispatch_if),
    `ifdef EXT_F_ENABLE
        .fpu_dispatch_if(fpu_dispatch_if),
    `endif
        .sfu_dispatch_if(sfu_dispatch_if)
    );

`ifdef DBG_SCOPE_ISSUE
    // debug probes: only instantiated for core 0, observing issue slot 0
    if (CORE_ID == 0) begin
    `ifdef SCOPE
        wire operands_if_fire      = operands_if[0].valid && operands_if[0].ready;
        wire operands_if_not_ready = ~operands_if[0].ready;
        wire writeback_if_valid    = writeback_if[0].valid;

        // PROBEW is the summed width of the signals listed in .probes, in the same order
        VX_scope_tap #(
            .SCOPE_ID (2),
            .TRIGGERW (4),
            .PROBEW   (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS +
                       1 + `NR_BITS + `XLEN + 1 + 1 + (`NUM_THREADS * 3 * `XLEN) +
                       `UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
        ) scope_tap (
            .clk(clk),
            .reset(scope_reset),
            .start(1'b0),
            .stop(1'b0),
            .triggers({
                reset,
                operands_if_fire,
                operands_if_not_ready,
                writeback_if_valid
            }),
            .probes({
                operands_if[0].data.uuid,
                operands_if[0].data.tmask,
                operands_if[0].data.ex_type,
                operands_if[0].data.op_type,
                operands_if[0].data.op_mod,
                operands_if[0].data.wb,
                operands_if[0].data.rd,
                operands_if[0].data.imm,
                operands_if[0].data.use_PC,
                operands_if[0].data.use_imm,
                operands_if[0].data.rs1_data,
                operands_if[0].data.rs2_data,
                operands_if[0].data.rs3_data,
                writeback_if[0].data.uuid,
                writeback_if[0].data.tmask,
                writeback_if[0].data.rd,
                writeback_if[0].data.data,
                writeback_if[0].data.eop
            }),
            .bus_in(scope_bus_in),
            .bus_out(scope_bus_out)
        );
    `endif
    `ifdef CHIPSCOPE
        // NOTE(review): these references do not index the interface arrays or go
        // through .data as the SCOPE probes above do; this block looks stale and
        // probably does not elaborate when CHIPSCOPE is defined - confirm.
        ila_issue ila_issue_inst (
            .clk    (clk),
            .probe0 ({operands_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, operands_if.PC, operands_if.tmask, operands_if.wid, operands_if.ex_type, operands_if.op_type, operands_if.ready, operands_if.valid}),
            .probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
        );
    `endif
    end
`else
    `SCOPE_IO_UNUSED()
`endif

`ifdef PERF_ENABLE
    reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
    reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;

    wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
    // purely combinational (driven by continuous assignments), hence a net
    wire [`ISSUE_WIDTH-1:0] scoreboard_stalls;

    // a slot stalls when the ibuffer offers an instruction the scoreboard does not accept
    for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
        assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
    end

    // number of stalled slots in the current cycle
    `POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);

    always @(posedge clk) begin
        if (reset) begin
            perf_ibf_stalls <= '0;
            perf_scb_stalls <= '0;
        end else begin
            // ibuffer stall: decode has an instruction that is not being accepted
            if (decode_if.valid && ~decode_if.ready) begin
                perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
            end
            // scoreboard stalls accumulate per slot, so one cycle can add up to ISSUE_WIDTH
            perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
        end
    end

    assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
    assign perf_issue_if.scb_stalls = perf_scb_stalls;
`endif

endmodule

647
hw/rtl/core/VX_lsu_unit.sv Normal file
View File

@@ -0,0 +1,647 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Load/store unit.
//
// Data flow, as wired below:
//   dispatch_if -> VX_dispatch_unit -> execute_if[0]
//     -> address / byte-enable / store-data formatting
//     -> VX_mem_scheduler -> cache_bus_if (one port per DCACHE_NUM_REQS)
//   load responses  -> ld_rsp_buf \
//   accepted stores -> st_rsp_buf  +-> VX_stream_arb -> VX_gather_unit -> commit_if
//
// Stores are committed as soon as the memory scheduler accepts them; loads
// are committed when their response comes back. A fence is held at the input
// until the scheduler reports that it has no pending requests.
//
// Misaligned accesses are not supported (checked by a runtime assertion).
module VX_lsu_unit import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    input wire              clk,
    input wire              reset,

    // Dcache interface
    VX_mem_bus_if.master    cache_bus_if [DCACHE_NUM_REQS],

    // inputs
    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],

    // outputs
    VX_commit_if.master     commit_if [`ISSUE_WIDTH]
);
    localparam BLOCK_SIZE   = 1;
    localparam NUM_LANES    = `NUM_LSU_LANES;
    localparam PID_BITS     = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH    = `UP(PID_BITS);
    // width of the commit payload going through the store/load arbiter
    localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
    localparam LSUQ_SIZEW   = `LOG2UP(`LSUQ_SIZE);
    localparam MEM_ASHIFT   = `CLOG2(`MEM_BLOCK_SIZE);
    localparam MEM_ADDRW    = `XLEN - MEM_ASHIFT;
    localparam REQ_ASHIFT   = `CLOG2(DCACHE_WORD_SIZE);
    localparam CACHE_TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + LSUQ_TAG_BITS;

    VX_execute_if #(
        .NUM_LANES (NUM_LANES)
    ) execute_if[BLOCK_SIZE]();

    `RESET_RELAY (dispatch_reset, reset);

    VX_dispatch_unit #(
        .BLOCK_SIZE (BLOCK_SIZE),
        .NUM_LANES  (NUM_LANES),
        .OUT_REG    (1)
    ) dispatch_unit (
        .clk        (clk),
        .reset      (dispatch_reset),
        .dispatch_if(dispatch_if),
        .execute_if (execute_if)
    );

    // separate commit streams for stores and loads, merged further below
    VX_commit_if #(
        .NUM_LANES (NUM_LANES)
    ) commit_st_if();

    VX_commit_if #(
        .NUM_LANES (NUM_LANES)
    ) commit_ld_if();

    `UNUSED_VAR (execute_if[0].data.op_mod)
    `UNUSED_VAR (execute_if[0].data.use_PC)
    `UNUSED_VAR (execute_if[0].data.use_imm)
    `UNUSED_VAR (execute_if[0].data.rs3_data)
    `UNUSED_VAR (execute_if[0].data.tid)

`ifdef SM_ENABLE
    `STATIC_ASSERT((1 << `SMEM_LOG_SIZE) == `MEM_BLOCK_SIZE * ((1 << `SMEM_LOG_SIZE) / `MEM_BLOCK_SIZE), ("invalid parameter"))
    `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter"))
    // shared-memory window expressed in memory-block addresses
    localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT);
    localparam SMEM_END_B   = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
`endif

    // tag = uuid + addr_type + wid + tmask + PC + rd + op_type + align + pid + pkt_addr + is_dup
    // (same order as the mem_req_tag concatenation below)
    localparam TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + `NW_WIDTH + `XLEN + NUM_LANES + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + `LSU_DUP_ENABLED + PID_WIDTH + LSUQ_SIZEW;

    `STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))

    wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type;

    // full address calculation
    wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        assign full_addr[i] = execute_if[0].data.rs1_data[i][`XLEN-1:0] + execute_if[0].data.imm;
    end

    // detect duplicate addresses:
    // lane 0 is active and every other active lane has the same base address
    wire lsu_is_dup;
`ifdef LSU_DUP
    if (NUM_LANES > 1) begin
        wire [NUM_LANES-2:0] addr_matches;
        for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
            assign addr_matches[i] = (execute_if[0].data.rs1_data[i+1] == execute_if[0].data.rs1_data[0]) || ~execute_if[0].data.tmask[i+1];
        end
        assign lsu_is_dup = execute_if[0].data.tmask[0] && (& addr_matches);
    end else begin
        assign lsu_is_dup = 0;
    end
`else
    assign lsu_is_dup = 0;
`endif

    // detect address type
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        wire [MEM_ADDRW-1:0] full_addr_b = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
        // is non-cacheable I/O address
        wire is_addr_io = (full_addr_b >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
    `ifdef SM_ENABLE
        // is shared memory address
        wire is_addr_sm = (full_addr_b >= SMEM_START_B) && (full_addr_b < SMEM_END_B);
        assign lsu_addr_type[i] = {is_addr_io, is_addr_sm};
    `else
        assign lsu_addr_type[i] = is_addr_io;
    `endif
    end

    wire mem_req_empty;
    wire st_rsp_ready;
    wire lsu_valid, lsu_ready;

    // fence: stall the pipeline until all pending requests are sent
    wire is_fence = `INST_LSU_IS_FENCE(execute_if[0].data.op_type);
    wire fence_wait = is_fence && ~mem_req_empty;

    assign lsu_valid = execute_if[0].valid && ~fence_wait;
    assign execute_if[0].ready = lsu_ready && ~fence_wait;

    // schedule memory request
    wire                            mem_req_valid;
    wire [NUM_LANES-1:0]            mem_req_mask;
    wire                            mem_req_rw;
    wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
    reg  [NUM_LANES-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen;
    reg  [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
    wire [TAG_WIDTH-1:0]            mem_req_tag;
    wire                            mem_req_ready;

    wire                            mem_rsp_valid;
    wire [NUM_LANES-1:0]            mem_rsp_mask;
    wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
    wire [TAG_WIDTH-1:0]            mem_rsp_tag;
    wire                            mem_rsp_sop;
    wire                            mem_rsp_eop;
    wire                            mem_rsp_ready;

    // A store is committed in the same cycle the scheduler accepts it, so it
    // may only be offered to the scheduler while the store-commit buffer has
    // room. Without this gating the scheduler could accept a store that the
    // pipeline does not dequeue, and the same store would be issued again.
    assign mem_req_valid = lsu_valid
                        && (~mem_req_rw || st_rsp_ready);

    assign lsu_ready = mem_req_ready
                    && (~mem_req_rw || st_rsp_ready); // writes commit directly

    // with duplicate addresses only lane 0 issues a request
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        assign mem_req_mask[i] = execute_if[0].data.tmask[i] && (~lsu_is_dup || (i == 0));
    end

    // anything that does not write back a register is issued as a write
    assign mem_req_rw = ~execute_if[0].data.wb;

    wire mem_req_fire = mem_req_valid && mem_req_ready;
    wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
    `UNUSED_VAR (mem_req_fire)
    `UNUSED_VAR (mem_rsp_fire)

    // address formatting: split into word address and byte offset within the word
    wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
        assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
    end

    // byte enable formatting
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        always @(*) begin
            mem_req_byteen[i] = '0;
            case (`INST_LSU_WSIZE(execute_if[0].data.op_type))
                0: begin // 8-bit
                    mem_req_byteen[i][req_align[i]] = 1'b1;
                end
                1: begin // 16 bit
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
                end
            `ifdef XLEN_64
                2: begin // 32 bit
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
                end
            `endif
                // full word access
                default : mem_req_byteen[i] = {DCACHE_WORD_SIZE{1'b1}};
            endcase
        end
    end

    // memory misalignment not supported!
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        wire lsu_req_fire = execute_if[0].valid && execute_if[0].ready;
        `RUNTIME_ASSERT((~lsu_req_fire || ~execute_if[0].data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if[0].data.op_type))) == 0),
            ("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
                execute_if[0].data.wid, execute_if[0].data.PC, full_addr[i], `INST_LSU_WSIZE(execute_if[0].data.op_type), execute_if[0].data.uuid));
    end

    // store data formatting: move the source value up to its byte offset in the word
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        always @(*) begin
            mem_req_data[i] = execute_if[0].data.rs2_data[i];
            case (req_align[i])
                1: mem_req_data[i][`XLEN-1:8]  = execute_if[0].data.rs2_data[i][`XLEN-9:0];
                2: mem_req_data[i][`XLEN-1:16] = execute_if[0].data.rs2_data[i][`XLEN-17:0];
                3: mem_req_data[i][`XLEN-1:24] = execute_if[0].data.rs2_data[i][`XLEN-25:0];
            `ifdef XLEN_64
                4: mem_req_data[i][`XLEN-1:32] = execute_if[0].data.rs2_data[i][`XLEN-33:0];
                5: mem_req_data[i][`XLEN-1:40] = execute_if[0].data.rs2_data[i][`XLEN-41:0];
                6: mem_req_data[i][`XLEN-1:48] = execute_if[0].data.rs2_data[i][`XLEN-49:0];
                7: mem_req_data[i][`XLEN-1:56] = execute_if[0].data.rs2_data[i][`XLEN-57:0];
            `endif
                default:;
            endcase
        end
    end

    // track SOP/EOP for out-of-order memory responses
    wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
    wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;

    if (PID_BITS != 0) begin
        // An instruction is split into several requests (one per pid), so the
        // packet-level sop/eop of the responses is rebuilt here:
        //   pkt_ctr - per slot, requests issued minus end-of-response events
        //   pkt_sop - per slot, set by the first request, cleared by any response
        //   pkt_eop - per slot, set once the last request has been issued
        reg [`LSUQ_SIZE-1:0][PID_BITS:0] pkt_ctr;
        reg [`LSUQ_SIZE-1:0] pkt_sop, pkt_eop;

        wire mem_req_rd_fire     = mem_req_fire && execute_if[0].data.wb;
        wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if[0].data.sop;
        wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if[0].data.eop;
        wire mem_rsp_eop_fire    = mem_rsp_fire && mem_rsp_eop;
        wire full;

        // the slot is acquired when the last request of a packet is issued and
        // released when the last response of that packet is delivered
        VX_allocator #(
            .SIZE (`LSUQ_SIZE)
        ) pkt_allocator (
            .clk        (clk),
            .reset      (reset),
            .acquire_en (mem_req_rd_eop_fire),
            .acquire_addr(pkt_waddr),
            .release_en (mem_rsp_eop_pkt),
            .release_addr(pkt_raddr),
            `UNUSED_PIN (empty),
            .full       (full)
        );

        // increment and decrement of the same slot in one cycle cancel out
        wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);

        always @(posedge clk) begin
            if (reset) begin
                pkt_ctr <= '0;
                pkt_sop <= '0;
                pkt_eop <= '0;
            end else begin
                if (mem_req_rd_sop_fire) begin
                    pkt_sop[pkt_waddr] <= 1;
                end
                if (mem_req_rd_eop_fire) begin
                    pkt_eop[pkt_waddr] <= 1;
                end
                if (mem_rsp_fire) begin
                    pkt_sop[pkt_raddr] <= 0;
                end
                if (mem_rsp_eop_pkt) begin
                    pkt_eop[pkt_raddr] <= 0;
                end
                if (~rd_during_wr) begin
                    if (mem_req_rd_fire) begin
                        pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
                    end
                    if (mem_rsp_eop_fire) begin
                        pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
                    end
                end
            end
        end

        assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
        // last response of the packet: all requests issued and only one still outstanding
        assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
        `RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
        `RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
        `UNUSED_VAR (mem_rsp_sop)
    end else begin
        // one request per instruction: scheduler sop/eop are already packet-level
        assign pkt_waddr = 0;
        assign mem_rsp_sop_pkt = mem_rsp_sop;
        assign mem_rsp_eop_pkt = mem_rsp_eop;
        `UNUSED_VAR (pkt_raddr)
    end

    // request tag; unpacked in the same order from mem_rsp_tag further below
    assign mem_req_tag = {
        execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
    `ifdef LSU_DUP
        , lsu_is_dup
    `endif
    };

    wire [DCACHE_NUM_REQS-1:0]                      cache_req_valid;
    wire [DCACHE_NUM_REQS-1:0]                      cache_req_rw;
    wire [DCACHE_NUM_REQS-1:0][(`XLEN/8)-1:0]       cache_req_byteen;
    wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
    wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0]           cache_req_data;
    wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_req_tag;
    wire [DCACHE_NUM_REQS-1:0]                      cache_req_ready;
    wire [DCACHE_NUM_REQS-1:0]                      cache_rsp_valid;
    wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0]           cache_rsp_data;
    wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_rsp_tag;
    wire [DCACHE_NUM_REQS-1:0]                      cache_rsp_ready;

    `RESET_RELAY (mem_scheduler_reset, reset);

    VX_mem_scheduler #(
        .INSTANCE_ID ($sformatf("core%0d-lsu-memsched", CORE_ID)),
        .NUM_REQS    (LSU_MEM_REQS),
        .NUM_BANKS   (DCACHE_NUM_REQS),
        .ADDR_WIDTH  (DCACHE_ADDR_WIDTH),
        .DATA_WIDTH  (`XLEN),
        .QUEUE_SIZE  (`LSUQ_SIZE),
        .TAG_WIDTH   (TAG_WIDTH),
        .MEM_TAG_ID  (`UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS)),
        .UUID_WIDTH  (`UUID_WIDTH),
        .RSP_PARTIAL (1),
        .MEM_OUT_REG (2)
    ) mem_scheduler (
        .clk            (clk),
        .reset          (mem_scheduler_reset),

        // Input request
        .req_valid      (mem_req_valid),
        .req_rw         (mem_req_rw),
        .req_mask       (mem_req_mask),
        .req_byteen     (mem_req_byteen),
        .req_addr       (mem_req_addr),
        .req_data       (mem_req_data),
        .req_tag        (mem_req_tag),
        .req_empty      (mem_req_empty),
        .req_ready      (mem_req_ready),
        `UNUSED_PIN (write_notify),

        // Output response
        .rsp_valid      (mem_rsp_valid),
        .rsp_mask       (mem_rsp_mask),
        .rsp_data       (mem_rsp_data),
        .rsp_tag        (mem_rsp_tag),
        .rsp_sop        (mem_rsp_sop),
        .rsp_eop        (mem_rsp_eop),
        .rsp_ready      (mem_rsp_ready),

        // Memory request
        .mem_req_valid  (cache_req_valid),
        .mem_req_rw     (cache_req_rw),
        .mem_req_byteen (cache_req_byteen),
        .mem_req_addr   (cache_req_addr),
        .mem_req_data   (cache_req_data),
        .mem_req_tag    (cache_req_tag),
        .mem_req_ready  (cache_req_ready),

        // Memory response
        .mem_rsp_valid  (cache_rsp_valid),
        .mem_rsp_data   (cache_rsp_data),
        .mem_rsp_tag    (cache_rsp_tag),
        .mem_rsp_ready  (cache_rsp_ready)
    );

    // connect the scheduler ports to the cache bus (tags are handled separately below)
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        assign cache_bus_if[i].req_valid = cache_req_valid[i];
        assign cache_bus_if[i].req_data.rw = cache_req_rw[i];
        assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
        assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
        assign cache_bus_if[i].req_data.data = cache_req_data[i];
        assign cache_req_ready[i] = cache_bus_if[i].req_ready;

        assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
        assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
        assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
    end

    // cache tag formatting: <uuid, tag, type>
    // The scheduler tag carries the address type of every lane; the cache port
    // tag carries only the type of the lane served by this request.
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        wire [`UUID_WIDTH-1:0] cache_req_uuid, cache_rsp_uuid;
        wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type, cache_rsp_type;
        wire [`CLOG2(`LSUQ_SIZE)-1:0] cache_req_tag_x, cache_rsp_tag_x;
        if (DCACHE_NUM_BATCHES > 1) begin
            // several lanes share port i; the batch id selects which lane's type is sent
            wire [DCACHE_NUM_BATCHES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_b, cache_rsp_type_b;
            wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_bi, cache_rsp_type_bi;
            wire [DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
            assign {cache_req_uuid, cache_req_type, cache_req_bid, cache_req_tag_x} = cache_req_tag[i];
            assign cache_req_type_bi = cache_req_type_b[cache_req_bid];
            assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_bid, cache_req_tag_x, cache_req_type_bi};
            assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag_x, cache_rsp_type_bi} = cache_bus_if[i].rsp_data.tag;
            assign cache_rsp_type_b = {DCACHE_NUM_BATCHES{cache_rsp_type_bi}};
            assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_bid, cache_rsp_tag_x};
            for (genvar j = 0; j < DCACHE_NUM_BATCHES; ++j) begin
                // lane index served by port i in batch j
                localparam k = j * DCACHE_NUM_REQS + i;
                if (k < NUM_LANES) begin
                    assign cache_req_type_b[j] = cache_req_type[k];
                    assign cache_rsp_type[k] = cache_rsp_type_b[j];
                end else begin
                    assign cache_req_type_b[j] = '0;
                    `UNUSED_VAR (cache_rsp_type_b[j])
                end
            end
        end else begin
            // one lane per port: port i carries the type of lane i only
            assign {cache_req_uuid, cache_req_type, cache_req_tag_x} = cache_req_tag[i];
            assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_tag_x, cache_req_type[i]};
            assign {cache_rsp_uuid, cache_rsp_tag_x, cache_rsp_type[i]} = cache_bus_if[i].rsp_data.tag;
            assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_tag_x};
            for (genvar j = 0; j < DCACHE_NUM_REQS; ++j) begin
                if (i != j) begin
                    `UNUSED_VAR (cache_req_type[j])
                    assign cache_rsp_type[j] = '0;
                end
            end
        end
    end

    // unpack the response tag (same field order as mem_req_tag)
    wire [`UUID_WIDTH-1:0] rsp_uuid;
    wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] rsp_addr_type;
    wire [`NW_WIDTH-1:0] rsp_wid;
    wire [NUM_LANES-1:0] rsp_tmask_uq;
    wire [`XLEN-1:0] rsp_pc;
    wire [`NR_BITS-1:0] rsp_rd;
    wire [`INST_LSU_BITS-1:0] rsp_op_type;
    wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
    wire [PID_WIDTH-1:0] rsp_pid;
    wire rsp_is_dup;

`ifndef LSU_DUP
    assign rsp_is_dup = 0;
`endif

    assign {
        rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
    `ifdef LSU_DUP
        , rsp_is_dup
    `endif
    } = mem_rsp_tag;

    `UNUSED_VAR (rsp_addr_type)
    `UNUSED_VAR (rsp_op_type)

    // load response formatting
    reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
    wire [NUM_LANES-1:0] rsp_tmask;

`ifdef XLEN_64
`ifdef EXT_F_ENABLE
    // apply nan-boxing to flw outputs
    wire rsp_is_float = rsp_rd[5];
`else
    wire rsp_is_float = 0;
`endif
`endif

    for (genvar i = 0; i < NUM_LANES; i++) begin
        // with duplicate addresses every lane takes the data returned for lane 0;
        // the byte offset then narrows the word down to the accessed element
    `ifdef XLEN_64
        wire [63:0] rsp_data64 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
        wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? (rsp_align[0][2] ? mem_rsp_data[0][63:32] : mem_rsp_data[0][31:0]) :
                                                          (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
    `else
        wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
    `endif
        wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
        wire [7:0]  rsp_data8  = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];

        // sign- or zero-extend according to the load format
        always @(*) begin
            case (`INST_LSU_FMT(rsp_op_type))
            `INST_FMT_B:  rsp_data[i] = `XLEN'(signed'(rsp_data8));
            `INST_FMT_H:  rsp_data[i] = `XLEN'(signed'(rsp_data16));
            `INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
            `INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
        `ifdef XLEN_64
            `INST_FMT_W:  rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
            `INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
            `INST_FMT_D:  rsp_data[i] = `XLEN'(signed'(rsp_data64));
        `else
            `INST_FMT_W:  rsp_data[i] = `XLEN'(signed'(rsp_data32));
        `endif
            default: rsp_data[i] = 'x;
            endcase
        end
    end

    // a duplicate-address response restores the original thread mask
    assign rsp_tmask = rsp_is_dup ? rsp_tmask_uq : mem_rsp_mask;

    // load commit

    VX_elastic_buffer #(
        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
        .SIZE  (2)
    ) ld_rsp_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (mem_rsp_valid),
        .ready_in  (mem_rsp_ready),
        .data_in   ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
        .data_out  ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
        .valid_out (commit_ld_if.valid),
        .ready_out (commit_ld_if.ready)
    );

    assign commit_ld_if.data.wb = 1'b1;

    // store commit: queued in the cycle the scheduler accepts the store
    // (mem_req_valid guarantees st_rsp_ready is high for writes in that cycle)

    VX_elastic_buffer #(
        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
        .SIZE  (2)
    ) st_rsp_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (mem_req_fire && mem_req_rw),
        .ready_in  (st_rsp_ready),
        .data_in   ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
        .data_out  ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
        .valid_out (commit_st_if.valid),
        .ready_out (commit_st_if.ready)
    );

    assign commit_st_if.data.rd   = '0;
    assign commit_st_if.data.wb   = 1'b0;
    assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru

    // lsu commit: merge the store and load commit streams

    `RESET_RELAY (commit_reset, reset);

    VX_commit_if #(
        .NUM_LANES (NUM_LANES)
    ) commit_arb_if[1]();

    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (RSP_ARB_DATAW),
        .OUT_REG    (1)
    ) rsp_arb (
        .clk       (clk),
        .reset     (commit_reset),
        .valid_in  ({commit_st_if.valid, commit_ld_if.valid}),
        .ready_in  ({commit_st_if.ready, commit_ld_if.ready}),
        .data_in   ({commit_st_if.data, commit_ld_if.data}),
        .data_out  (commit_arb_if[0].data),
        .valid_out (commit_arb_if[0].valid),
        .ready_out (commit_arb_if[0].ready),
        `UNUSED_PIN (sel_out)
    );

    VX_gather_unit #(
        .BLOCK_SIZE (BLOCK_SIZE),
        .NUM_LANES  (NUM_LANES),
        .OUT_REG    (3)
    ) gather_unit (
        .clk           (clk),
        .reset         (commit_reset),
        .commit_in_if  (commit_arb_if),
        .commit_out_if (commit_if)
    );

`ifdef DBG_SCOPE_LSU
    // debug probes: only instantiated for core 0
    if (CORE_ID == 0) begin
    `ifdef SCOPE
        VX_scope_tap #(
            .SCOPE_ID (3),
            .TRIGGERW (3),
            .PROBEW   (`UUID_WIDTH+NUM_LANES*(`XLEN+4+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
        ) scope_tap (
            .clk(clk),
            .reset(scope_reset),
            .start(1'b0),
            .stop(1'b0),
            .triggers({reset, mem_req_fire, mem_rsp_fire}),
            .probes({execute_if[0].data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
            .bus_in(scope_bus_in),
            .bus_out(scope_bus_out)
        );
    `endif
    `ifdef CHIPSCOPE
        // NOTE(review): cache_bus_if is an interface array but is referenced
        // without an index below; this block looks stale - confirm before use.
        wire [31:0] full_addr_0 = full_addr[0];
        wire [31:0] mem_req_data_0 = mem_req_data[0];
        wire [31:0] rsp_data_0 = rsp_data[0];
        ila_lsu ila_lsu_inst (
            .clk    (clk),
            .probe0 ({mem_req_data_0, execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
            .probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, rsp_tmask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
            .probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
            .probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
        );
    `endif
    end
`else
    `SCOPE_IO_UNUSED()
`endif

`ifdef DBG_TRACE_CORE_DCACHE
    // simulation trace of fence stalls, accepted requests and delivered responses
    always @(posedge clk) begin
        if (execute_if[0].valid && fence_wait) begin
            `TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
        end
        if (mem_req_fire) begin
            if (mem_req_rw) begin
                `TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
                `TRACE_ARRAY1D(1, full_addr, NUM_LANES);
                `TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
                `TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
                `TRACE(1, (", data="));
                `TRACE_ARRAY1D(1, mem_req_data, NUM_LANES);
                `TRACE(1, (", is_dup=%b (#%0d)\n", lsu_is_dup, execute_if[0].data.uuid));
            end else begin
                `TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
                `TRACE_ARRAY1D(1, full_addr, NUM_LANES);
                `TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
                `TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
                `TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
            end
        end
        if (mem_rsp_fire) begin
            `TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, tag=0x%0h, rd=%0d, sop=%b, eop=%b, data=",
                $time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, mem_rsp_tag, rsp_rd, mem_rsp_sop, mem_rsp_eop));
            `TRACE_ARRAY1D(1, mem_rsp_data, NUM_LANES);
            `TRACE(1, (", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid));
        end
    end
`endif

endmodule

View File

@@ -0,0 +1,336 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Integer multiply / divide execution unit.
//
// A request arriving on execute_if is steered by its opcode into one of two
// independent paths: the multiply path (when `INST_M_IS_MULX is true) or the
// divide/remainder path (otherwise). Each path carries the request metadata
// (uuid, wid, tmask, PC, rd, wb, pid, sop, eop) next to the computed result,
// and a 2-input stream arbiter merges both paths onto commit_if.
//
// Compile-time switches used in this module:
//   IMUL_DPI : multiply results come from the dpi_imul call
//   IDIV_DPI : divide results come from the dpi_idiv call
//   XLEN_64  : enables the *W operation handling and, when IMUL_DPI is not
//              defined, selects VX_serial_mul instead of VX_multiplier
module VX_muldiv_unit #(
// CORE_ID is only passed to `UNUSED_PARAM; NUM_LANES is the number of data lanes per request
parameter CORE_ID = 0,
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// Width of the metadata tag that travels with each result:
// uuid + wid + tmask + PC + rd + wb + pid + sop + eop
localparam TAGW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data)
// Opcode decode shared by both paths
wire [`INST_M_BITS-1:0] muldiv_op = `INST_M_BITS'(execute_if.data.op_type);
wire is_mulx_op = `INST_M_IS_MULX(muldiv_op);
wire is_signed_op = `INST_M_SIGNED(muldiv_op);
// is_alu_w marks the 32-bit "W" form of an operation; it is tied to 0 when XLEN_64 is not defined
`ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod);
`else
wire is_alu_w = 0;
`endif
// ---------------------------- multiply path ----------------------------
// Outputs of the multiply path (result plus unpacked metadata tag)
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_out;
wire [`UUID_WIDTH-1:0] mul_uuid_out;
wire [`NW_WIDTH-1:0] mul_wid_out;
wire [NUM_LANES-1:0] mul_tmask_out;
wire [`XLEN-1:0] mul_PC_out;
wire [`NR_BITS-1:0] mul_rd_out;
wire mul_wb_out;
wire [PID_WIDTH-1:0] mul_pid_out;
wire mul_sop_out, mul_eop_out;
// Handshake of the multiply path; a request enters only when it is a multiply op
wire mul_valid_in = execute_if.valid && is_mulx_op;
wire mul_ready_in;
wire mul_valid_out;
wire mul_ready_out;
wire is_mulh_in = `INST_M_IS_MULH(muldiv_op);
// Operand A and operand B have separate signedness controls
wire is_signed_mul_a = `INST_M_SIGNED_A(muldiv_op);
wire is_signed_mul_b = is_signed_op;
`ifdef IMUL_DPI
// DPI model: the product is computed by dpi_imul and the final result is
// selected before it enters the shift register.
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_tmp;
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] mul_resultl, mul_resulth;
// For W ops only the low 32 bits of each operand are kept (upper bits masked to zero)
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin
dpi_imul (mul_fire_in, is_signed_mul_a, is_signed_mul_b, mul_in1, mul_in2, mul_resultl, mul_resulth);
end
// MULH-type ops take the high half; W ops sign-extend the low 32 bits; otherwise the low half is used
assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : (is_alu_w ? `XLEN'($signed(mul_resultl[31:0])) : mul_resultl);
end
// Valid bit, tag and result move together through a `LATENCY_IMUL-deep shift register
// that only advances while mul_ready_in is high.
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}),
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out})
);
// The pipeline can advance when its output is being consumed or holds no valid entry
assign mul_ready_in = mul_ready_out || ~mul_valid_out;
`else
// Hardware multiplier: operands are widened to XLEN+1 bits so one signed
// multiplier handles signed and unsigned operands; the full 2*(XLEN+1)-bit
// product is kept and the result half is selected at the output.
wire [NUM_LANES-1:0][2*(`XLEN+1)-1:0] mul_result_tmp;
wire is_mulh_out;
wire is_mul_w_out;
`ifdef XLEN_64
wire [NUM_LANES-1:0][`XLEN:0] mul_in1;
wire [NUM_LANES-1:0][`XLEN:0] mul_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// W ops: sign-extend bit 31 of the low word up to XLEN+1 bits.
// Other ops: prepend one extension bit that equals the operand sign bit only when that operand is signed.
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
end
// NOTE(review): "strode" is presumably a misspelling of "strobe"; the signal connects the adapter's strobe port to the multiplier's strobe port.
wire mul_strode;
wire mul_busy;
// Bridges the valid/ready handshake of this path to the strobe/busy interface of the serial multiplier
// (exact protocol is defined in VX_elastic_adapter, not visible here).
VX_elastic_adapter mul_elastic_adapter (
.clk (clk),
.reset (reset),
.valid_in (mul_valid_in),
.ready_in (mul_ready_in),
.valid_out (mul_valid_out),
.ready_out (mul_ready_out),
.strobe (mul_strode),
.busy (mul_busy)
);
VX_serial_mul #(
.A_WIDTH (`XLEN+1),
.LANES (NUM_LANES),
.SIGNED (1)
) serial_mul (
.clk (clk),
.reset (reset),
.strobe (mul_strode),
.busy (mul_busy),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp)
);
// Tag register: captures the request metadata plus the two result-select flags
// (is_mulh_in, is_alu_w) when a request is accepted; hence TAGW+2 bits.
reg [TAGW+2-1:0] mul_tag_r;
always @(posedge clk) begin
if (mul_valid_in && mul_ready_in) begin
mul_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_mulh_in, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
// Unpack in exactly the same field order as the capture above
assign {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out, is_mul_w_out, mul_pid_out, mul_sop_out, mul_eop_out} = mul_tag_r;
`else
// One pipelined multiplier per lane
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Prepend one extension bit that equals the operand sign bit only when that operand is signed
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
VX_multiplier #(
.A_WIDTH (`XLEN+1),
.B_WIDTH (`XLEN+1),
.R_WIDTH (2*(`XLEN+1)),
.SIGNED (1),
.LATENCY (`LATENCY_IMUL)
) multiplier (
.clk (clk),
.enable (mul_ready_in),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp[i])
);
end
// Valid bit, tag and the two result-select flags are delayed by the same depth
// and with the same enable as the multipliers above.
VX_shift_register #(
.DATAW (1 + TAGW + 1 + 1),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, is_mulh_in, is_alu_w}),
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, is_mulh_out, is_mul_w_out})
);
// The pipeline can advance when its output is being consumed or holds no valid entry
assign mul_ready_in = mul_ready_out || ~mul_valid_out;
`endif
// Result selection from the full product:
//   MULH-type : bits [2*XLEN-1:XLEN]
//   W op      : low 32 bits sign-extended to XLEN (XLEN_64 only)
//   otherwise : bits [XLEN-1:0]
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
mul_result_tmp[i][`XLEN-1:0]);
`else
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] : mul_result_tmp[i][`XLEN-1:0];
`UNUSED_VAR (is_mul_w_out)
`endif
end
`endif
///////////////////////////////////////////////////////////////////////////
// ------------------------ divide / remainder path -----------------------
// Outputs of the divide path (result plus unpacked metadata tag)
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_out;
wire [`UUID_WIDTH-1:0] div_uuid_out;
wire [`NW_WIDTH-1:0] div_wid_out;
wire [NUM_LANES-1:0] div_tmask_out;
wire [`XLEN-1:0] div_PC_out;
wire [`NR_BITS-1:0] div_rd_out;
wire div_wb_out;
wire [PID_WIDTH-1:0] div_pid_out;
wire div_sop_out, div_eop_out;
wire is_rem_op = `INST_M_IS_REM(muldiv_op);
// Every request that is not a multiply op is routed to the divide path
wire div_valid_in = execute_if.valid && ~is_mulx_op;
wire div_ready_in;
wire div_valid_out;
wire div_ready_out;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// W ops: the low 32 bits are extended to XLEN with bit 31 when the op is signed, with zeros otherwise
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
end
`ifdef IDIV_DPI
// DPI model: quotient and remainder come from dpi_idiv and the final result
// is selected before it enters the shift register.
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
end
// Pick remainder or quotient; W ops sign-extend the low 32 bits of the chosen value
assign div_result_in[i] = is_rem_op ? (is_alu_w ? `XLEN'($signed(div_remainder[31:0])) : div_remainder) :
(is_alu_w ? `XLEN'($signed(div_quotient[31:0])) : div_quotient);
end
// NOTE(review): the depth below is `LATENCY_IMUL (the multiply latency macro) even though
// this is the divide path -- confirm this is intentional.
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) div_shift_reg (
.clk(clk),
.reset (reset),
.enable (div_ready_in),
.data_in ({div_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, div_result_in}),
.data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out})
);
// The pipeline can advance when its output is being consumed or holds no valid entry
assign div_ready_in = div_ready_out || ~div_valid_out;
`else
// Hardware divider: a serial divider driven through an elastic adapter
wire [NUM_LANES-1:0][`XLEN-1:0] div_quotient, div_remainder;
wire is_rem_op_out;
wire is_div_w_out;
// NOTE(review): "strode" is presumably a misspelling of "strobe"; the signal connects the adapter's strobe port to the divider's strobe port.
wire div_strode;
wire div_busy;
// Bridges the valid/ready handshake of this path to the strobe/busy interface of the serial divider
// (exact protocol is defined in VX_elastic_adapter, not visible here).
VX_elastic_adapter div_elastic_adapter (
.clk (clk),
.reset (reset),
.valid_in (div_valid_in),
.ready_in (div_ready_in),
.valid_out (div_valid_out),
.ready_out (div_ready_out),
.strobe (div_strode),
.busy (div_busy)
);
VX_serial_div #(
.WIDTHN (`XLEN),
.WIDTHD (`XLEN),
.WIDTHQ (`XLEN),
.WIDTHR (`XLEN),
.LANES (NUM_LANES)
) serial_div (
.clk (clk),
.reset (reset),
.strobe (div_strode),
.busy (div_busy),
.is_signed (is_signed_op),
.numer (div_in1),
.denom (div_in2),
.quotient (div_quotient),
.remainder (div_remainder)
);
// Tag register: captures the request metadata plus the two result-select flags
// (is_rem_op, is_alu_w) when a request is accepted; hence TAGW+2 bits.
reg [TAGW+2-1:0] div_tag_r;
always @(posedge clk) begin
if (div_valid_in && div_ready_in) begin
div_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_rem_op, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
// Unpack in exactly the same field order as the capture above
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
// Pick remainder or quotient; with XLEN_64, W ops sign-extend the low 32 bits of the chosen value
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
`else
assign div_result_out[i] = is_rem_op_out ? div_remainder[i] : div_quotient[i];
`UNUSED_VAR (is_div_w_out)
`endif
end
`endif
// can accept new request?
// Back-pressure comes from whichever path the current opcode selects
assign execute_if.ready = is_mulx_op ? mul_ready_in : div_ready_in;
// Merge both paths onto commit_if. Input 0 is the multiply path and input 1 the
// divide path (the concatenations below list the divide signals first, i.e. in the upper position).
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAGW + (NUM_LANES * `XLEN)),
.OUT_REG (1)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in ({div_valid_out, mul_valid_out}),
.ready_in ({div_ready_out, mul_ready_out}),
.data_in ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out},
{mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready),
`UNUSED_PIN (sel_out)
);
endmodule

302
hw/rtl/core/VX_operands.sv Normal file
View File

@@ -0,0 +1,302 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Operand-fetch stage.
//
// For each of the `ISSUE_WIDTH issue slots this module takes an instruction
// from scoreboard_if, obtains the values of its rs1/rs2/rs3 source registers
// and forwards the instruction together with the three operand vectors on
// operands_if. Register values are read one register at a time from a
// per-thread register-file RAM by a small state machine. When CACHE_ENABLE
// is non-zero, a record of the last written-back register (one entry per
// wis index) is checked first so that a RAM read can be skipped.
module VX_operands import VX_gpu_pkg::*; #(
// CORE_ID is only passed to `UNUSED_PARAM; CACHE_ENABLE != 0 turns on the writeback-value cache
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
) (
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_operands_if.master operands_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
// Width of the instruction metadata held in the staging buffer:
// uuid + wis + tmask + PC + wb + ex_type + op_type + op_mod + use_PC + use_imm + imm + rd
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
// Fetch state machine encoding: IDLE, then one state per source register being read
localparam STATE_IDLE = 2'd0;
localparam STATE_FETCH1 = 2'd1;
localparam STATE_FETCH2 = 2'd2;
localparam STATE_FETCH3 = 2'd3;
localparam STATE_BITS = 2;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// Register-file read port: data, register id and wis index (the *_n signals are next-state values)
wire [`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
// Writeback-value cache, one entry per wis index: data, register id,
// mask of threads whose data is held, and the eop flag of the last recorded writeback
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
// Collected operand values
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
reg [STATE_BITS-1:0] state, state_n;
// rs2/rs3 ids are latched so the later fetch states can issue their reads
reg [`NR_BITS-1:0] rs2, rs2_n;
reg [`NR_BITS-1:0] rs3, rs3_n;
// rsN_ready: operand N was resolved in IDLE and needs no RAM read
reg rs2_ready, rs2_ready_n;
reg rs3_ready, rs3_ready_n;
// data_ready: all three operands of the current instruction are available
reg data_ready, data_ready_n;
// Register id 0 is never read from the RAM; its operand value is forced to zero below
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
VX_operands_if staging_if();
// Next-state logic. Blocking assignments are used throughout, so later
// statements intentionally override earlier ones.
always @(*) begin
// Defaults: hold every register
state_n = state;
rs2_n = rs2;
rs3_n = rs3;
rs2_ready_n = rs2_ready;
rs3_ready_n = rs3_ready;
rs1_data_n = rs1_data;
rs2_data_n = rs2_data;
rs3_data_n = rs3_data;
cache_data_n = cache_data;
cache_reg_n = cache_reg;
cache_tmask_n= cache_tmask;
cache_eop_n = cache_eop;
gpr_rd_rid_n = gpr_rd_rid;
gpr_rd_wis_n = gpr_rd_wis;
data_ready_n = data_ready;
case (state)
STATE_IDLE: begin
// The current instruction leaves the staging buffer: its operands are no longer needed
if (staging_if.valid && staging_if.ready) begin
data_ready_n = 0;
end
// Start resolving operands for the instruction presented on scoreboard_if
if (scoreboard_if[i].valid && data_ready_n == 0) begin
// Optimistically ready; cleared again by any operand that needs a RAM read
data_ready_n = 1;
// The three checks run in the order rs3, rs2, rs1. Each one that needs a RAM
// read overwrites gpr_rd_rid_n/state_n, so the lowest-numbered operand that
// needs a read is fetched first.
// An operand is resolved without a read when its id is 0, or when the cache is
// enabled, holds that register for this wis, and covers every thread in tmask.
if (is_rs3_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs3 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs3_data_n = (is_rs3_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs3_ready_n = 1;
end else begin
rs3_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs3;
data_ready_n = 0;
state_n = STATE_FETCH3;
end
if (is_rs2_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs2 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs2_data_n = (is_rs2_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs2_ready_n = 1;
end else begin
rs2_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs2;
data_ready_n = 0;
state_n = STATE_FETCH2;
end
// rs1 has no ready flag: when it needs a read, FETCH1 is always the first fetch state entered
if (is_rs1_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs1 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs1_data_n = (is_rs1_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
end else begin
gpr_rd_rid_n = scoreboard_if[i].data.rs1;
data_ready_n = 0;
state_n = STATE_FETCH1;
end
end
// Tracked on every IDLE cycle, regardless of scoreboard_if valid
gpr_rd_wis_n = scoreboard_if[i].data.wis;
rs2_n = scoreboard_if[i].data.rs2;
rs3_n = scoreboard_if[i].data.rs3;
end
STATE_FETCH1: begin
// Capture rs1, then continue with the next operand that still needs a read
// (assumes gpr_rd_data corresponds to the read address registered on the previous
// clock edge -- TODO confirm against VX_dp_ram read timing)
rs1_data_n = gpr_rd_data;
if (~rs2_ready) begin
gpr_rd_rid_n = rs2;
state_n = STATE_FETCH2;
end else if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH2: begin
// Capture rs2, then fetch rs3 if it is still outstanding
rs2_data_n = gpr_rd_data;
if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH3: begin
// Capture rs3; this is always the last operand
rs3_data_n = gpr_rd_data;
data_ready_n = 1;
state_n = STATE_IDLE;
end
endcase
// Writeback-cache update. The entry for the written warp is updated when the
// writeback targets the register already recorded, or when the previously
// recorded writeback had ended (cache_eop) and this one starts a new packet (sop).
if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin
if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd)
|| (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin
// Only the threads enabled in the writeback mask receive new data
for (integer j = 0; j < `NUM_THREADS; ++j) begin
if (writeback_if[i].data.tmask[j]) begin
cache_data_n[writeback_if[i].data.wis][j] = writeback_if[i].data.data[j];
end
end
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
// A packet start replaces the valid-thread mask; later parts of the packet accumulate into it
if (writeback_if[i].data.sop) begin
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
end else begin
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
end
end
end
end
// State registers. Only the control state, the read address, cache_eop, cache_reg
// and data_ready are given reset values; the remaining registers are not reset.
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
gpr_rd_rid <= '0;
gpr_rd_wis <= '0;
cache_eop <= {ISSUE_RATIO{1'b1}};
cache_reg <= '0;
data_ready <= 0;
end else begin
state <= state_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
end
end
// GPR banks
// With GPR_RESET, writes stay disabled until the first clock edge on which reset is asserted
// (wr_enabled starts at 0 and is set to 1 there); without it, writes are always enabled.
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
// One dual-port RAM per thread, each holding `NUM_REGS * ISSUE_RATIO words of XLEN bits.
// Write port: writeback value for thread j when its tmask bit is set.
// Read port: always enabled, addressed by the fetch state machine.
// Addresses are formed by wis_to_addr (not defined in this module; presumably from VX_gpu_pkg).
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
`ifdef GPR_RESET
.INIT_ENABLE (1),
.INIT_VALUE (0),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.read (1'b1),
`UNUSED_PIN (wren),
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)),
.wdata (writeback_if[i].data.data[j]),
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)),
.rdata (gpr_rd_data[j])
);
end
// staging buffer
// Holds the instruction metadata (everything except operand values) while operands are collected
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (scoreboard_if[i].valid),
.ready_in (scoreboard_if[i].ready),
.data_in ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd}),
.data_out ({
staging_if.data.uuid,
staging_if.data.wis,
staging_if.data.tmask,
staging_if.data.PC,
staging_if.data.wb,
staging_if.data.ex_type,
staging_if.data.op_type,
staging_if.data.op_mod,
staging_if.data.use_PC,
staging_if.data.use_imm,
staging_if.data.imm,
staging_if.data.rd}),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
// Operand values are attached to the staged instruction from the collection registers
assign staging_if.data.rs1_data = rs1_data;
assign staging_if.data.rs2_data = rs2_data;
assign staging_if.data.rs3_data = rs3_data;
// output buffer
// The staged instruction is handed to the output buffer only once data_ready is set;
// both directions of the handshake are gated with data_ready.
wire valid_stg, ready_stg;
assign valid_stg = staging_if.valid && data_ready;
assign staging_if.ready = ready_stg && data_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (operands_if[i].data),
.valid_out (operands_if[i].valid),
.ready_out (operands_if[i].ready)
);
end
endmodule

View File

@@ -0,0 +1,79 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Per-warp counter of instructions that were issued but not yet committed.
//
//   incr / incr_wid   : one instruction issued for warp incr_wid
//   decr / decr_wid   : DECR_COUNT commit ports; decr[p] retires one
//                       instruction of warp decr_wid[p]
//   alm_empty_wid     : warp whose almost-empty flag is reported
//   empty             : high when every warp's count is zero
//   alm_empty         : high when the selected warp's count equals ALM_EMPTY
//
// Issue and commit events are registered first and applied to the counter
// one cycle later; the two status flags are registered from the updated
// count.
module VX_pending_instr #(
    parameter CTR_WIDTH  = 12,
    parameter ALM_EMPTY  = 1,
    parameter DECR_COUNT = 1
) (
    input wire                  clk,
    input wire                  reset,
    input wire                  incr,
    input wire [`NW_WIDTH-1:0]  incr_wid,
    input wire [DECR_COUNT-1:0] decr,
    input wire [DECR_COUNT-1:0][`NW_WIDTH-1:0] decr_wid,
    input wire [`NW_WIDTH-1:0]  alm_empty_wid,
    output wire                 empty,
    output wire                 alm_empty
);
    localparam COUNTW = `CLOG2(DECR_COUNT+1);

    reg [`NUM_WARPS-1:0][CTR_WIDTH-1:0] pending_instrs;
    reg [`NUM_WARPS-1:0][COUNTW-1:0]    decr_cnt;
    reg [`NUM_WARPS-1:0]                incr_cnt;
    reg [`NUM_WARPS-1:0]                alm_empty_r, empty_r;

    for (genvar w = 0; w < `NUM_WARPS; ++w) begin
        // issue event addressed to this warp
        wire issue_hit = incr && (incr_wid == `NW_WIDTH'(w));

        // commit events addressed to this warp, one bit per commit port
        wire [DECR_COUNT-1:0] commit_hits;
        for (genvar p = 0; p < DECR_COUNT; ++p) begin
            assign commit_hits[p] = decr[p] && (decr_wid[p] == `NW_WIDTH'(w));
        end

        // number of instructions of this warp retired in the current cycle
        wire [COUNTW-1:0] commit_cnt_n;
        `POP_COUNT(commit_cnt_n, commit_hits);

        // apply the events registered in the previous cycle
        wire [CTR_WIDTH-1:0] pending_n = pending_instrs[w]
                                       + CTR_WIDTH'(incr_cnt[w])
                                       - CTR_WIDTH'(decr_cnt[w]);

        always @(posedge clk) begin
            if (reset) begin
                incr_cnt[w]       <= '0;
                decr_cnt[w]       <= '0;
                pending_instrs[w] <= '0;
                alm_empty_r[w]    <= 0;
                empty_r[w]        <= 1;
            end else begin
                incr_cnt[w]       <= issue_hit;
                decr_cnt[w]       <= commit_cnt_n;
                pending_instrs[w] <= pending_n;
                alm_empty_r[w]    <= (pending_n == ALM_EMPTY);
                empty_r[w]        <= (pending_n == 0);
            end
        end
    end

    assign alm_empty = alm_empty_r[alm_empty_wid];
    assign empty     = (& empty_r);

endmodule

379
hw/rtl/core/VX_schedule.sv Normal file
View File

@@ -0,0 +1,379 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Warp scheduler.
//
// Keeps, for every warp, an active flag, a stall flag, a barrier-stall flag,
// a thread mask and a program counter. Each cycle it selects one warp that
// is active and not stalled, and emits {tmask, PC, wid} on schedule_if.
// The per-warp state is updated by warp-control events (wspawn, tmc, split,
// join, barrier), branch resolutions, decode/CSR unlock requests and by the
// scheduler's own issue activity. It also counts pending instructions to
// derive the busy status and exports cycle/warp information to the CSR unit.
module VX_schedule import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// configuration
input base_dcrs_t base_dcrs,
// inputs
VX_warp_ctl_if.slave warp_ctl_if,
VX_branch_ctl_if.slave branch_ctl_if [`NUM_ALU_BLOCKS],
VX_decode_sched_if.slave decode_sched_if,
VX_commit_sched_if.slave commit_sched_if,
// outputs
VX_schedule_if.master schedule_if,
`ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if,
`endif
VX_sched_csr_if.master sched_csr_if,
// status
output wire busy
);
`UNUSED_PARAM (CORE_ID)
// Per-warp scheduling state (the *_n signals are the combinational next-state values)
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
reg [`NUM_WARPS-1:0] stalled_warps, stalled_warps_n; // set when branch/gpgpu instructions are issued
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n;
// Warp selected for issue this cycle and its handshake into the output buffer
wire [`NW_WIDTH-1:0] schedule_wid;
wire [`NUM_THREADS-1:0] schedule_tmask;
wire [`XLEN-1:0] schedule_pc;
wire schedule_valid;
wire schedule_ready;
// split/join
wire join_valid;
wire join_is_dvg;
wire join_is_else;
wire [`NW_WIDTH-1:0] join_wid;
wire [`NUM_THREADS-1:0] join_tmask;
wire [`XLEN-1:0] join_pc;
// Statistics: cycles counts clock cycles while busy; issued_instrs counts issued instructions per warp
reg [`PERF_CTR_BITS-1:0] cycles;
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
// schedule_fire: a warp enters the output buffer; schedule_if_fire: an entry leaves it
wire schedule_fire = schedule_valid && schedule_ready;
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
// branch
// Flatten the interface array into packed vectors so it can be indexed inside the always block
wire [`NUM_ALU_BLOCKS-1:0] branch_valid;
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
wire [`NUM_ALU_BLOCKS-1:0][`XLEN-1:0] branch_dest;
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
assign branch_valid[i] = branch_ctl_if[i].valid;
assign branch_wid[i] = branch_ctl_if[i].wid;
assign branch_taken[i] = branch_ctl_if[i].taken;
assign branch_dest[i] = branch_ctl_if[i].dest;
end
// barriers
// barrier_masks[b] records which warps are waiting at barrier b; barrier_stalls marks warps held at any barrier
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
wire [`NUM_WARPS-1:0] curr_barrier_mask;
`ifdef GBAR_ENABLE
reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
reg gbar_req_valid;
reg [`NB_WIDTH-1:0] gbar_req_id;
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
`endif
// Waiting set of the barrier addressed by the current warp-control request, and its population count
assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
`POP_COUNT(active_barrier_count, curr_barrier_mask);
`UNUSED_VAR (active_barrier_count)
// Next-state logic. Blocking assignments are used, so when several events
// touch the same warp in one cycle the statement that appears later wins.
always @(*) begin
active_warps_n = active_warps;
stalled_warps_n = stalled_warps;
thread_masks_n = thread_masks;
barrier_masks_n = barrier_masks;
barrier_stalls_n= barrier_stalls;
warp_pcs_n = warp_pcs;
// wspawn handling
// Activates every warp in wmask, enables its thread 0 and sets its PC to the spawn PC
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n |= warp_ctl_if.wspawn.wmask;
for (integer i = 0; i < `NUM_WARPS; ++i) begin
if (warp_ctl_if.wspawn.wmask[i]) begin
thread_masks_n[i][0] = 1;
warp_pcs_n[i] = warp_ctl_if.wspawn.pc;
end
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
// TMC handling
// Replaces the warp's thread mask; an all-zero mask deactivates the warp
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
thread_masks_n[warp_ctl_if.wid] = warp_ctl_if.tmc.tmask;
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
// split handling
// On a divergent split the warp continues with the "then" thread mask
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
if (warp_ctl_if.split.is_dvg) begin
thread_masks_n[warp_ctl_if.wid] = warp_ctl_if.split.then_tmask;
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
// join handling
// On a divergent join the thread mask is replaced with join_tmask; when
// join_is_else is set the PC is additionally redirected to join_pc
if (join_valid) begin
if (join_is_dvg) begin
if (join_is_else) begin
warp_pcs_n[join_wid] = join_pc;
end
thread_masks_n[join_wid] = join_tmask;
end
stalled_warps_n[join_wid] = 0; // unlock warp
end
// barrier handling
`ifdef GBAR_ENABLE
// Waiting set of the addressed barrier with the requesting warp included
curr_barrier_mask_n = curr_barrier_mask;
curr_barrier_mask_n[warp_ctl_if.wid] = 1;
`endif
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
// Local barrier whose waiting count already equals size_m1: the arriving warp
// completes it, so the barrier is cleared and all warps waiting on it are released.
// Otherwise (including every global barrier) the arriving warp is added and held.
if (~warp_ctl_if.barrier.is_global
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
barrier_masks_n[warp_ctl_if.barrier.id] = '0;
barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
end else begin
barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
barrier_stalls_n[warp_ctl_if.wid] = 1;
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
`ifdef GBAR_ENABLE
// Global barrier response matching the outstanding request id
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
barrier_masks_n[gbar_bus_if.rsp_id] = '0;
barrier_stalls_n = '0; // unlock all warps
end
`endif
// Branch handling
// A taken branch redirects the warp PC; any resolved branch unlocks the warp
for (integer i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
if (branch_valid[i]) begin
if (branch_taken[i]) begin
warp_pcs_n[branch_wid[i]] = branch_dest[i];
end
stalled_warps_n[branch_wid[i]] = 0; // unlock warp
end
end
// decode unlock
// Decode releases the warp unless it flags the instruction as one that keeps the warp stalled (is_wstall)
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// stall the warp until decode stage
// Placed after all unlock statements, so for the same warp the new stall takes priority
if (schedule_fire) begin
stalled_warps_n[schedule_wid] = 1;
end
// advance PC
// Placed last, so for the same warp it overrides any PC written above in this cycle
if (schedule_if_fire) begin
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + 4;
end
end
`UNUSED_VAR (base_dcrs)
// State registers
always @(posedge clk) begin
if (reset) begin
barrier_masks <= '0;
`ifdef GBAR_ENABLE
gbar_req_valid <= 0;
`endif
stalled_warps <= '0;
warp_pcs <= '0;
active_warps <= '0;
thread_masks <= '0;
barrier_stalls <= '0;
issued_instrs <= '0;
cycles <= '0;
// activate first warp
// (these later non-blocking assignments override the '0 values above for warp 0:
//  it starts at base_dcrs.startup_addr with only thread 0 enabled)
warp_pcs[0] <= base_dcrs.startup_addr;
active_warps[0] <= 1;
thread_masks[0][0] <= 1;
end else begin
active_warps <= active_warps_n;
stalled_warps <= stalled_warps_n;
thread_masks <= thread_masks_n;
warp_pcs <= warp_pcs_n;
barrier_masks <= barrier_masks_n;
barrier_stalls <= barrier_stalls_n;
// global barrier scheduling
`ifdef GBAR_ENABLE
// A request is raised once the waiting set, with the arriving warp included,
// equals the set of active warps.
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid
&& warp_ctl_if.barrier.is_global
&& (curr_barrier_mask_n == active_warps)) begin
gbar_req_valid <= 1;
gbar_req_id <= warp_ctl_if.barrier.id;
gbar_req_size_m1 <= warp_ctl_if.barrier.size_m1[`NC_WIDTH-1:0];
end
// The request is dropped once the bus accepts it
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
gbar_req_valid <= 0;
end
`endif
if (schedule_if_fire) begin
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
end
if (busy) begin
cycles <= cycles + 1;
end
end
end
// barrier handling
`ifdef GBAR_ENABLE
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_id = gbar_req_id;
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif
// split/join handling
// VX_split_join produces the join_* signals consumed by the next-state logic above
`RESET_RELAY (split_join_reset, reset);
VX_split_join #(
.CORE_ID (CORE_ID)
) split_join (
.clk (clk),
.reset (split_join_reset),
.valid (warp_ctl_if.valid),
.wid (warp_ctl_if.wid),
.split (warp_ctl_if.split),
.sjoin (warp_ctl_if.sjoin),
.join_valid (join_valid),
.join_is_dvg (join_is_dvg),
.join_is_else (join_is_else),
.join_wid (join_wid),
.join_tmask (join_tmask),
.join_pc (join_pc)
);
// schedule the next ready warp
// A warp is ready when it is active and neither stalled nor held at a barrier.
// VX_lzc returns the index of one set bit of ready_warps and whether any bit is set
// (which end it searches from is determined by REVERSE -- see VX_lzc).
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
VX_lzc #(
.N (`NUM_WARPS),
.REVERSE (1)
) wid_select (
.data_in (ready_warps),
.data_out (schedule_wid),
.valid_out (schedule_valid)
);
// Per-warp {thread mask, PC} packed into one vector so the selected entry can be muxed with one index
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
end
// The two slices below are adjacent and together cover the whole selected entry,
// so the result equals schedule_data[schedule_wid] regardless of the constant 4.
assign {schedule_tmask, schedule_pc} = {
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-1:(`NUM_THREADS + `XLEN)-4],
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-5:0]
};
// Instruction UUID: generated through the dpi_uuid_gen DPI call in debug builds, tied to zero with NDEBUG
`ifndef NDEBUG
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
// Warp id made unique across cores: core id shifted left by `NW_BITS plus the local warp id
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
end
end
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
`endif
// Output buffer for {tmask, PC, wid}
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH)
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (schedule_valid),
.ready_in (schedule_ready),
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
.valid_out (schedule_if.valid),
.ready_out (schedule_if.ready)
);
// The UUID drives the interface directly from the instr_uuid register; it does not pass through out_buf
assign schedule_if.data.uuid = instr_uuid;
// Pending-instruction tracking: incremented when an instruction leaves the scheduler,
// decremented by the commit ports; provides the per-warp almost-empty flag for the CSR
// interface and the global empty flag used for busy
`RESET_RELAY (pending_instr_reset, reset);
wire no_pending_instr;
VX_pending_instr #(
.CTR_WIDTH (12),
.DECR_COUNT (`ISSUE_WIDTH),
.ALM_EMPTY (1)
) pending_instr(
.clk (clk),
.reset (pending_instr_reset),
.incr (schedule_if_fire),
.incr_wid (schedule_if.data.wid),
.decr (commit_sched_if.committed),
.decr_wid (commit_sched_if.committed_wid),
.alm_empty_wid (sched_csr_if.alm_empty_wid),
.alm_empty (sched_csr_if.alm_empty),
.empty (no_pending_instr)
);
// busy is derived from "any warp active or any instruction pending"
// (the `BUFFER_BUSY macro is defined elsewhere; presumably it registers the expression)
`BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1);
// export CSRs
assign sched_csr_if.cycles = cycles;
assign sched_csr_if.active_warps = active_warps;
assign sched_csr_if.thread_masks = thread_masks;
// timeout handling
// Once the first non-stalling instruction has been seen at decode (timeout_enable),
// the counter increments on every cycle in which the set of active warps is non-empty
// and identical to the set of stalled warps. It is cleared whenever no warp is active
// or the two sets differ. The assertion below fires when it reaches `STALL_TIMEOUT.
reg [31:0] timeout_ctr;
reg timeout_enable;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= '0;
timeout_enable <= 0;
end else begin
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
timeout_enable <= 1;
end
if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin
timeout_ctr <= timeout_ctr + 1;
end else if (active_warps == 0 || active_warps != stalled_warps) begin
timeout_ctr <= '0;
end
end
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
endmodule

View File

@@ -0,0 +1,139 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Register scoreboard.
// One independent slice is generated per issue slot (`ISSUE_WIDTH slices).
// Each slice parks the incoming ibuffer instruction in a staging buffer and
// only forwards it to scoreboard_if once none of the registers it names
// (rd, rs1, rs2, rs3) is marked in-use. A register is marked in-use when a
// write-back instruction leaves the staging buffer and is cleared when the
// matching writeback arrives with eop set.
module VX_scoreboard import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// register writebacks (release in-use registers)
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
// instructions entering the scoreboard
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
// instructions released by the scoreboard
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
// Width of the instruction payload carried through both elastic buffers.
// NOTE(review): presumably has to equal the packed width of VX_ibuffer_if.data - confirm against the interface.
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// in-use bit per register, indexed by [wis][register]
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
// readiness of the staged instruction's operands, bit order is {rd, rs1, rs2, rs3}
reg [3:0] ready_masks, ready_masks_n;
VX_ibuffer_if staging_if();
// only the packet flagged eop releases the destination register
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
// Next-state logic. The three updates below are applied in order, so a later
// one overrides an earlier one within the same cycle.
always @(*) begin
inuse_regs_n = inuse_regs;
ready_masks_n = ready_masks;
if (writeback_fire) begin
// release the written register and wake up any staged operand that was waiting on it
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
& {(writeback_if[i].data.rd == staging_if.data.rd),
(writeback_if[i].data.rd == staging_if.data.rs1),
(writeback_if[i].data.rd == staging_if.data.rs2),
(writeback_if[i].data.rd == staging_if.data.rs3)};
end
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
// staged instruction is leaving and writes a register: reserve its destination
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
ready_masks_n = '0;
end
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
// new instruction enters staging: sample its operands against the
// already-updated in-use table (includes this cycle's release/reserve)
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
end
end
// state registers; both tables are cleared by reset
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
ready_masks <= '0;
end else begin
inuse_regs <= inuse_regs_n;
ready_masks <= ready_masks_n;
end
end
// staging buffer
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (ibuffer_if[i].valid),
.ready_in (ibuffer_if[i].ready),
.data_in (ibuffer_if[i].data),
.data_out (staging_if.data),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
// output buffer
wire valid_stg, ready_stg;
// the staged instruction may leave only when all four operand bits are ready
wire regs_ready = (& ready_masks);
assign valid_stg = staging_if.valid && regs_ready;
assign staging_if.ready = ready_stg && regs_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (scoreboard_if[i].data),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready)
);
// Stall watchdog: counts cycles in which a staged instruction is blocked on
// its registers; cleared only when a staged instruction is handed off.
reg [31:0] timeout_ctr;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= '0;
end else begin
if (staging_if.valid && ~regs_ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (staging_if.valid && staging_if.ready) begin
timeout_ctr <= '0;
end
end
end
// the stall counter must stay below `STALL_TIMEOUT
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid));
// a writeback must target a register that is currently marked in-use
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
end
endmodule

209
hw/rtl/core/VX_sfu_unit.sv Normal file
View File

@@ -0,0 +1,209 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Special function unit.
// Dispatches instructions from the issue slots into a single execute stream,
// steers each instruction to either the warp-control unit or the CSR unit
// based on its op_type, arbitrates the two response streams and gathers the
// result back onto the per-issue-slot commit interfaces.
module VX_sfu_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
input base_dcrs_t base_dcrs,
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS],
`endif
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_commit_csr_if.slave commit_csr_if,
VX_sched_csr_if.slave sched_csr_if,
VX_warp_ctl_if.master warp_ctl_if
);
`UNUSED_PARAM (CORE_ID)
// a single execute block serves all issue slots
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
// number of bits needed to index the NUM_LANES-wide packets of a warp
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// width of one response entry presented to the response arbiter
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
// two response sources: warp control (index 0) and CSR (index 1)
localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSR = 1;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (1)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
);
// response arbiter inputs, one entry per sub-unit
wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in;
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
`ifdef PERF_ENABLE
VX_sfu_perf_if sfu_perf_if();
`endif
// Warp control block
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) wctl_execute_if();
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) wctl_commit_if();
// the request payload is shared; only the valid is qualified by the op_type class
assign wctl_execute_if.valid = execute_if[0].valid && `INST_SFU_IS_WCTL(execute_if[0].data.op_type);
assign wctl_execute_if.data = execute_if[0].data;
`RESET_RELAY (wctl_reset, reset);
VX_wctl_unit #(
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) wctl_unit (
.clk (clk),
.reset (wctl_reset),
.execute_if (wctl_execute_if),
.warp_ctl_if(warp_ctl_if),
.commit_if (wctl_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.data;
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
// CSR unit
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) csr_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) csr_commit_if();
// same shared payload, valid qualified by the CSR op_type class
assign csr_execute_if.valid = execute_if[0].valid && `INST_SFU_IS_CSR(execute_if[0].data.op_type);
assign csr_execute_if.data = execute_if[0].data;
`RESET_RELAY (csr_reset, reset);
VX_csr_unit #(
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) csr_unit (
.clk (clk),
.reset (csr_reset),
.base_dcrs (base_dcrs),
.execute_if (csr_execute_if),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`endif
.sched_csr_if (sched_csr_if),
.commit_csr_if (commit_csr_if),
.commit_if (csr_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR];
// can accept new request?
// back-pressure comes from the sub-unit that handles the current opcode:
// the three CSR opcodes use the CSR unit's ready, every other opcode uses
// the warp-control unit's ready
reg sfu_req_ready;
always @(*) begin
case (execute_if[0].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready;
endcase
end
assign execute_if[0].ready = sfu_req_ready;
// response arbitration
// commit_reset is shared by the arbiter and the gather unit below
`RESET_RELAY (commit_reset, reset);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) arb_commit_if[BLOCK_SIZE]();
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("R"),
.OUT_REG (1)
) rsp_arb (
.clk (clk),
.reset (commit_reset),
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),
.data_out (arb_commit_if[0].data),
.valid_out (arb_commit_if[0].valid),
.ready_out (arb_commit_if[0].ready),
`UNUSED_PIN (sel_out)
);
// fan the single arbitrated commit stream back out to the issue slots
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (3)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.commit_in_if (arb_commit_if),
.commit_out_if (commit_if)
);
`ifdef PERF_ENABLE
// counts cycles in which a warp-control request is valid but not accepted
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
always @(posedge clk) begin
if (reset) begin
perf_wctl_stalls <= '0;
end else begin
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
end
end
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
`endif
endmodule

124
hw/rtl/core/VX_smem_unit.sv Normal file
View File

@@ -0,0 +1,124 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Shared memory unit.
// Sits on the data-cache request path: for every request lane a two-way
// switch splits the incoming bus into output 0, which is forwarded unchanged
// to dcache_bus_out_if, and output 1, which is wired to the VX_shared_mem
// instance.
module VX_smem_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
// requests coming from the core side
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
// requests forwarded on to the data cache
VX_mem_bus_if.master dcache_bus_out_if [DCACHE_NUM_REQS]
);
`UNUSED_PARAM (CORE_ID)
// word-address width of the shared memory (log2 size in bytes minus log2 word size)
localparam SMEM_ADDR_WIDTH = `SMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
// flattened shared-memory request channel, one entry per lane
wire [DCACHE_NUM_REQS-1:0] smem_req_valid;
wire [DCACHE_NUM_REQS-1:0] smem_req_rw;
wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag;
wire [DCACHE_NUM_REQS-1:0] smem_req_ready;
// flattened shared-memory response channel, one entry per lane
wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag;
wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready;
`RESET_RELAY (smem_reset, reset);
VX_shared_mem #(
.INSTANCE_ID($sformatf("core%0d-smem", CORE_ID)),
.SIZE (1 << `SMEM_LOG_SIZE),
.NUM_REQS (DCACHE_NUM_REQS),
.NUM_BANKS (`SMEM_NUM_BANKS),
.WORD_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (SMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) shared_mem (
.clk (clk),
.reset (smem_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
// Core request
.req_valid (smem_req_valid),
.req_rw (smem_req_rw),
.req_byteen (smem_req_byteen),
.req_addr (smem_req_addr),
.req_data (smem_req_data),
.req_tag (smem_req_tag),
.req_ready (smem_req_ready),
// Core response
.rsp_valid (smem_rsp_valid),
.rsp_data (smem_rsp_data),
.rsp_tag (smem_rsp_tag),
.rsp_ready (smem_rsp_ready)
);
// switch outputs: index 2*i goes to the dcache, index 2*i+1 goes to shared memory
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
// one relayed reset is shared by all per-lane switches
`RESET_RELAY (switch_reset, reset);
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
// unpack the shared-memory side of the switch into the flat request/response arrays
assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid;
assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw;
assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen;
assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data;
assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag;
assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i];
assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i];
assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i];
assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i];
assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready;
// only the low SMEM_ADDR_WIDTH address bits reach the shared memory
assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0];
// NOTE(review): the switch takes the DCACHE_TAG_WIDTH tag on its input while its
// outputs carry DCACHE_NOSM_TAG_WIDTH; presumably TAG_SEL_IDX picks the tag bit
// that selects the output - confirm against VX_smem_switch.
VX_smem_switch #(
.NUM_REQS (2),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("P"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) smem_switch (
.clk (clk),
.reset (switch_reset),
.bus_in_if (dcache_bus_in_if[i]),
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
);
end
// this bus goes to the dcache
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2]);
end
endmodule

View File

@@ -0,0 +1,76 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Split/join divergence tracking.
// Keeps one VX_ipdom_stack per warp. A divergent split pushes two entries
// onto the addressed warp's stack; a divergent join pops it. The join result
// (selected warp's stack output) is registered and presented one cycle later
// on the join_* outputs.
module VX_split_join import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// qualifies split/sjoin for this cycle
input wire valid,
// warp the split/join applies to
input wire [`NW_WIDTH-1:0] wid,
input split_t split,
input join_t sjoin,
// registered join result
output wire join_valid,
output wire join_is_dvg,
output wire join_is_else,
output wire [`NW_WIDTH-1:0] join_wid,
output wire [`NUM_THREADS-1:0] join_tmask,
output wire [`XLEN-1:0] join_pc
);
`UNUSED_PARAM (CORE_ID)
// per-warp stack outputs; each entry is packed as {thread mask, PC}
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire ipdom_set [`NUM_WARPS-1:0];
// first pushed entry: union of both thread masks with a zero PC
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q0 = {split.then_tmask | split.else_tmask, `XLEN'(0)};
// second pushed entry: else-path thread mask with the split's next PC
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q1 = {split.else_tmask, split.next_pc};
// the stack is only touched when the split/join is flagged as divergent
wire ipdom_push = valid && split.valid && split.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin.is_dvg;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (ipdom_reset, reset);
VX_ipdom_stack #(
.WIDTH (`XLEN+`NUM_THREADS),
.DEPTH (`UP(`NUM_THREADS-1))
) ipdom_stack (
.clk (clk),
.reset (ipdom_reset),
// push/pop are decoded per warp from wid
.push (ipdom_push && (i == wid)),
.pop (ipdom_pop && (i == wid)),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),
.d_set (ipdom_set[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
// Output register. Only the valid bit is reset (RESETW = 1); the order of the
// concatenations must match: ipdom_data[wid] unpacks into {join_tmask, join_pc}.
// NOTE(review): join_is_else is driven from the stack's d_set output - its exact
// meaning is defined by VX_ipdom_stack, confirm there.
VX_pipe_register #(
.DATAW (1 + 1 + `NW_WIDTH + 1 + `XLEN + `NUM_THREADS),
.DEPTH (1),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({valid && sjoin.valid, sjoin.is_dvg, ipdom_set[wid], wid, ipdom_data[wid]}),
.data_out ({join_valid, join_is_dvg, join_is_else, join_wid, join_tmask, join_pc})
);
endmodule

379
hw/rtl/core/VX_trace.vh Normal file
View File

@@ -0,0 +1,379 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TRACE_VH
`define VX_TRACE_VH
`ifndef SYNTHESIS
`include "VX_define.vh"
// Emit the short name of an execution unit ("ALU", "LSU", "FPU", "SFU")
// through the TRACE macro at the given trace level. Any value that is not
// one of the four known unit codes is printed as "?".
//   level   - trace verbosity level handed to TRACE
//   ex_type - execution unit code to print
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
    if (ex_type == (`EX_ALU)) begin
        `TRACE(level, ("ALU"));
    end else if (ex_type == (`EX_LSU)) begin
        `TRACE(level, ("LSU"));
    end else if (ex_type == (`EX_FPU)) begin
        `TRACE(level, ("FPU"));
    end else if (ex_type == (`EX_SFU)) begin
        `TRACE(level, ("SFU"));
    end else begin
        // unknown (or X/Z) unit code
        `TRACE(level, ("?"));
    end
endtask
// Emit the assembly mnemonic of a decoded instruction through the TRACE
// macro at the given trace level. Anything that cannot be decoded prints "?".
//   level   - trace verbosity level handed to TRACE
//   ex_type - execution unit code; selects the ALU/LSU/FPU/SFU decode table
//   op_type - operation code within the selected unit
//   op_mod  - operation modifier (branch / M-extension / W-variant flags for
//             the ALU, sub-operation selector for FPU compare and misc ops)
//   rd, rs2 - register indices; with EXT_F_ENABLE, a set bit 5 on either one
//             selects the floating-point load/store mnemonics
//   use_imm - selects the immediate form of ALU and CSR mnemonics
//   imm     - with FLEN_64, bit 0 selects the ".D" mnemonics;
//             with XLEN_64, bit 1 selects the ".L" conversions
task trace_ex_op(input int level,
                 input [`EX_BITS-1:0] ex_type,
                 input [`INST_OP_BITS-1:0] op_type,
                 input [`INST_MOD_BITS-1:0] op_mod,
                 `UNUSED_ARG(input [`NR_BITS-1:0] rd),
                 `UNUSED_ARG(input [`NR_BITS-1:0] rs2),
                 input use_imm,
                 `UNUSED_ARG(input [`XLEN-1:0] imm)
);
    // These locals are deliberately declared WITHOUT initializers and assigned
    // procedurally below. This task has no explicit lifetime, so unless the
    // enclosing scope is automatic its locals are static, and a declaration
    // initializer on a static variable runs only once at simulation start
    // rather than on every call.
    logic fdst_d;
    logic fcvt_l;
    logic rd_float;

`ifdef FLEN_64
    fdst_d = imm[0];
`else
    fdst_d = 0;
`endif
`ifdef XLEN_64
    fcvt_l = imm[1];
`else
    fcvt_l = 0;
`endif
`ifdef EXT_F_ENABLE
    rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
`else
    rd_float = 0;
`endif

    case (ex_type)
    `EX_ALU: begin
        if (`INST_ALU_IS_BR(op_mod)) begin
            // branches, jumps and system returns
            case (`INST_BR_BITS'(op_type))
                `INST_BR_EQ:    `TRACE(level, ("BEQ"));
                `INST_BR_NE:    `TRACE(level, ("BNE"));
                `INST_BR_LT:    `TRACE(level, ("BLT"));
                `INST_BR_GE:    `TRACE(level, ("BGE"));
                `INST_BR_LTU:   `TRACE(level, ("BLTU"));
                `INST_BR_GEU:   `TRACE(level, ("BGEU"));
                `INST_BR_JAL:   `TRACE(level, ("JAL"));
                `INST_BR_JALR:  `TRACE(level, ("JALR"));
                `INST_BR_ECALL: `TRACE(level, ("ECALL"));
                `INST_BR_EBREAK:`TRACE(level, ("EBREAK"));
                `INST_BR_URET:  `TRACE(level, ("URET"));
                `INST_BR_SRET:  `TRACE(level, ("SRET"));
                `INST_BR_MRET:  `TRACE(level, ("MRET"));
                default:        `TRACE(level, ("?"));
            endcase
        end else if (`INST_ALU_IS_M(op_mod)) begin
            // multiply / divide
            if (`INST_ALU_IS_W(op_mod)) begin
                case (`INST_M_BITS'(op_type))
                    `INST_M_MUL:  `TRACE(level, ("MULW"));
                    `INST_M_DIV:  `TRACE(level, ("DIVW"));
                    `INST_M_DIVU: `TRACE(level, ("DIVUW"));
                    `INST_M_REM:  `TRACE(level, ("REMW"));
                    `INST_M_REMU: `TRACE(level, ("REMUW"));
                    default:      `TRACE(level, ("?"));
                endcase
            end else begin
                case (`INST_M_BITS'(op_type))
                    `INST_M_MUL:   `TRACE(level, ("MUL"));
                    `INST_M_MULH:  `TRACE(level, ("MULH"));
                    `INST_M_MULHSU:`TRACE(level, ("MULHSU"));
                    `INST_M_MULHU: `TRACE(level, ("MULHU"));
                    `INST_M_DIV:   `TRACE(level, ("DIV"));
                    `INST_M_DIVU:  `TRACE(level, ("DIVU"));
                    `INST_M_REM:   `TRACE(level, ("REM"));
                    `INST_M_REMU:  `TRACE(level, ("REMU"));
                    default:       `TRACE(level, ("?"));
                endcase
            end
        end else begin
            // integer arithmetic / logic
            if (`INST_ALU_IS_W(op_mod)) begin
                if (use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                        `INST_ALU_ADD: `TRACE(level, ("ADDIW"));
                        `INST_ALU_SLL: `TRACE(level, ("SLLIW"));
                        `INST_ALU_SRL: `TRACE(level, ("SRLIW"));
                        `INST_ALU_SRA: `TRACE(level, ("SRAIW"));
                        default:       `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                        `INST_ALU_ADD: `TRACE(level, ("ADDW"));
                        `INST_ALU_SUB: `TRACE(level, ("SUBW"));
                        `INST_ALU_SLL: `TRACE(level, ("SLLW"));
                        `INST_ALU_SRL: `TRACE(level, ("SRLW"));
                        `INST_ALU_SRA: `TRACE(level, ("SRAW"));
                        default:       `TRACE(level, ("?"));
                    endcase
                end
            end else begin
                if (use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                        `INST_ALU_ADD:   `TRACE(level, ("ADDI"));
                        `INST_ALU_SLL:   `TRACE(level, ("SLLI"));
                        `INST_ALU_SRL:   `TRACE(level, ("SRLI"));
                        `INST_ALU_SRA:   `TRACE(level, ("SRAI"));
                        `INST_ALU_SLT:   `TRACE(level, ("SLTI"));
                        `INST_ALU_SLTU:  `TRACE(level, ("SLTIU"));
                        `INST_ALU_XOR:   `TRACE(level, ("XORI"));
                        `INST_ALU_OR:    `TRACE(level, ("ORI"));
                        `INST_ALU_AND:   `TRACE(level, ("ANDI"));
                        `INST_ALU_LUI:   `TRACE(level, ("LUI"));
                        `INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
                        default:         `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                        `INST_ALU_ADD:  `TRACE(level, ("ADD"));
                        `INST_ALU_SUB:  `TRACE(level, ("SUB"));
                        `INST_ALU_SLL:  `TRACE(level, ("SLL"));
                        `INST_ALU_SRL:  `TRACE(level, ("SRL"));
                        `INST_ALU_SRA:  `TRACE(level, ("SRA"));
                        `INST_ALU_SLT:  `TRACE(level, ("SLT"));
                        `INST_ALU_SLTU: `TRACE(level, ("SLTU"));
                        `INST_ALU_XOR:  `TRACE(level, ("XOR"));
                        `INST_ALU_OR:   `TRACE(level, ("OR"));
                        `INST_ALU_AND:  `TRACE(level, ("AND"));
                        default:        `TRACE(level, ("?"));
                    endcase
                end
            end
        end
    end
    `EX_LSU: begin
        if (rd_float) begin
            // floating-point loads / stores
            case (`INST_LSU_BITS'(op_type))
                `INST_LSU_LW: `TRACE(level, ("FLW"));
                `INST_LSU_LD: `TRACE(level, ("FLD"));
                `INST_LSU_SW: `TRACE(level, ("FSW"));
                `INST_LSU_SD: `TRACE(level, ("FSD"));
                default:      `TRACE(level, ("?"));
            endcase
        end else begin
            case (`INST_LSU_BITS'(op_type))
                `INST_LSU_LB: `TRACE(level, ("LB"));
                `INST_LSU_LH: `TRACE(level, ("LH"));
                `INST_LSU_LW: `TRACE(level, ("LW"));
                `INST_LSU_LD: `TRACE(level, ("LD"));
                `INST_LSU_LBU:`TRACE(level, ("LBU"));
                `INST_LSU_LHU:`TRACE(level, ("LHU"));
                `INST_LSU_LWU:`TRACE(level, ("LWU"));
                `INST_LSU_SB: `TRACE(level, ("SB"));
                `INST_LSU_SH: `TRACE(level, ("SH"));
                `INST_LSU_SW: `TRACE(level, ("SW"));
                `INST_LSU_SD: `TRACE(level, ("SD"));
                `INST_LSU_FENCE:`TRACE(level,("FENCE"));
                default:      `TRACE(level, ("?"));
            endcase
        end
    end
    `EX_FPU: begin
        case (`INST_FPU_BITS'(op_type))
            `INST_FPU_ADD: begin
                if (fdst_d)
                    `TRACE(level, ("FADD.D"));
                else
                    `TRACE(level, ("FADD.S"));
            end
            `INST_FPU_SUB: begin
                if (fdst_d)
                    `TRACE(level, ("FSUB.D"));
                else
                    `TRACE(level, ("FSUB.S"));
            end
            `INST_FPU_MUL: begin
                if (fdst_d)
                    `TRACE(level, ("FMUL.D"));
                else
                    `TRACE(level, ("FMUL.S"));
            end
            `INST_FPU_DIV: begin
                if (fdst_d)
                    `TRACE(level, ("FDIV.D"));
                else
                    `TRACE(level, ("FDIV.S"));
            end
            `INST_FPU_SQRT: begin
                if (fdst_d)
                    `TRACE(level, ("FSQRT.D"));
                else
                    `TRACE(level, ("FSQRT.S"));
            end
            `INST_FPU_MADD: begin
                if (fdst_d)
                    `TRACE(level, ("FMADD.D"));
                else
                    `TRACE(level, ("FMADD.S"));
            end
            `INST_FPU_MSUB: begin
                if (fdst_d)
                    `TRACE(level, ("FMSUB.D"));
                else
                    `TRACE(level, ("FMSUB.S"));
            end
            `INST_FPU_NMADD: begin
                if (fdst_d)
                    `TRACE(level, ("FNMADD.D"));
                else
                    `TRACE(level, ("FNMADD.S"));
            end
            `INST_FPU_NMSUB: begin
                if (fdst_d)
                    `TRACE(level, ("FNMSUB.D"));
                else
                    `TRACE(level, ("FNMSUB.S"));
            end
            `INST_FPU_CMP: begin
                // the low two modifier bits select the comparison
                if (fdst_d) begin
                    case (op_mod[1:0])
                        0: `TRACE(level, ("FLE.D"));
                        1: `TRACE(level, ("FLT.D"));
                        2: `TRACE(level, ("FEQ.D"));
                        default: `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (op_mod[1:0])
                        0: `TRACE(level, ("FLE.S"));
                        1: `TRACE(level, ("FLT.S"));
                        2: `TRACE(level, ("FEQ.S"));
                        default: `TRACE(level, ("?"));
                    endcase
                end
            end
            `INST_FPU_F2F: begin
                if (fdst_d) begin
                    `TRACE(level, ("FCVT.D.S"));
                end else begin
                    `TRACE(level, ("FCVT.S.D"));
                end
            end
            `INST_FPU_F2I: begin
                if (fdst_d) begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.L.D"));
                    end else begin
                        `TRACE(level, ("FCVT.W.D"));
                    end
                end else begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.L.S"));
                    end else begin
                        `TRACE(level, ("FCVT.W.S"));
                    end
                end
            end
            `INST_FPU_F2U: begin
                if (fdst_d) begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.LU.D"));
                    end else begin
                        `TRACE(level, ("FCVT.WU.D"));
                    end
                end else begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.LU.S"));
                    end else begin
                        `TRACE(level, ("FCVT.WU.S"));
                    end
                end
            end
            `INST_FPU_I2F: begin
                if (fdst_d) begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.D.L"));
                    end else begin
                        `TRACE(level, ("FCVT.D.W"));
                    end
                end else begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.S.L"));
                    end else begin
                        `TRACE(level, ("FCVT.S.W"));
                    end
                end
            end
            `INST_FPU_U2F: begin
                if (fdst_d) begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.D.LU"));
                    end else begin
                        `TRACE(level, ("FCVT.D.WU"));
                    end
                end else begin
                    if (fcvt_l) begin
                        `TRACE(level, ("FCVT.S.LU"));
                    end else begin
                        `TRACE(level, ("FCVT.S.WU"));
                    end
                end
            end
            `INST_FPU_MISC: begin
                // the modifier selects the miscellaneous operation; the default
                // arm keeps an unexpected or X modifier visible in the trace
                if (fdst_d) begin
                    case (op_mod)
                        0: `TRACE(level, ("FSGNJ.D"));
                        1: `TRACE(level, ("FSGNJN.D"));
                        2: `TRACE(level, ("FSGNJX.D"));
                        3: `TRACE(level, ("FCLASS.D"));
                        4: `TRACE(level, ("FMV.X.D"));
                        5: `TRACE(level, ("FMV.D.X"));
                        6: `TRACE(level, ("FMIN.D"));
                        7: `TRACE(level, ("FMAX.D"));
                        default: `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (op_mod)
                        0: `TRACE(level, ("FSGNJ.S"));
                        1: `TRACE(level, ("FSGNJN.S"));
                        2: `TRACE(level, ("FSGNJX.S"));
                        3: `TRACE(level, ("FCLASS.S"));
                        4: `TRACE(level, ("FMV.X.S"));
                        5: `TRACE(level, ("FMV.S.X"));
                        6: `TRACE(level, ("FMIN.S"));
                        7: `TRACE(level, ("FMAX.S"));
                        default: `TRACE(level, ("?"));
                    endcase
                end
            end
            default: `TRACE(level, ("?"));
        endcase
    end
    `EX_SFU: begin
        case (`INST_SFU_BITS'(op_type))
            `INST_SFU_TMC:   `TRACE(level, ("TMC"));
            `INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
            `INST_SFU_SPLIT: `TRACE(level, ("SPLIT"));
            `INST_SFU_JOIN:  `TRACE(level, ("JOIN"));
            `INST_SFU_BAR:   `TRACE(level, ("BAR"));
            `INST_SFU_PRED:  `TRACE(level, ("PRED"));
            `INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
            `INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
            `INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
            default:         `TRACE(level, ("?"));
        endcase
    end
    default: `TRACE(level, ("?"));
    endcase
endtask
// Emit the symbolic name of a base device configuration register address
// through the TRACE macro at the given trace level. Addresses other than
// the three known ones are printed as "?".
//   level - trace verbosity level handed to TRACE
//   addr  - DCR address to print
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
    if (addr == (`VX_DCR_BASE_STARTUP_ADDR0)) begin
        `TRACE(level, ("STARTUP_ADDR0"));
    end else if (addr == (`VX_DCR_BASE_STARTUP_ADDR1)) begin
        `TRACE(level, ("STARTUP_ADDR1"));
    end else if (addr == (`VX_DCR_BASE_MPM_CLASS)) begin
        `TRACE(level, ("MPM_CLASS"));
    end else begin
        // unknown (or X/Z) address
        `TRACE(level, ("?"));
    end
endtask
`endif
`endif // VX_TRACE_VH

157
hw/rtl/core/VX_wctl_unit.sv Normal file
View File

@@ -0,0 +1,157 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Warp control unit.
// Decodes the warp-control instructions (TMC, PRED, WSPAWN, SPLIT, JOIN, BAR)
// from the execute stream into tmc/wspawn/split/sjoin/barrier records, buffers
// them together with the commit metadata, and presents them on warp_ctl_if
// when the buffered entry is committed.
module VX_wctl_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
// number of bits needed to index the NUM_LANES-wide packets of a warp
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// combined width of the five warp-control records stored in the response buffer
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
// response buffer entry: commit metadata followed by the warp-control records
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data)
// decoded records (combinational) and their buffered copies (_r)
tmc_t tmc, tmc_r;
wspawn_t wspawn, wspawn_r;
split_t split, split_r;
join_t sjoin, sjoin_r;
barrier_t barrier, barrier_r;
// opcode decode
wire is_wspawn = (execute_if.data.op_type == `INST_SFU_WSPAWN);
wire is_tmc = (execute_if.data.op_type == `INST_SFU_TMC);
wire is_pred = (execute_if.data.op_type == `INST_SFU_PRED);
wire is_split = (execute_if.data.op_type == `INST_SFU_SPLIT);
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
// lane whose operands are used for the scalar rs1/rs2 values below
wire [LANE_WIDTH-1:0] tid;
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
assign tid = 0;
end
wire [`XLEN-1:0] rs1_data = execute_if.data.rs1_data[tid];
wire [`XLEN-1:0] rs2_data = execute_if.data.rs2_data[tid];
`UNUSED_VAR (rs1_data)
// per-lane predicate: bit 0 of each lane's rs1 value
wire [NUM_LANES-1:0] taken;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign taken[i] = execute_if.data.rs1_data[i][0];
end
// Full-warp then/else thread masks, assembled one NUM_LANES-wide slice at a
// time: the masks are cleared on the packet flagged sop and the slice selected
// by pid is then overwritten with the current packet's lanes.
reg [`NUM_THREADS-1:0] then_tmask_r, then_tmask_n;
reg [`NUM_THREADS-1:0] else_tmask_r, else_tmask_n;
always @(*) begin
then_tmask_n = then_tmask_r;
else_tmask_n = else_tmask_r;
if (execute_if.data.sop) begin
then_tmask_n = '0;
else_tmask_n = '0;
end
then_tmask_n[execute_if.data.pid * NUM_LANES +: NUM_LANES] = taken & execute_if.data.tmask;
else_tmask_n[execute_if.data.pid * NUM_LANES +: NUM_LANES] = ~taken & execute_if.data.tmask;
end
// the accumulated masks have no reset; they are updated whenever a request is valid
always @(posedge clk) begin
if (execute_if.valid) begin
then_tmask_r <= then_tmask_n;
else_tmask_r <= else_tmask_n;
end
end
wire has_then = (then_tmask_n != 0);
wire has_else = (else_tmask_n != 0);
// tmc / pred
// PRED uses the accumulated then-mask when it is non-empty, otherwise the mask in rs2;
// TMC takes its mask directly from rs1
wire [`NUM_THREADS-1:0] pred_mask = has_then ? then_tmask_n : rs2_data[`NUM_THREADS-1:0];
assign tmc.valid = (is_tmc || is_pred);
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// split
// divergent only when both the then and the else masks are non-empty
assign split.valid = is_split;
assign split.is_dvg = has_then && has_else;
assign split.then_tmask = then_tmask_n;
assign split.else_tmask = else_tmask_n;
assign split.next_pc = execute_if.data.PC + 4;
// join
assign sjoin.valid = is_join;
assign sjoin.is_dvg = rs1_data[0];
// barrier
assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_WIDTH-1:0];
`ifdef GBAR_ENABLE
assign barrier.is_global = rs1_data[31];
`else
assign barrier.is_global = 1'b0;
`endif
// rs2 holds a count; the record stores it minus one
assign barrier.size_m1 = rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
// wspawn
// selects every warp with an index below the count in rs1, except the issuing warp
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = rs2_data;
// response
// data_in and data_out must list the fields in the same order
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (execute_if.valid),
.ready_in (execute_if.ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
// the warp-control request is issued when the commit handshake completes on the packet flagged eop
assign warp_ctl_if.valid = commit_if.valid && commit_if.ready && commit_if.data.eop;
assign warp_ctl_if.wid = commit_if.data.wid;
assign warp_ctl_if.tmc = tmc_r;
assign warp_ctl_if.wspawn = wspawn_r;
assign warp_ctl_if.split = split_r;
assign warp_ctl_if.sjoin = sjoin_r;
assign warp_ctl_if.barrier = barrier_r;
// every lane's result value is the split's divergence flag, zero-extended to XLEN
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = `XLEN'(split_r.is_dvg);
end
endmodule