+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
380 lines
13 KiB
Systemverilog
380 lines
13 KiB
Systemverilog
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

`include "VX_define.vh"

// VX_schedule
//
// Warp scheduler. For every warp it keeps an active bit, a stall bit, a
// barrier-stall bit, a thread mask and a PC. Each cycle one warp out of
// `ready_warps` (active, not stalled, not waiting on a barrier) is offered to
// an elastic buffer whose output drives schedule_if with {tmask, PC, wid}.
//
// Parameters:
//   CORE_ID          - core index; forwarded to VX_split_join and used for the
//                      global-barrier request, the debug UUID and the timeout
//                      message.
//
// Ports:
//   clk, reset       - clock and synchronous reset.
//   base_dcrs        - configuration; only `startup_addr` is read here (reset
//                      PC of warp 0).
//   warp_ctl_if      - wspawn / tmc / split / sjoin / barrier requests.
//   branch_ctl_if    - one branch result per ALU block (valid, wid, taken, dest).
//   decode_sched_if  - decode feedback used to unlock a warp.
//   commit_sched_if  - commit notifications feeding the pending-instruction
//                      counter.
//   schedule_if      - scheduled {uuid, tmask, PC, wid} output.
//   gbar_bus_if      - global barrier request/response (GBAR_ENABLE only).
//   sched_csr_if     - CSR-side unlock request and exported counters/state.
//   busy             - derived (through `BUFFER_BUSY) from
//                      "any warp active or instructions still pending".
module VX_schedule import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    input wire              clk,
    input wire              reset,

    // configuration
    input base_dcrs_t       base_dcrs,

    // inputs
    VX_warp_ctl_if.slave    warp_ctl_if,
    VX_branch_ctl_if.slave  branch_ctl_if [`NUM_ALU_BLOCKS],
    VX_decode_sched_if.slave decode_sched_if,
    VX_commit_sched_if.slave commit_sched_if,

    // outputs
    VX_schedule_if.master   schedule_if,
`ifdef GBAR_ENABLE
    VX_gbar_bus_if.master   gbar_bus_if,
`endif
    VX_sched_csr_if.master  sched_csr_if,

    // status
    output wire             busy
);
    `UNUSED_PARAM (CORE_ID)

    // Per-warp state. The *_n signals are the combinational next-state values
    // computed in the always @(*) block below and registered on the clock edge.
    reg [`NUM_WARPS-1:0] active_warps, active_warps_n;   // updated when a warp is activated or disabled
    reg [`NUM_WARPS-1:0] stalled_warps, stalled_warps_n; // set when branch/gpgpu instructions are issued

    reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
    reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n;

    // Warp selected this cycle and its handshake with the output buffer.
    wire [`NW_WIDTH-1:0]    schedule_wid;
    wire [`NUM_THREADS-1:0] schedule_tmask;
    wire [`XLEN-1:0]        schedule_pc;
    wire                    schedule_valid;
    wire                    schedule_ready;

    // split/join
    wire                    join_valid;
    wire                    join_is_dvg;
    wire                    join_is_else;
    wire [`NW_WIDTH-1:0]    join_wid;
    wire [`NUM_THREADS-1:0] join_tmask;
    wire [`XLEN-1:0]        join_pc;

    reg [`PERF_CTR_BITS-1:0] cycles;

    reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;

    // schedule_fire:    a warp was accepted by the output buffer (buffer input side).
    // schedule_if_fire: a buffered entry was accepted downstream (buffer output side).
    wire schedule_fire    = schedule_valid && schedule_ready;
    wire schedule_if_fire = schedule_if.valid && schedule_if.ready;

    // branch
    // Flatten the interface array into packed vectors so the procedural loop
    // below can index them with a run-time loop variable.
    wire [`NUM_ALU_BLOCKS-1:0]                  branch_valid;
    wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0]   branch_wid;
    wire [`NUM_ALU_BLOCKS-1:0]                  branch_taken;
    wire [`NUM_ALU_BLOCKS-1:0][`XLEN-1:0]       branch_dest;
    for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
        assign branch_valid[i] = branch_ctl_if[i].valid;
        assign branch_wid[i]   = branch_ctl_if[i].wid;
        assign branch_taken[i] = branch_ctl_if[i].taken;
        assign branch_dest[i]  = branch_ctl_if[i].dest;
    end

    // barriers
    reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
    reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
    wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
    wire [`NUM_WARPS-1:0] curr_barrier_mask;
`ifdef GBAR_ENABLE
    reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
    reg gbar_req_valid;
    reg [`NB_WIDTH-1:0] gbar_req_id;
    reg [`NC_WIDTH-1:0] gbar_req_size_m1;
`endif

    // Mask of the barrier addressed by the current request, and the number of
    // bits set in it. Only the low NW_WIDTH bits of the count are compared
    // below, hence the UNUSED_VAR on the full vector.
    assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
    `POP_COUNT(active_barrier_count, curr_barrier_mask);
    `UNUSED_VAR (active_barrier_count)

    // Next-state logic. Blocking assignments are evaluated top to bottom, so
    // when several events touch the same warp in one cycle the LAST matching
    // section wins. In particular the "stall on issue" and "advance PC"
    // sections at the bottom override everything above them.
    always @(*) begin
        active_warps_n  = active_warps;
        stalled_warps_n = stalled_warps;
        thread_masks_n  = thread_masks;
        barrier_masks_n = barrier_masks;
        barrier_stalls_n= barrier_stalls;
        warp_pcs_n      = warp_pcs;

        // wspawn handling
        // Activates every warp in wmask with thread 0 enabled at wspawn.pc.
        // Only bit 0 of the thread mask is set; the other bits are left as is.
        if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
            active_warps_n |= warp_ctl_if.wspawn.wmask;
            for (integer i = 0; i < `NUM_WARPS; ++i) begin
                if (warp_ctl_if.wspawn.wmask[i]) begin
                    thread_masks_n[i][0] = 1;
                    warp_pcs_n[i] = warp_ctl_if.wspawn.pc;
                end
            end
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end

        // TMC handling
        // A zero thread mask deactivates the warp.
        if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
            active_warps_n[warp_ctl_if.wid]  = (warp_ctl_if.tmc.tmask != 0);
            thread_masks_n[warp_ctl_if.wid]  = warp_ctl_if.tmc.tmask;
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end

        // split handling
        // The thread mask only changes when the split actually diverges.
        if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
            if (warp_ctl_if.split.is_dvg) begin
                thread_masks_n[warp_ctl_if.wid] = warp_ctl_if.split.then_tmask;
            end
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end

        // join handling
        // join_* come from VX_split_join; the PC is redirected only for the
        // "else" side of a divergent join.
        if (join_valid) begin
            if (join_is_dvg) begin
                if (join_is_else) begin
                    warp_pcs_n[join_wid] = join_pc;
                end
                thread_masks_n[join_wid] = join_tmask;
            end
            stalled_warps_n[join_wid] = 0; // unlock warp
        end

        // barrier handling
    `ifdef GBAR_ENABLE
        // Mask of the addressed barrier with the requesting warp included;
        // compared against active_warps in the sequential block below.
        curr_barrier_mask_n = curr_barrier_mask;
        curr_barrier_mask_n[warp_ctl_if.wid] = 1;
    `endif
        if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
            if (~warp_ctl_if.barrier.is_global
             && (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
                // Local barrier and the number of warps already waiting equals
                // size_m1: clear the barrier and release every warp recorded
                // in its mask. The requesting warp is not added to the mask.
                barrier_masks_n[warp_ctl_if.barrier.id] = '0;
                barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
            end else begin
                // Otherwise (including every global barrier request) record
                // the warp in the barrier mask and hold it.
                barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
                barrier_stalls_n[warp_ctl_if.wid] = 1;
            end
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end
    `ifdef GBAR_ENABLE
        // Global barrier response for the id of the last request issued.
        if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
            barrier_masks_n[gbar_bus_if.rsp_id] = '0;
            barrier_stalls_n = '0; // unlock all warps
        end
    `endif

        // Branch handling
        // A taken branch redirects the warp PC; any resolved branch unlocks
        // the warp.
        for (integer i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
            if (branch_valid[i]) begin
                if (branch_taken[i]) begin
                    warp_pcs_n[branch_wid[i]] = branch_dest[i];
                end
                stalled_warps_n[branch_wid[i]] = 0; // unlock warp
            end
        end

        // decode unlock
        // Decode reports that the instruction does not require a warp stall.
        if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
            stalled_warps_n[decode_sched_if.wid] = 0;
        end

        // CSR unlock
        if (sched_csr_if.unlock_warp) begin
            stalled_warps_n[sched_csr_if.unlock_wid] = 0;
        end

        // stall the warp until decode stage
        // Placed after all unlock sections so that a warp issued this cycle
        // always ends the cycle stalled.
        if (schedule_fire) begin
            stalled_warps_n[schedule_wid] = 1;
        end

        // advance PC
        // Uses the buffer OUTPUT handshake and the buffered PC/wid. This is the
        // last write to warp_pcs_n and therefore overrides earlier sections
        // for the same warp in the same cycle.
        if (schedule_if_fire) begin
            warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + 4;
        end
    end

    // Only startup_addr is consumed from base_dcrs.
    `UNUSED_VAR (base_dcrs)

    // State registers, counters and the global-barrier request register.
    always @(posedge clk) begin
        if (reset) begin
            barrier_masks   <= '0;
        `ifdef GBAR_ENABLE
            gbar_req_valid  <= 0;
        `endif
            stalled_warps   <= '0;
            warp_pcs        <= '0;
            active_warps    <= '0;
            thread_masks    <= '0;
            barrier_stalls  <= '0;
            issued_instrs   <= '0;
            cycles          <= '0;

            // activate first warp
            // These later non-blocking assignments override the '0 resets
            // above for warp 0 / thread 0.
            warp_pcs[0]     <= base_dcrs.startup_addr;
            active_warps[0] <= 1;
            thread_masks[0][0] <= 1;
        end else begin
            active_warps   <= active_warps_n;
            stalled_warps  <= stalled_warps_n;
            thread_masks   <= thread_masks_n;
            warp_pcs       <= warp_pcs_n;
            barrier_masks  <= barrier_masks_n;
            barrier_stalls <= barrier_stalls_n;

            // global barrier scheduling
        `ifdef GBAR_ENABLE
            // Raise a request once the barrier mask, including the requesting
            // warp, equals the set of active warps.
            if (warp_ctl_if.valid && warp_ctl_if.barrier.valid
             && warp_ctl_if.barrier.is_global
             && (curr_barrier_mask_n == active_warps)) begin
                gbar_req_valid <= 1;
                gbar_req_id    <= warp_ctl_if.barrier.id;
                gbar_req_size_m1 <= warp_ctl_if.barrier.size_m1[`NC_WIDTH-1:0];
            end
            // Drop the request when it is accepted; being the later
            // assignment, this wins if both conditions hold in one cycle.
            if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
                gbar_req_valid <= 0;
            end
        `endif

            // Per-warp count of instructions handed to schedule_if.
            if (schedule_if_fire) begin
                issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
            end

            // Cycle counter advances only while the scheduler reports busy.
            if (busy) begin
                cycles <= cycles + 1;
            end
        end
    end

    // barrier handling

`ifdef GBAR_ENABLE
    assign gbar_bus_if.req_valid   = gbar_req_valid;
    assign gbar_bus_if.req_id      = gbar_req_id;
    assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
    assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif

    // split/join handling

    `RESET_RELAY (split_join_reset, reset);

    VX_split_join #(
        .CORE_ID (CORE_ID)
    ) split_join (
        .clk          (clk),
        .reset        (split_join_reset),
        .valid        (warp_ctl_if.valid),
        .wid          (warp_ctl_if.wid),
        .split        (warp_ctl_if.split),
        .sjoin        (warp_ctl_if.sjoin),
        .join_valid   (join_valid),
        .join_is_dvg  (join_is_dvg),
        .join_is_else (join_is_else),
        .join_wid     (join_wid),
        .join_tmask   (join_tmask),
        .join_pc      (join_pc)
    );

    // schedule the next ready warp

    // A warp is ready when it is active and neither stalled nor held at a
    // barrier.
    wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);

    // Picks one index out of ready_warps; schedule_valid tells whether any
    // warp is ready. The selection order is defined by VX_lzc (REVERSE=1) and
    // is not visible in this file.
    VX_lzc #(
        .N       (`NUM_WARPS),
        .REVERSE (1)
    ) wid_select (
        .data_in   (ready_warps),
        .data_out  (schedule_wid),
        .valid_out (schedule_valid)
    );

    // Per-warp {thread mask, PC} packed side by side so a single indexed read
    // yields both fields of the selected warp.
    wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data;
    for (genvar i = 0; i < `NUM_WARPS; ++i) begin
        assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
    end

    // The selected entry is unpacked as a whole. This replaces a concatenation
    // of its top 4 bits and its remaining low bits (hard-coded -4 / -5
    // offsets), which is bit-for-bit the same vector.
    assign {schedule_tmask, schedule_pc} = schedule_data[schedule_wid];

`ifndef NDEBUG
    // Debug builds tag instructions with a UUID obtained from dpi_uuid_gen(),
    // keyed by a wid built from CORE_ID and the local warp id, plus the PC.
    localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
    reg [`UUID_WIDTH-1:0] instr_uuid;
    wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
    always @(posedge clk) begin
        if (reset) begin
            instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
        end else if (schedule_fire) begin
            instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
        end
    end
`else
    wire [`UUID_WIDTH-1:0] instr_uuid = '0;
`endif

    // Output buffer between warp selection and schedule_if.
    VX_elastic_buffer #(
        .DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH)
    ) out_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (schedule_valid),
        .ready_in  (schedule_ready),
        .data_in   ({schedule_tmask, schedule_pc, schedule_wid}),
        .data_out  ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
        .valid_out (schedule_if.valid),
        .ready_out (schedule_if.ready)
    );

    // The UUID is driven directly from the register above; it does not travel
    // through out_buf with the rest of the payload.
    assign schedule_if.data.uuid = instr_uuid;

    `RESET_RELAY (pending_instr_reset, reset);

    // Tracks instructions issued on schedule_if and not yet committed.
    wire no_pending_instr;
    VX_pending_instr #(
        .CTR_WIDTH  (12),
        .DECR_COUNT (`ISSUE_WIDTH),
        .ALM_EMPTY  (1)
    ) pending_instr(
        .clk       (clk),
        .reset     (pending_instr_reset),
        .incr      (schedule_if_fire),
        .incr_wid  (schedule_if.data.wid),
        .decr      (commit_sched_if.committed),
        .decr_wid  (commit_sched_if.committed_wid),
        .alm_empty_wid (sched_csr_if.alm_empty_wid),
        .alm_empty (sched_csr_if.alm_empty),
        .empty     (no_pending_instr)
    );

    `BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1);

    // export CSRs
    assign sched_csr_if.cycles       = cycles;
    assign sched_csr_if.active_warps = active_warps;
    assign sched_csr_if.thread_masks = thread_masks;

    // timeout handling
    // Once decode has reported at least one instruction that does not stall
    // its warp (timeout_enable), count consecutive cycles in which some warp
    // is active and stalled_warps equals active_warps. The counter clears as
    // soon as that condition no longer holds. The assertion below checks the
    // counter against `STALL_TIMEOUT.
    reg [31:0] timeout_ctr;
    reg timeout_enable;
    always @(posedge clk) begin
        if (reset) begin
            timeout_ctr    <= '0;
            timeout_enable <= 0;
        end else begin
            if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
                timeout_enable <= 1;
            end
            if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin
                timeout_ctr <= timeout_ctr + 1;
            end else if (active_warps == 0 || active_warps != stalled_warps) begin
                timeout_ctr <= '0;
            end
        end
    end
    `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));

endmodule