+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
257 lines
11 KiB
Systemverilog
257 lines
11 KiB
Systemverilog
// Copyright © 2019-2023
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
`include "VX_define.vh"
|
|
|
|
// Dispatch unit.
//
// Takes `ISSUE_WIDTH dispatch interfaces and drives BLOCK_SIZE execute
// interfaces. The issue slots are grouped into BATCH_COUNT batches of
// BLOCK_SIZE slots; batch_idx selects the batch currently being served and
// advances once every block of the batch reports done.
//
// When `NUM_THREADS differs from NUM_LANES, each dispatched instruction is
// split into NUM_PACKETS packets of NUM_LANES threads. Only packets with at
// least one active thread are sent, lowest packet index first, and each
// packet is tagged with its packet id plus start-of-packet / end-of-packet
// flags.
//
// Parameters:
//   BLOCK_SIZE - number of execute interfaces (issue slots served per batch)
//   NUM_LANES  - threads carried by one output packet
//   OUT_REG    - forwarded to the output elastic buffer size/register macros
//   MAX_FANOUT - threshold used to decide whether the packet selection is
//                registered (see FANOUT_ENABLE)
module VX_dispatch_unit import VX_gpu_pkg::*; #(
    parameter BLOCK_SIZE = 1,
    parameter NUM_LANES  = 1,
    parameter OUT_REG    = 0,
    parameter MAX_FANOUT = `MAX_FANOUT
) (
    input wire              clk,
    input wire              reset,

    // inputs
    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],

    // outputs
    VX_execute_if.master    execute_if [BLOCK_SIZE]

);
    // NUM_THREADS must be an exact multiple of NUM_LANES
    `STATIC_ASSERT ((`NUM_THREADS == NUM_LANES * (`NUM_THREADS / NUM_LANES)), ("invalid parameter"))

    localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
    localparam NUM_PACKETS  = `NUM_THREADS / NUM_LANES;
    localparam PID_BITS     = `CLOG2(NUM_PACKETS);
    localparam PID_WIDTH    = `UP(PID_BITS);
    localparam BATCH_COUNT  = `ISSUE_WIDTH / BLOCK_SIZE;
    localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
    localparam ISSUE_W      = `LOG2UP(`ISSUE_WIDTH);
    localparam IN_DATAW     = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
    localparam OUT_DATAW    = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
    localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT/2));

    // Bit offsets into the packed dispatch data word: the thread mask sits
    // below the top (`UUID_WIDTH + ISSUE_WIS_W) bits, and the three register
    // operand arrays occupy the least significant bits.
    localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
    localparam DATA_REGS_OFF  = 0;

    // Flatten the interface array into plain vectors so that the slots can
    // be indexed with a run-time value (issue_idx).
    wire [`ISSUE_WIDTH-1:0]               dispatch_valid;
    wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
    wire [`ISSUE_WIDTH-1:0]               dispatch_ready;

    for (genvar slot = 0; slot < `ISSUE_WIDTH; ++slot) begin
        assign dispatch_valid[slot]    = dispatch_if[slot].valid;
        assign dispatch_data[slot]     = dispatch_if[slot].data;
        assign dispatch_if[slot].ready = dispatch_ready[slot];
    end

    // Per-block state exported by the generate loop below
    wire [BLOCK_SIZE-1:0][ISSUE_W-1:0]                    issue_indices;
    wire [BLOCK_SIZE-1:0]                                 block_ready;
    wire [BLOCK_SIZE-1:0][NUM_LANES-1:0]                  block_tmask;
    wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0]  block_regs;
    wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0]                  block_pid;
    wire [BLOCK_SIZE-1:0]                                 block_sop;
    wire [BLOCK_SIZE-1:0]                                 block_eop;
    wire [BLOCK_SIZE-1:0]                                 block_done;

    // The current batch is complete once every block reports done
    wire batch_done = (& block_done);

    // Batch selector: a free-running counter stepped on batch_done, or a
    // constant zero when there is only a single batch.
    logic [BATCH_COUNT_W-1:0] batch_idx;
    if (BATCH_COUNT == 1) begin
        assign batch_idx = 0;
        `UNUSED_VAR (batch_done)
    end else begin
        always @(posedge clk) begin
            if (reset) begin
                batch_idx <= '0;
            end else if (batch_done) begin
                batch_idx <= batch_idx + BATCH_COUNT_W'(1);
            end
        end
    end

    for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin

        // Issue slot served by this block within the current batch
        wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
        assign issue_indices[block_idx] = issue_idx;

        // Handshake with the output elastic buffer
        wire valid_p, ready_p;

        if (`NUM_THREADS != NUM_LANES) begin
            // ---- Packetized path: one instruction -> up to NUM_PACKETS packets ----

            reg  [NUM_PACKETS-1:0] sent_mask_p;   // packets already sent
            wire [PID_WIDTH-1:0]   start_p_n, start_p, end_p;
            wire                   dispatch_valid_r;
            reg                    is_first_p;    // next packet sent is the first one

            wire fire_p    = valid_p && ready_p;
            wire is_last_p = (start_p == end_p);
            wire fire_eop  = fire_p && is_last_p;

            // Tracking state is cleared at the end of the batch when batching
            // is in use, otherwise when the last packet fires.
            wire packets_clear = (BATCH_COUNT != 1) ? batch_done : fire_eop;

            always @(posedge clk) begin
                if (reset) begin
                    sent_mask_p <= '0;
                    is_first_p  <= 1;
                end else begin
                    if (packets_clear) begin
                        sent_mask_p <= '0;
                        is_first_p  <= 1;
                    end else if (fire_p) begin
                        sent_mask_p[start_p] <= 1;
                        is_first_p  <= 0;
                    end
                end
            end

            // Fields extracted from the selected dispatch data word
            wire [`NUM_THREADS-1:0]             dispatch_tmask    = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
            wire [`NUM_THREADS-1:0][`XLEN-1:0]  dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
            wire [`NUM_THREADS-1:0][`XLEN-1:0]  dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
            wire [`NUM_THREADS-1:0][`XLEN-1:0]  dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];

            wire [NUM_PACKETS-1:0][NUM_LANES-1:0]                 per_packet_tmask;
            wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
            wire [NUM_PACKETS-1:0]                                packet_valids;
            wire [NUM_PACKETS-1:0][PID_WIDTH-1:0]                 packet_ids;

            // Regroup the per-thread fields by packet; a packet is valid when
            // at least one of its lanes is active.
            for (genvar pkt = 0; pkt < NUM_PACKETS; ++pkt) begin
                for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
                    localparam thread_idx = pkt * NUM_LANES + lane;
                    assign per_packet_tmask[pkt][lane]   = dispatch_tmask[thread_idx];
                    assign per_packet_regs[pkt][0][lane] = dispatch_rs1_data[thread_idx];
                    assign per_packet_regs[pkt][1][lane] = dispatch_rs2_data[thread_idx];
                    assign per_packet_regs[pkt][2][lane] = dispatch_rs3_data[thread_idx];
                end
                assign packet_valids[pkt] = (| per_packet_tmask[pkt]);
                assign packet_ids[pkt]    = PID_WIDTH'(pkt);
            end

            // Next packet to send: first valid packet that was not sent yet
            VX_find_first #(
                .N       (NUM_PACKETS),
                .DATAW   (PID_WIDTH),
                .REVERSE (0)
            ) find_first (
                .valid_in  (packet_valids & ~sent_mask_p),
                .data_in   (packet_ids),
                .data_out  (start_p_n),
                `UNUSED_PIN (valid_out)
            );

            // Final packet of the instruction: searched with REVERSE=1 over
            // all valid packets
            VX_find_first #(
                .N       (NUM_PACKETS),
                .DATAW   (PID_WIDTH),
                .REVERSE (1)
            ) find_last (
                .valid_in  (packet_valids),
                .data_in   (packet_ids),
                .data_out  (end_p),
                `UNUSED_PIN (valid_out)
            );

            // Optional register stage on {valid, packet id}; present only
            // when FANOUT_ENABLE is set (DEPTH = 1), otherwise DEPTH = 0.
            VX_pipe_register #(
                .DATAW  (1 + PID_WIDTH),
                .RESETW (1),
                .DEPTH  (FANOUT_ENABLE ? 1 : 0)
            ) pipe_reg (
                .clk      (clk),
                .reset    (reset || fire_p), // should flush on fire
                .enable   (1'b1),
                .data_in  ({dispatch_valid[issue_idx], start_p_n}),
                .data_out ({dispatch_valid_r, start_p})
            );

            wire [NUM_LANES-1:0]                  tmask_p = per_packet_tmask[start_p];
            wire [2:0][NUM_LANES-1:0][`XLEN-1:0]  regs_p  = per_packet_regs[start_p];

            // With several batches, a block stops sending once every packet
            // bit is set, until the batch-level clear.
            wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p));

            assign valid_p = dispatch_valid_r && block_enable;

            assign block_tmask[block_idx] = tmask_p;
            assign block_regs[block_idx]  = regs_p;
            assign block_pid[block_idx]   = start_p;
            assign block_sop[block_idx]   = is_first_p;
            assign block_eop[block_idx]   = is_last_p;

            if (!FANOUT_ENABLE) begin
                assign block_ready[block_idx] = ready_p && block_enable;
            end else begin
                // with the register stage, readiness is also qualified by
                // the registered valid
                assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
            end

            assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;

        end else begin
            // ---- Pass-through path: the instruction fits in a single packet ----

            assign valid_p = dispatch_valid[issue_idx];

            assign block_tmask[block_idx]   = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
            assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
            assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
            assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];

            assign block_pid[block_idx]   = '0;
            assign block_sop[block_idx]   = 1'b1;
            assign block_eop[block_idx]   = 1'b1;
            assign block_ready[block_idx] = ready_p;
            assign block_done[block_idx]  = ~valid_p || ready_p;
        end

        // Index handed to wis_to_wid(): {batch, block} when both dimensions
        // exist, otherwise whichever of the two is non-trivial.
        wire [ISSUE_IDX_W-1:0] wsi;
        if (BATCH_COUNT != 1 && BLOCK_SIZE != 1) begin
            assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)};
        end else if (BATCH_COUNT != 1) begin
            assign wsi = batch_idx;
        end else begin
            assign wsi = block_idx;
        end

        `RESET_RELAY(buf_out_reset, reset);

        // wis_to_wid() is not defined in this file (presumably it comes from
        // VX_gpu_pkg); it receives the ISSUE_WIS_W-bit field stored just
        // above the thread mask together with wsi.
        wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);

        // Output buffer. The outgoing word mirrors the incoming field order,
        // with the wis field replaced by block_wid, the thread mask and the
        // register operands narrowed to NUM_LANES, and {pid, sop, eop}
        // appended at the bottom.
        VX_elastic_buffer #(
            .DATAW   (OUT_DATAW),
            .SIZE    (`OUT_REG_TO_EB_SIZE(OUT_REG)),
            .OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
        ) buf_out (
            .clk       (clk),
            .reset     (buf_out_reset),
            .valid_in  (valid_p),
            .ready_in  (ready_p),
            .data_in   ({
                dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
                block_wid,
                block_tmask[block_idx],
                dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
                block_regs[block_idx][0],
                block_regs[block_idx][1],
                block_regs[block_idx][2],
                block_pid[block_idx],
                block_sop[block_idx],
                block_eop[block_idx]}),
            .data_out  (execute_if[block_idx].data),
            .valid_out (execute_if[block_idx].valid),
            .ready_out (execute_if[block_idx].ready)
        );
    end

    // An issue slot is released only when its block accepts the end-of-packet
    // beat; slots outside the current batch stay not-ready.
    reg [`ISSUE_WIDTH-1:0] ready_in;
    always @(*) begin
        ready_in = 0;
        for (integer blk = 0; blk < BLOCK_SIZE; ++blk) begin
            ready_in[issue_indices[blk]] = block_ready[blk] && block_eop[blk];
        end
    end
    assign dispatch_ready = ready_in;

endmodule
|