// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

`include "VX_define.vh"
`include "VX_trace.vh"

// VX_dispatch
//
// Routes each issue slot's operands packet to exactly one per-unit elastic
// buffer, selected by the packet's ex_type and by whether the issuing warp is
// a "tensor" warp (warp id >= `NUM_SCALAR_WARPS) or a scalar warp.
//
// Parameters:
//   CORE_ID - core index; only used in assertion/trace messages.
//   DOMAIN  - not used by this module (marked with `UNUSED_PARAM).
//
// Ports:
//   clk, reset           - clock and synchronous reset.
//   perf_*               - (PERF_ENABLE) per-execution-unit stall/valid/fire
//                          counters and a count of cycles with any fire.
//   operands_if          - incoming operands, one interface per issue slot.
//   *_dispatch_if        - outgoing buffered dispatch streams, one per unit
//                          and issue slot.
module VX_dispatch import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0,
    parameter DOMAIN  = WU_DOMAIN_SCALAR
) (
    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
    output wire [`PERF_CTR_BITS-1:0] perf_valids [`NUM_EX_UNITS],
    output wire [`PERF_CTR_BITS-1:0] perf_fires [`NUM_EX_UNITS],
    output wire [`PERF_CTR_BITS-1:0] perf_any_fire_cycles,
`endif
    // inputs
    VX_operands_if.slave    operands_if [`ISSUE_WIDTH],

    // outputs
    VX_dispatch_if.master   alu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.master   lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
    VX_dispatch_if.master   fpu_dispatch_if [`ISSUE_WIDTH],
`endif
`ifdef EXT_T_ENABLE
    VX_dispatch_if.master   tensor_alu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.master   tensor_lsu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.master   tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.master   tensor_dispatch_if [`ISSUE_WIDTH],
`endif
    VX_dispatch_if.master   sfu_dispatch_if [`ISSUE_WIDTH]
);
    `UNUSED_PARAM (CORE_ID)
    `UNUSED_PARAM (DOMAIN)

    // Width of the payload pushed through every dispatch buffer.
    localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;

    wire [`ISSUE_WIDTH-1:0][`NT_WIDTH-1:0] last_active_tid;

    // Constant table: tids[i] == i, used as the data input of the selector below.
    wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        assign tids[i] = `NT_WIDTH'(i);
    end

    // Per issue slot, pick a thread id from the thread mask using
    // VX_find_first with REVERSE=1.
    // NOTE(review): presumably this yields the highest-numbered active thread;
    // confirm against VX_find_first.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        VX_find_first #(
            .N       (`NUM_THREADS),
            .DATAW   (`NT_WIDTH),
            .REVERSE (1)
        ) last_tid_select (
            .valid_in (operands_if[i].data.tmask),
            .data_in  (tids),
            .data_out (last_active_tid[i]),
            `UNUSED_PIN (valid_out)
        );
    end

    // ALU dispatch

    VX_operands_if alu_operands_if[`ISSUE_WIDTH]();

    wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] operands_wid;
    wire [`ISSUE_WIDTH-1:0] operands_is_tensor;
    wire [`ISSUE_WIDTH-1:0] tensor_alu_allowed;
    wire [`ISSUE_WIDTH-1:0] tensor_ctrl_allowed;
    wire [`ISSUE_WIDTH-1:0] tensor_wctl_allowed;
    wire [`ISSUE_WIDTH-1:0] tensor_sfu_allowed;

    // Per-slot classification of the incoming instruction.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign operands_wid[i] = wis_to_wid(operands_if[i].data.wis, ISSUE_ISW_W'(i));
        // A warp is treated as a tensor warp when its id is at or above NUM_SCALAR_WARPS.
        assign operands_is_tensor[i] = operands_wid[i] >= `NW_WIDTH'(`NUM_SCALAR_WARPS);
        // Tensor warps may issue ALU ops except those flagged as M or RED by op_mod.
        assign tensor_alu_allowed[i] =
            !`INST_ALU_IS_M(operands_if[i].data.op_mod)
         && !`INST_ALU_IS_RED(operands_if[i].data.op_mod);
        // SFU op types a tensor warp may issue.
        assign tensor_ctrl_allowed[i] =
            (operands_if[i].data.op_type == `INST_SFU_TMC)
         || (operands_if[i].data.op_type == `INST_SFU_CSRRS)
         || (operands_if[i].data.op_type == `INST_SFU_BAR)
         || (operands_if[i].data.op_type == `INST_SFU_BAR_MASK);
        // Barrier op types; note these are a subset of tensor_ctrl_allowed.
        assign tensor_wctl_allowed[i] =
            (operands_if[i].data.op_type == `INST_SFU_BAR)
         || (operands_if[i].data.op_type == `INST_SFU_BAR_MASK);
        assign tensor_sfu_allowed[i] = tensor_ctrl_allowed[i] || tensor_wctl_allowed[i];
    end

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign alu_operands_if[i].valid = operands_if[i].valid
                                       && (operands_if[i].data.ex_type == `EX_ALU)
                                       && !operands_is_tensor[i];
        assign alu_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (alu_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) alu_buffer (
            .clk       (clk),
            .reset     (alu_reset),
            .valid_in  (alu_operands_if[i].valid),
            .ready_in  (alu_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(alu_operands_if[i].data, last_active_tid[i])),
            .data_out  (alu_dispatch_if[i].data),
            .valid_out (alu_dispatch_if[i].valid),
            .ready_out (alu_dispatch_if[i].ready)
        );
    end

`ifdef EXT_T_ENABLE
    // Tensor INT/control dispatch

    VX_operands_if tensor_alu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign tensor_alu_operands_if[i].valid = operands_if[i].valid
                                              && (operands_if[i].data.ex_type == `EX_ALU)
                                              && operands_is_tensor[i]
                                              && tensor_alu_allowed[i];
        assign tensor_alu_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (tensor_alu_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) tensor_alu_buffer (
            .clk       (clk),
            .reset     (tensor_alu_reset),
            .valid_in  (tensor_alu_operands_if[i].valid),
            .ready_in  (tensor_alu_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(tensor_alu_operands_if[i].data, last_active_tid[i])),
            .data_out  (tensor_alu_dispatch_if[i].data),
            .valid_out (tensor_alu_dispatch_if[i].valid),
            .ready_out (tensor_alu_dispatch_if[i].ready)
        );
    end
`endif

    // LSU dispatch

    VX_operands_if lsu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign lsu_operands_if[i].valid = operands_if[i].valid
                                       && (operands_if[i].data.ex_type == `EX_LSU)
                                       && !operands_is_tensor[i];
        assign lsu_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (lsu_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) lsu_buffer (
            .clk       (clk),
            .reset     (lsu_reset),
            .valid_in  (lsu_operands_if[i].valid),
            .ready_in  (lsu_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(lsu_operands_if[i].data, last_active_tid[i])),
            .data_out  (lsu_dispatch_if[i].data),
            .valid_out (lsu_dispatch_if[i].valid),
            .ready_out (lsu_dispatch_if[i].ready)
        );
    end

`ifdef EXT_T_ENABLE
    // Tensor LSU dispatch

    VX_operands_if tensor_lsu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign tensor_lsu_operands_if[i].valid = operands_if[i].valid
                                              && (operands_if[i].data.ex_type == `EX_LSU)
                                              && operands_is_tensor[i];
        assign tensor_lsu_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (tensor_lsu_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) tensor_lsu_buffer (
            .clk       (clk),
            .reset     (tensor_lsu_reset),
            .valid_in  (tensor_lsu_operands_if[i].valid),
            .ready_in  (tensor_lsu_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(tensor_lsu_operands_if[i].data, last_active_tid[i])),
            .data_out  (tensor_lsu_dispatch_if[i].data),
            .valid_out (tensor_lsu_dispatch_if[i].valid),
            .ready_out (tensor_lsu_dispatch_if[i].ready)
        );
    end
`endif

    // FPU dispatch

`ifdef EXT_F_ENABLE
    VX_operands_if fpu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign fpu_operands_if[i].valid = operands_if[i].valid
                                       && (operands_if[i].data.ex_type == `EX_FPU)
                                       && !operands_is_tensor[i];
        assign fpu_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (fpu_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) fpu_buffer (
            .clk       (clk),
            .reset     (fpu_reset),
            .valid_in  (fpu_operands_if[i].valid),
            .ready_in  (fpu_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(fpu_operands_if[i].data, last_active_tid[i])),
            .data_out  (fpu_dispatch_if[i].data),
            .valid_out (fpu_dispatch_if[i].valid),
            .ready_out (fpu_dispatch_if[i].ready)
        );
    end
`endif

    // Tensor Core dispatch

`ifdef EXT_T_ENABLE
    VX_operands_if tensor_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign tensor_operands_if[i].valid = operands_if[i].valid
                                          && (operands_if[i].data.ex_type == `EX_TENSOR)
                                          && operands_is_tensor[i];
        assign tensor_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (tensor_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) tensor_buffer (
            .clk       (clk),
            .reset     (tensor_reset),
            .valid_in  (tensor_operands_if[i].valid),
            .ready_in  (tensor_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(tensor_operands_if[i].data, last_active_tid[i])),
            .data_out  (tensor_dispatch_if[i].data),
            .valid_out (tensor_dispatch_if[i].valid),
            .ready_out (tensor_dispatch_if[i].ready)
        );
    end
`endif

`ifdef EXT_T_ENABLE
    // Tensor control dispatch

    VX_operands_if tensor_ctrl_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign tensor_ctrl_operands_if[i].valid = operands_if[i].valid
                                               && (operands_if[i].data.ex_type == `EX_SFU)
                                               && operands_is_tensor[i]
                                               && tensor_ctrl_allowed[i];
        assign tensor_ctrl_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (tensor_ctrl_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) tensor_ctrl_buffer (
            .clk       (clk),
            .reset     (tensor_ctrl_reset),
            .valid_in  (tensor_ctrl_operands_if[i].valid),
            .ready_in  (tensor_ctrl_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(tensor_ctrl_operands_if[i].data, last_active_tid[i])),
            .data_out  (tensor_ctrl_dispatch_if[i].data),
            .valid_out (tensor_ctrl_dispatch_if[i].valid),
            .ready_out (tensor_ctrl_dispatch_if[i].ready)
        );
    end
`endif

    // SFU dispatch

    VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign sfu_operands_if[i].valid = operands_if[i].valid
                                       && (operands_if[i].data.ex_type == `EX_SFU)
                                       && !operands_is_tensor[i];
        assign sfu_operands_if[i].data = operands_if[i].data;

        `RESET_RELAY (sfu_reset, reset);

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (2),
            .OUT_REG (2)
        ) sfu_buffer (
            .clk       (clk),
            .reset     (sfu_reset),
            .valid_in  (sfu_operands_if[i].valid),
            .ready_in  (sfu_operands_if[i].ready),
            .data_in   (`TO_DISPATCH_DATA(sfu_operands_if[i].data, last_active_tid[i])),
            .data_out  (sfu_dispatch_if[i].data),
            .valid_out (sfu_dispatch_if[i].valid),
            .ready_out (sfu_dispatch_if[i].ready)
        );
    end

    // can take next request?
    // Each term mirrors the valid condition of the matching buffer above, so
    // ready is taken from the one buffer the instruction is routed to.
    // An instruction that matches none of the terms (e.g. EX_FPU from a tensor
    // warp, or EX_TENSOR from a scalar warp) never sees ready asserted here;
    // those combinations are covered by the SIMULATION assertions below.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign operands_if[i].ready =
            (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && !operands_is_tensor[i])
         || (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && !operands_is_tensor[i])
    `ifdef EXT_F_ENABLE
         || (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU) && !operands_is_tensor[i])
    `endif
    `ifdef EXT_T_ENABLE
         || (tensor_alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && operands_is_tensor[i] && tensor_alu_allowed[i])
         || (tensor_lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && operands_is_tensor[i])
         || (tensor_ctrl_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && operands_is_tensor[i] && tensor_ctrl_allowed[i])
         || (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR) && operands_is_tensor[i])
    `endif
         || (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && !operands_is_tensor[i]);
    end

`ifdef SIMULATION
    // Runtime checks for instruction/warp-kind combinations that have no
    // dispatch route in this module.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        // EX_TENSOR is only legal from a tensor warp.
        `RUNTIME_ASSERT(
            !(operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR)) || operands_is_tensor[i],
            ("%t: *** core%0d-dispatch-illegal-scalar-tensor-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
                $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
        )
        // Tensor warps must not issue EX_FPU.
        `RUNTIME_ASSERT(
            !(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_FPU)),
            ("%t: *** core%0d-dispatch-illegal-tensor-fpu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
                $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
        )
        // Tensor warps may only issue the allowed subset of SFU op types.
        `RUNTIME_ASSERT(
            !(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_SFU) && !tensor_sfu_allowed[i]),
            ("%t: *** core%0d-dispatch-illegal-tensor-sfu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
                $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
        )
        // Tensor warps must not issue ALU ops flagged as M or RED.
        `RUNTIME_ASSERT(
            !(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_ALU)
              && (`INST_ALU_IS_M(operands_if[i].data.op_mod) || `INST_ALU_IS_RED(operands_if[i].data.op_mod))),
            ("%t: *** core%0d-dispatch-illegal-tensor-complex-alu-op: wid=%0d PC=0x%0h op=0x%0h mod=0x%0h (#%0d)",
                $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.uuid)
        )
    end
`endif

`ifdef PERF_ENABLE
    wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle_r;
    wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle_r;
    wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_fires_per_cycle_r;
    reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle;
    reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle;
    reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_fires_per_cycle;
    reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
    reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_valids_per_cycle;
    reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_fires_per_cycle;
    reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
    reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_valids_r;
    reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_fires_r;
    reg [`PERF_CTR_BITS-1:0] perf_any_fire_cycles_r;

    // Per issue slot: one-hot (indexed by ex_type) stall / valid / fire flags
    // for the current cycle.
    for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
        always @(*) begin
            perf_issue_unit_stalls_per_cycle[i] = '0;
            perf_issue_unit_valids_per_cycle[i] = '0;
            perf_issue_unit_fires_per_cycle[i] = '0;
            if (operands_if[i].valid && ~operands_if[i].ready) begin
                perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
            end
            if (operands_if[i].valid) begin
                perf_issue_unit_valids_per_cycle[i][operands_if[i].data.ex_type] = 1;
            end
            if (operands_if[i].valid && operands_if[i].ready) begin
                perf_issue_unit_fires_per_cycle[i][operands_if[i].data.ex_type] = 1;
            end
        end
    end

    // Per execution unit: sum the flags across all issue slots.
    for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
        always @(*) begin
            perf_unit_stalls_per_cycle[i] = '0;
            perf_unit_valids_per_cycle[i] = '0;
            perf_unit_fires_per_cycle[i] = '0;
            for (integer isw = 0; isw < `ISSUE_WIDTH; ++isw) begin
                perf_unit_stalls_per_cycle[i] = perf_unit_stalls_per_cycle[i] + perf_issue_unit_stalls_per_cycle[isw][i];
                perf_unit_valids_per_cycle[i] = perf_unit_valids_per_cycle[i] + perf_issue_unit_valids_per_cycle[isw][i];
                perf_unit_fires_per_cycle[i] = perf_unit_fires_per_cycle[i] + perf_issue_unit_fires_per_cycle[isw][i];
            end
        end
    end

    // VX_reduce #(
    //     .DATAW_IN (`NUM_EX_UNITS),
    //     .N        (`ISSUE_WIDTH),
    //     .OP       ("|")
    // ) reduce (
    //     .data_in  (perf_issue_unit_stalls_per_cycle),
    //     .data_out (perf_unit_stalls_per_cycle)
    // );

    `BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
    `BUFFER(perf_unit_valids_per_cycle_r, perf_unit_valids_per_cycle);
    `BUFFER(perf_unit_fires_per_cycle_r, perf_unit_fires_per_cycle);

    // High when at least one execution unit recorded a fire in the buffered
    // per-cycle counts.
    reg perf_any_fire_per_cycle;
    always @(*) begin
        perf_any_fire_per_cycle = 1'b0;
        for (integer i = 0; i < `NUM_EX_UNITS; ++i) begin
            if (perf_unit_fires_per_cycle_r[i] != '0) begin
                perf_any_fire_per_cycle = 1'b1;
            end
        end
    end

    // Per-unit accumulators.
    for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
        always @(posedge clk) begin
            if (reset) begin
                perf_stalls_r[i] <= '0;
                perf_valids_r[i] <= '0;
                perf_fires_r[i] <= '0;
            end else begin
                perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
                perf_valids_r[i] <= perf_valids_r[i] + `PERF_CTR_BITS'(perf_unit_valids_per_cycle_r[i]);
                perf_fires_r[i] <= perf_fires_r[i] + `PERF_CTR_BITS'(perf_unit_fires_per_cycle_r[i]);
            end
        end
    end

    // perf_any_fire_cycles_r is a single shared counter, so it is updated in
    // exactly one always block. Keeping it inside the per-unit generate loop
    // would replicate the assignment once per execution unit and leave the
    // register driven from multiple always blocks.
    always @(posedge clk) begin
        if (reset) begin
            perf_any_fire_cycles_r <= '0;
        end else begin
            perf_any_fire_cycles_r <= perf_any_fire_cycles_r + `PERF_CTR_BITS'(perf_any_fire_per_cycle);
        end
    end

    for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
        assign perf_stalls[i] = perf_stalls_r[i];
        assign perf_valids[i] = perf_valids_r[i];
        assign perf_fires[i] = perf_fires_r[i];
    end
    assign perf_any_fire_cycles = perf_any_fire_cycles_r;
`endif

`ifdef DBG_TRACE_CORE_PIPELINE_VCS
    for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
        always @(posedge clk) begin
            if (!reset && ($time > `TRACE_STARTTIME)) begin
                // Targeted trace of the ALU dispatch buffer output for core 0
                // at two specific PCs.
                if ((CORE_ID == 0) && alu_dispatch_if[i].valid && ((alu_dispatch_if[i].data.PC == 32'h80000010) || (alu_dispatch_if[i].data.PC == 32'h80000014))) begin
                    `TRACE(1, ("%d: core%0d-alu-dispatch-buffer: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, wb=%0d, rd=%0d (#%0d)\n",
                        $time, CORE_ID, i, alu_dispatch_if[i].valid, alu_dispatch_if[i].ready, wis_to_wid(alu_dispatch_if[i].data.wis, i), alu_dispatch_if[i].data.PC, alu_dispatch_if[i].data.op_type, alu_dispatch_if[i].data.op_mod, alu_dispatch_if[i].data.wb, alu_dispatch_if[i].data.rd, alu_dispatch_if[i].data.uuid));
                end
                // Trace every accepted (valid && ready) operands packet.
                if (operands_if[i].valid && operands_if[i].ready) begin
                    `TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
                    trace_ex_type(1, operands_if[i].data.ex_type);
                    `TRACE(1, (", op="));
                    trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.rd, '0, operands_if[i].data.use_imm, operands_if[i].data.imm);
                    `TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
                    `TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
                    `TRACE(1, (", rs2_data="));
                    `TRACE_ARRAY1D(1, operands_if[i].data.rs2_data, `NUM_THREADS);
                    `TRACE(1, (", rs3_data="));
                    `TRACE_ARRAY1D(1, operands_if[i].data.rs3_data, `NUM_THREADS);
                    `TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
                end
            end
        end
    end
`endif

endmodule