From a02773eb922c02fe516cf96756bd5a2b18b58149 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 21:55:42 -0700 Subject: [PATCH] Add more efficient dispatch_unit Instead of having a single candidate to be considered for dispatch (designated by 'batch_idx' counter), add a dispatch_unit variant that considers all `ISSUE_WIDTH dispatch signals and picks a valid one in a round-robin manner. This increases core utilization significantly due to better overlapping of smem/tensor ops. --- hw/rtl/core/VX_dispatch_unit_sane.sv | 163 +++++++++++++++++++++++++++ hw/rtl/core/VX_lsu_unit.sv | 2 +- hw/rtl/core/VX_tensor_core.sv | 5 +- 3 files changed, 166 insertions(+), 4 deletions(-) create mode 100644 hw/rtl/core/VX_dispatch_unit_sane.sv diff --git a/hw/rtl/core/VX_dispatch_unit_sane.sv b/hw/rtl/core/VX_dispatch_unit_sane.sv new file mode 100644 index 00000000..26d2800b --- /dev/null +++ b/hw/rtl/core/VX_dispatch_unit_sane.sv @@ -0,0 +1,163 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+`include "VX_define.vh"
+// VX_dispatch_unit_sane: forwards one of the `ISSUE_WIDTH dispatch inputs, selected by VX_lzc_rr, to execute_if through an elastic buffer.
+module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(
+    parameter BLOCK_SIZE = 1,           // number of execute_if outputs; asserted to be 1 further down
+    parameter NUM_LANES  = 1,           // lanes per output; `NUM_THREADS must be a multiple of it
+    parameter OUT_REG    = 0,           // mapped to buf_out's SIZE / OUT_REG through the OUT_REG_TO_EB_* macros
+    parameter MAX_FANOUT = `MAX_FANOUT  // only feeds FANOUT_ENABLE
+) (
+    input  wire             clk,
+    input  wire             reset,
+
+    // inputs
+    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],
+
+    // outputs
+    VX_execute_if.master    execute_if [BLOCK_SIZE]
+
+);
+    `STATIC_ASSERT ((`NUM_THREADS == NUM_LANES * (`NUM_THREADS / NUM_LANES)), ("invalid parameter")) // `NUM_THREADS must divide evenly by NUM_LANES
+    localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);           // width used when casting block_idx into isw
+    localparam NUM_PACKETS  = `NUM_THREADS / NUM_LANES;      // NUM_LANES-wide slices needed to cover `NUM_THREADS
+    localparam PID_BITS     = `CLOG2(NUM_PACKETS);
+    localparam PID_WIDTH    = `UP(PID_BITS);                 // width of the pid field appended to the output data
+    localparam BATCH_COUNT  = `ISSUE_WIDTH / BLOCK_SIZE;     // number of BLOCK_SIZE-wide groups of dispatch inputs
+    localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);          // width of batch_idx
+    localparam ISSUE_W      = `LOG2UP(`ISSUE_WIDTH);         // width of an index into the dispatch inputs
+    localparam IN_DATAW     = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN); // width of one dispatch_data entry
+    localparam OUT_DATAW    = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1; // width of buf_out data: wid replaces wis, lane-wide tmask/regs, plus pid, sop, eop
+    localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT/2)); // NOTE(review): not referenced anywhere else in this module
+    // dispatch_data layout, MSB first: `UUID_WIDTH-bit field, ISSUE_WIS_W-bit wis, `NUM_THREADS-bit tmask, remaining instruction fields, three register operand vectors.
+    localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS); // LSB position of the thread mask
+    localparam DATA_REGS_OFF = 0;                            // register operands occupy the lowest bits
+
+    wire [`ISSUE_WIDTH-1:0] dispatch_valid;                  // flattened copies of the dispatch_if handshake and payload
+    wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
+    wire [`ISSUE_WIDTH-1:0] dispatch_ready;
+
+    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin          // unpack the interface array into plain vectors
+        assign dispatch_valid[i] = dispatch_if[i].valid;
+        assign dispatch_data[i]  = dispatch_if[i].data;
+        assign dispatch_if[i].ready = dispatch_ready[i];
+    end
+
+    wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;        // dispatch input index currently served by each block
+    wire [BLOCK_SIZE-1:0] block_ready;                       // ready_in of each block's buf_out
+    wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;        // NOTE(review): NUM_LANES wide but assigned a `NUM_THREADS-wide slice below; widths match only if NUM_LANES == `NUM_THREADS - confirm
+    wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs; // NOTE(review): same NUM_LANES vs `NUM_THREADS width question as block_tmask
+    wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0] block_pid;          // tied to 0 below
+    wire [BLOCK_SIZE-1:0] block_sop;                         // tied to 1 below
+    wire [BLOCK_SIZE-1:0] block_eop;                         // tied to 1 below
+    wire [BLOCK_SIZE-1:0] block_done;                        // per block: selected input not valid, or buffer ready
+
+    wire batch_done = (& block_done);                        // NOTE(review): only referenced by the commented-out code below
+
+    logic [BATCH_COUNT_W-1:0] batch_idx;                     // driven by batch_select below; the commented-out block is the disabled counter-based update
+    // if (BATCH_COUNT != 1) begin
+    //     always @(posedge clk) begin
+    //         if (reset) begin
+    //             batch_idx <= '0;
+    //         end else begin
+    //             batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
+    //         end
+    //     end
+    // end else begin
+    //     assign batch_idx = 0;
+    //     `UNUSED_VAR(batch_done)
+    // end
+
+    wire dispatch_any_valid;                                 // NOTE(review): driven by batch_select but never read in this module
+    VX_lzc_rr #(                                             // chooses batch_idx from dispatch_valid; presumably round-robin among asserted bits - confirm against VX_lzc_rr
+        .N (`ISSUE_WIDTH)
+    ) batch_select (
+        .clk       (clk),
+        .reset     (reset),
+        .data_in   (dispatch_valid),
+        .data_out  (batch_idx),
+        .valid_out (dispatch_any_valid)
+    );
+
+    `STATIC_ASSERT ((BLOCK_SIZE == 1), ("dispatch_unit_sane only supports BLOCK_SIZE == 1 for now"))
+
+    for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin // one output path per block
+
+        wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx); // dispatch input served by this block
+        assign issue_indices[block_idx] = issue_idx;
+
+        wire valid_p, ready_p;                               // handshake into buf_out
+
+        assign valid_p = dispatch_valid[issue_idx];          // NOTE(review): follows batch_idx combinationally; confirm the selection is stable while buf_out is not ready
+        assign block_tmask[block_idx]   = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
+        assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; // operand 0 sits in the highest of the three slots
+        assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+        assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; // operand 2 sits in the lowest slot
+        assign block_pid[block_idx]     = '0;                // pid is constant 0 in this variant
+        assign block_sop[block_idx]     = 1'b1;              // sop is constant 1
+        assign block_eop[block_idx]     = 1'b1;              // eop is constant 1
+        assign block_ready[block_idx]   = ready_p;
+        assign block_done[block_idx]    = ~valid_p || ready_p;
+
+        wire [ISSUE_ISW_W-1:0] isw;                          // second argument to wis_to_wid
+        if (BATCH_COUNT != 1) begin
+            if (BLOCK_SIZE != 1) begin
+                assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
+            end else begin
+                assign isw = batch_idx;                      // the case reached when BLOCK_SIZE == 1 and `ISSUE_WIDTH > 1
+            end
+        end else begin
+            assign isw = block_idx;
+        end
+
+        `RESET_RELAY(buf_out_reset, reset);
+
+        wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); // wis field sits directly above the thread mask
+
+        VX_elastic_buffer #(                                 // carries the selected entry to execute_if[block_idx]
+            .DATAW   (OUT_DATAW),
+            .SIZE    (`OUT_REG_TO_EB_SIZE(OUT_REG)),
+            .OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
+        ) buf_out (
+            .clk       (clk),
+            .reset     (buf_out_reset),
+            .valid_in  (valid_p),
+            .ready_in  (ready_p),
+            .data_in   ({                                    // repacked in OUT_DATAW field order
+                dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W], // bits above the wis field, passed through
+                block_wid,
+                block_tmask[block_idx],
+                dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN], // fields between tmask and the register operands, passed through
+                block_regs[block_idx][0],
+                block_regs[block_idx][1],
+                block_regs[block_idx][2],
+                block_pid[block_idx],
+                block_sop[block_idx],
+                block_eop[block_idx]}),
+            .data_out  (execute_if[block_idx].data),
+            .valid_out (execute_if[block_idx].valid),
+            .ready_out (execute_if[block_idx].ready)
+        );
+    end
+
+    reg [`ISSUE_WIDTH-1:0] ready_in;                         // per-input ready, rebuilt combinationally
+    always @(*) begin
+        ready_in = 0;                                        // inputs that are not selected see ready low
+        for (integer i = 0; i < BLOCK_SIZE; ++i) begin
+            ready_in[issue_indices[i]] = block_ready[i] && block_eop[i]; // block_eop is constant 1 here, so this is block_ready
+        end
+    end
+    assign dispatch_ready = ready_in;
+
+endmodule
diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv
index 63f1d4c6..20fac1d1 100644
--- a/hw/rtl/core/VX_lsu_unit.sv
+++ b/hw/rtl/core/VX_lsu_unit.sv
@@ -49,7 +49,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
 
     `RESET_RELAY (dispatch_reset, reset);
 
-    VX_dispatch_unit #(
+    VX_dispatch_unit_sane #(
         .BLOCK_SIZE (BLOCK_SIZE),
         .NUM_LANES  (NUM_LANES),
         .OUT_REG    (1)
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index a5128272..6c9d9f6b 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -21,7 +21,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 
     `RESET_RELAY (dispatch_reset, reset);
 
-    VX_dispatch_unit #(
+    VX_dispatch_unit_sane #(
         .BLOCK_SIZE (BLOCK_SIZE),
         .NUM_LANES  (NUM_LANES),
         .OUT_REG    (PARTIAL_BW ?
1 : 0) @@ -177,9 +177,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; - wire commit_if_ready_override; - wire operand_enq_fire = operands_valid_synced && execute_if.ready; + wire commit_if_ready_override; wire commit_if_fire = commit_if.valid && commit_if_ready_override; wire [DATAW-1:0] execute_if_data_enq = { execute_if.data.uuid,