From 4dcbc31a88915fff35ccefd00c6e753fa5ef135a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 21:32:20 -0700 Subject: [PATCH] tensor: Separate async commit from tensor commit With this we can prioritize commit of the async hgmma instructions over the "ghost" commits from the TC. --- hw/rtl/core/VX_commit.sv | 9 ++- hw/rtl/core/VX_tensor_hopper_core.sv | 107 +++++++++++++++++++-------- 2 files changed, 83 insertions(+), 33 deletions(-) diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index 9b930818..faca0a2a 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -177,17 +177,22 @@ module VX_commit import VX_gpu_pkg::*; #( // probably want to change this at some point // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline) wire [`ISSUE_WIDTH-1:0] final_hmma; + // if this is a "ghost" commit generated from the tensor core, don't count + // toward committed + wire [`ISSUE_WIDTH-1:0] tensor_ghost; `ifdef EXT_T_ENABLE for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin // if PC is 0, this means it is not final step of a wmma, shouldn't be committed assign final_hmma[i] = (commit_if[i].data.PC != 32'b0); + // handle 'x' with ===. 
FIXME fix uninitialization + assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1); end `else assign final_hmma = '1; + assign tensor_ghost = '0; `endif - - wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma; + wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost); VX_pipe_register #( .DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)), diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index 21fad57c..2b2136b6 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -22,12 +22,13 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( - wb - rd */ - wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; - wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; - wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; - wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; - wire [`NUM_WARPS-1:0] execute_if_data_wb; - wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; + wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; + wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; + wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; + wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type; + wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; + wire [`NUM_WARPS-1:0] execute_if_data_wb; + wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; wire [`NUM_WARPS-1:0] metadata_queue_fulls; wire [`NUM_WARPS-1:0] metadata_queue_emptys; @@ -38,7 +39,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)), ("runtime error: WGMMA execute not supported for warps other than 0!")) - wire metadata_deq; + logic metadata_deq; for (genvar i = 0; i < `NUM_WARPS; i++) begin // Metadata queue for commit_if. 
This simply copies execute_if's @@ -54,7 +55,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // FIXME: commit only warp 0 wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS; VX_fifo_queue #( .DATAW(DATAW), .DEPTH(METADATA_QUEUE_DEPTH) @@ -64,10 +65,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( .push(enq), .pop(deq), .data_in({execute_if.data.uuid, execute_if.data.wid, - execute_if.data.tmask, execute_if.data.PC, + execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd}), .data_out({execute_if_data_uuid[i], execute_if_data_wid[i], - execute_if_data_tmask[i], execute_if_data_PC[i], + execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i], execute_if_data_wb[i], execute_if_data_rd[i]}), .empty(metadata_queue_emptys[i]), `UNUSED_PIN(alm_empty), @@ -81,47 +82,91 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // the commit stage `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) - wire initiate_ready; // FIXME: unused + wire initiate_ready; wire writeback_valid; wire writeback_last; + logic writeback_ready; wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/]; - // dequeue metadata at the last writeback - assign metadata_deq = metadata_valid && writeback_valid && writeback_last; + wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT); + // skip HGMMA_WAIT for kickoff + wire initiate_valid = metadata_valid && not_wait; + + // we're recycling execute_if.op_type as operands_if.op_type which might + // have a different width; let's be safe + `STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS), + ("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS")) VX_tensor_hopper_core #( ) 
tensor_hopper_core ( .clk(clk), .reset(reset), - .initiate_valid(metadata_valid), + .initiate_valid(initiate_valid), .initiate_wid(`NW_WIDTH'(0)/*FIXME*/), .initiate_ready(initiate_ready), .writeback_valid(writeback_valid), `UNUSED_PIN(writeback_wid), .writeback_last(writeback_last), - .writeback_ready(commit_if.ready) + .writeback_ready(writeback_ready) ); wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; - assign commit_if.valid = writeback_valid; - assign commit_if.data.uuid = execute_if_data_uuid[0]; - assign commit_if.data.wid = execute_if_data_wid[0]; - assign commit_if.data.tmask = execute_if_data_tmask[0]; - assign commit_if.data.PC = execute_if_data_PC[0]; - assign commit_if.data.wb = writeback_last; - // custom rd - assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/)); - assign commit_if.data.data = wb_data; - assign commit_if.data.tensor = writeback_last; - assign commit_if.data.pid = 1'b0; - assign commit_if.data.sop = 1'b1; - // eop is deliberately set so that we don't underflow the pending_instr - // buffer in VX_schedule. An instruction is considered committed only - // when the eop bit is set to one (see VX_commit). - assign commit_if.data.eop = writeback_last; + always @(*) begin + metadata_deq = 1'b0; + + // if there's something in the meta queue, give it priority for commit, + // since all HGMMA instructions are asynchronous and should not + // block + if (metadata_valid) begin + // block tensor core writeback + writeback_ready = 1'b0; + + commit_if.valid = metadata_valid; + commit_if.data.uuid = execute_if_data_uuid[0]; + commit_if.data.wid = execute_if_data_wid[0]; + commit_if.data.tmask = execute_if_data_tmask[0]; + commit_if.data.PC = execute_if_data_PC[0]; + commit_if.data.wb = execute_if_data_wb[0]; + commit_if.data.rd = execute_if_data_rd[0]; + commit_if.data.data = wb_data; // FIXME ? 
+ commit_if.data.tensor = 1'b0; + commit_if.data.pid = 1'b0; + commit_if.data.sop = 1'b1; + commit_if.data.eop = 1'b1; + + // block meta queue until tensor core is ready. This will + // effectively stall further issue of async HGMMA when tensor core + // is busy with too many outstanding requests (depth of meta queue). + // be careful to not miss the commit backpressure. + metadata_deq = metadata_valid && commit_if.ready && initiate_ready; + end else begin + // allow tensor core writeback, provided there's no commit + // backpressure + writeback_ready = commit_if.ready; + + commit_if.valid = writeback_valid; + commit_if.data.uuid = '0; + commit_if.data.wid = '0; // FIXME + commit_if.data.tmask = {NUM_LANES{1'b1}}; + commit_if.data.PC = '0; + commit_if.data.wb = writeback_last; + commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/)); + commit_if.data.data = wb_data; + // mark as "ghost" commit. This will prevent this commit from + // decrementing the pending_instr buffer + commit_if.data.tensor = 1'b1; + // eop is deliberately set so that we don't underflow the pending_instr + // buffer in VX_schedule. An instruction is considered committed only + // when the eop bit is set to one (see VX_commit). + // only the last ghost commit has eop set, which will trigger + // scoreboard to clear out the busy bit. + commit_if.data.eop = writeback_last; + end + end + endmodule