From 42b9d23f832d2192231e66dd55a012193ecdc860 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 17:27:51 -0700 Subject: [PATCH] tensor: Write release logic for hgmma Upon completion of an op, the Hopper tensor core (VX_tensor_hopper_core) sends a "ghost" commit signal down the pipeline with the `wb` and `tensor` bits set in commit_if. The scoreboard receives this signal via writeback_if and resets the inuse_tensor status bit back to zero, which unblocks the HGMMA_WAIT instruction. --- hw/rtl/core/VX_scoreboard.sv | 7 +++- hw/rtl/core/VX_tensor_hopper_core.sv | 56 +++++++++++++--------------- 2 files changed, 30 insertions(+), 33 deletions(-) diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 59886098..2a39c058 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -146,7 +146,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( // have an explicit destination register, use a separate status bit. reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor; - wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) && + wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) && (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA); wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; @@ -213,7 +213,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; `ifdef EXT_T_HOPPER wire hgmma_wait = ibuffer_if[i].valid && - (ibuffer_if[i].data.ex_type == `EX_TENSOR) && + (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) && (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT); wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]); wire operands_ready = (~(| operands_busy)) && hgmma_ready; @@ -250,6 +250,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1; end `ifdef EXT_T_HOPPER + if (writeback_fire && writeback_if[i].data.tensor) begin + 
inuse_tensor[writeback_if[i].data.wis] <= 1'b0; /* release the bit of the warp that committed (writeback_if), not of the instruction currently at the ibuffer head; a same-cycle hgmma_start for that warp is assigned later and so still wins */ + end if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1; end diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index 2ecbea70..dc763d48 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -11,6 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( VX_execute_if.slave execute_if, VX_commit_if.master commit_if ); + localparam NUM_LANES = `NUM_THREADS; localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary /* commit_if.data_t parts that we need to keep around: @@ -21,22 +22,17 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( - wb - rd */ - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; wire operand_enq_fire = execute_if.valid && execute_if.ready; wire commit_if_fire = commit_if.valid && commit_if.ready; - wire [DATAW-1:0] execute_if_data_enq = { - execute_if.data.uuid, - execute_if.data.wid, - execute_if.data.tmask, - execute_if.data.PC, - execute_if.data.wb, - execute_if.data.rd - // pid/sop/eop set later - }; - wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; + wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; + wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; + wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; + wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; + wire [`NUM_WARPS-1:0] execute_if_data_wb; + wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; logic [DATAW-1:0] execute_if_data_new_rd; wire [`NUM_WARPS-1:0] metadata_queue_fulls; @@ -71,8 +67,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( .reset(reset), .push(enq), .pop(deq), - .data_in(execute_if_data_enq), - .data_out(execute_if_data_deq[i]), + .data_in({execute_if.data.uuid, execute_if.data.wid, + execute_if.data.tmask, execute_if.data.PC, + execute_if.data.wb, execute_if.data.rd}), + 
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i], + execute_if_data_tmask[i], execute_if_data_PC[i], + execute_if_data_wb[i], execute_if_data_rd[i]}), .empty(metadata_queue_emptys[i]), `UNUSED_PIN(alm_empty), .full(metadata_queue_fulls[i]), @@ -108,11 +108,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin metadata_deq = 1'b1; end - - // change rd of the commit data according to state - execute_if_data_new_rd = - {execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS], - (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))}; end always @(posedge clk) begin @@ -128,19 +123,18 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; - localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; - wire [COMMIT_DATAW-1:0] commit_if_data = { - // write-back to the correct rd only when eop - ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */ - wb_data, /* data */ - 1'b0, /* tensor */ - 1'b0, /* pid */ - 1'b1, /* sop */ - (state == 2'b11) /* eop */ - // 1'b1 /* eop */ - }; - - assign commit_if.data = commit_if_data; + assign commit_if.data.uuid = execute_if_data_uuid[0]; /* FIXME: queue index 0 is hard-coded here and for wid/tmask/PC below; the replaced code carried the same FIXME marker */ + assign commit_if.data.wid = execute_if_data_wid[0]; + assign commit_if.data.tmask = execute_if_data_tmask[0]; + assign commit_if.data.PC = execute_if_data_PC[0]; + assign commit_if.data.wb = (state == 2'b11); /* driven from state alone; the dequeued execute_if_data_wb is not consulted in the lines shown here */ + // custom rd + assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state)); /* NUM_IREGS plus the current state value; the dequeued execute_if_data_rd is not consulted in the lines shown here */ + assign commit_if.data.data = wb_data; /* wb_data is tied to '0 above */ + assign commit_if.data.tensor = (state == 2'b11); /* raised in the same state as wb and eop */ + assign commit_if.data.pid = 1'b0; + assign commit_if.data.sop = 1'b1; + assign commit_if.data.eop = (state == 2'b11); endmodule `endif