tensor: Write release logic for hgmma

Upon completion of an op, tensor_core_hopper sends a "ghost" commit
signal down the pipeline with the `wb` and `tensor` bits set in
commit_if.  The scoreboard receives this signal via writeback_if and
resets the inuse_tensor status bit back to zero, which unblocks the
HGMMA_WAIT instruction.
This commit is contained in:
Hansung Kim
2024-10-11 17:27:51 -07:00
parent 408a9b5d2a
commit 42b9d23f83
2 changed files with 30 additions and 33 deletions

View File

@@ -146,7 +146,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
// have an explicit destination register, use a separate status bit. // have an explicit destination register, use a separate status bit.
reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor; reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) && wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA); (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
@@ -213,7 +213,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
`ifdef EXT_T_HOPPER `ifdef EXT_T_HOPPER
wire hgmma_wait = ibuffer_if[i].valid && wire hgmma_wait = ibuffer_if[i].valid &&
(ibuffer_if[i].data.ex_type == `EX_TENSOR) && (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT); (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]); wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
wire operands_ready = (~(| operands_busy)) && hgmma_ready; wire operands_ready = (~(| operands_busy)) && hgmma_ready;
@@ -250,6 +250,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1; inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
end end
`ifdef EXT_T_HOPPER `ifdef EXT_T_HOPPER
// Release the per-warp tensor busy bit when a writeback flagged `tensor`
// fires. The bit is indexed by the warp carried in the writeback packet,
// not by the ibuffer head: the instruction at the ibuffer head in this
// cycle may belong to a different warp than the one being written back.
// NOTE(review): assumes writeback_if[i].data exposes a `wis` field like
// ibuffer_if[i].data does -- confirm against the interface definition.
if (writeback_fire && writeback_if[i].data.tensor) begin
    inuse_tensor[writeback_if[i].data.wis] <= 1'b0;
end
if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1; inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
end end

View File

@@ -11,6 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
VX_execute_if.slave execute_if, VX_execute_if.slave execute_if,
VX_commit_if.master commit_if VX_commit_if.master commit_if
); );
localparam NUM_LANES = `NUM_THREADS;
localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary
/* commit_if.data_t parts that we need to keep around: /* commit_if.data_t parts that we need to keep around:
@@ -21,22 +22,17 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
- wb - wb
- rd - rd
*/ */
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
wire operand_enq_fire = execute_if.valid && execute_if.ready; wire operand_enq_fire = execute_if.valid && execute_if.ready;
wire commit_if_fire = commit_if.valid && commit_if.ready; wire commit_if_fire = commit_if.valid && commit_if.ready;
wire [DATAW-1:0] execute_if_data_enq = {
execute_if.data.uuid,
execute_if.data.wid,
execute_if.data.tmask,
execute_if.data.PC,
execute_if.data.wb,
execute_if.data.rd
// pid/sop/eop set later
};
wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
wire [`NUM_WARPS-1:0] execute_if_data_wb;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
logic [DATAW-1:0] execute_if_data_new_rd; logic [DATAW-1:0] execute_if_data_new_rd;
wire [`NUM_WARPS-1:0] metadata_queue_fulls; wire [`NUM_WARPS-1:0] metadata_queue_fulls;
@@ -71,8 +67,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
.reset(reset), .reset(reset),
.push(enq), .push(enq),
.pop(deq), .pop(deq),
.data_in(execute_if_data_enq), .data_in({execute_if.data.uuid, execute_if.data.wid,
.data_out(execute_if_data_deq[i]), execute_if.data.tmask, execute_if.data.PC,
execute_if.data.wb, execute_if.data.rd}),
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i],
execute_if_data_tmask[i], execute_if_data_PC[i],
execute_if_data_wb[i], execute_if_data_rd[i]}),
.empty(metadata_queue_emptys[i]), .empty(metadata_queue_emptys[i]),
`UNUSED_PIN(alm_empty), `UNUSED_PIN(alm_empty),
.full(metadata_queue_fulls[i]), .full(metadata_queue_fulls[i]),
@@ -108,11 +108,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin
metadata_deq = 1'b1; metadata_deq = 1'b1;
end end
// change rd of the commit data according to state
execute_if_data_new_rd =
{execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS],
(`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))};
end end
always @(posedge clk) begin always @(posedge clk) begin
@@ -128,19 +123,18 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; assign commit_if.data.uuid = execute_if_data_uuid[0];
wire [COMMIT_DATAW-1:0] commit_if_data = { assign commit_if.data.wid = execute_if_data_wid[0];
// write-back to the correct rd only when eop assign commit_if.data.tmask = execute_if_data_tmask[0];
((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */ assign commit_if.data.PC = execute_if_data_PC[0];
wb_data, /* data */ assign commit_if.data.wb = (state == 2'b11);
1'b0, /* tensor */ // custom rd
1'b0, /* pid */ assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state));
1'b1, /* sop */ assign commit_if.data.data = wb_data;
(state == 2'b11) /* eop */ assign commit_if.data.tensor = (state == 2'b11);
// 1'b1 /* eop */ assign commit_if.data.pid = 1'b0;
}; assign commit_if.data.sop = 1'b1;
assign commit_if.data.eop = (state == 2'b11);
assign commit_if.data = commit_if_data;
endmodule endmodule
`endif `endif