tensor: Write release logic for hgmma
Upon completion of an op, tensor_core_hopper sends a "ghost" commit signal down the pipeline with the `wb` and `tensor` bits set in `commit_if`. The scoreboard receives this signal via `writeback_if` and resets the warp's `inuse_tensor` status bit to zero, which unblocks the HGMMA_WAIT instruction.
This commit is contained in:
@@ -146,7 +146,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
// have an explicit destination register, use a separate status bit.
|
// have an explicit destination register, use a separate status bit.
|
||||||
reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
|
reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
|
||||||
|
|
||||||
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
|
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
|
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
|
||||||
|
|
||||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||||
@@ -213,7 +213,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_HOPPER
|
||||||
wire hgmma_wait = ibuffer_if[i].valid &&
|
wire hgmma_wait = ibuffer_if[i].valid &&
|
||||||
(ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
|
(ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
|
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
|
||||||
wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
|
wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
|
||||||
wire operands_ready = (~(| operands_busy)) && hgmma_ready;
|
wire operands_ready = (~(| operands_busy)) && hgmma_ready;
|
||||||
@@ -250,6 +250,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
||||||
end
|
end
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_HOPPER
|
||||||
|
if (writeback_fire && writeback_if[i].data.tensor) begin
|
||||||
|
// Release the tensor-busy bit of the warp that actually committed: index by
// the writeback's wis, not by whichever instruction is at the ibuffer head.
// NOTE(review): assumes writeback_if data carries `wis` -- confirm.
inuse_tensor[writeback_if[i].data.wis] <= 1'b0;
|
||||||
|
end
|
||||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
|
if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
|
||||||
inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
|
inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
VX_execute_if.slave execute_if,
|
VX_execute_if.slave execute_if,
|
||||||
VX_commit_if.master commit_if
|
VX_commit_if.master commit_if
|
||||||
);
|
);
|
||||||
|
localparam NUM_LANES = `NUM_THREADS;
|
||||||
localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary
|
localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary
|
||||||
|
|
||||||
/* commit_if.data_t parts that we need to keep around:
|
/* commit_if.data_t parts that we need to keep around:
|
||||||
@@ -21,22 +22,17 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
- wb
|
- wb
|
||||||
- rd
|
- rd
|
||||||
*/
|
*/
|
||||||
|
|
||||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
|
||||||
|
|
||||||
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
||||||
wire commit_if_fire = commit_if.valid && commit_if.ready;
|
wire commit_if_fire = commit_if.valid && commit_if.ready;
|
||||||
wire [DATAW-1:0] execute_if_data_enq = {
|
|
||||||
execute_if.data.uuid,
|
|
||||||
execute_if.data.wid,
|
|
||||||
execute_if.data.tmask,
|
|
||||||
execute_if.data.PC,
|
|
||||||
execute_if.data.wb,
|
|
||||||
execute_if.data.rd
|
|
||||||
// pid/sop/eop set later
|
|
||||||
};
|
|
||||||
|
|
||||||
wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
|
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
|
||||||
|
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
|
||||||
|
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
|
||||||
|
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
|
||||||
|
wire [`NUM_WARPS-1:0] execute_if_data_wb;
|
||||||
|
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
|
||||||
logic [DATAW-1:0] execute_if_data_new_rd;
|
logic [DATAW-1:0] execute_if_data_new_rd;
|
||||||
|
|
||||||
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
|
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
|
||||||
@@ -71,8 +67,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
.reset(reset),
|
.reset(reset),
|
||||||
.push(enq),
|
.push(enq),
|
||||||
.pop(deq),
|
.pop(deq),
|
||||||
.data_in(execute_if_data_enq),
|
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
||||||
.data_out(execute_if_data_deq[i]),
|
execute_if.data.tmask, execute_if.data.PC,
|
||||||
|
execute_if.data.wb, execute_if.data.rd}),
|
||||||
|
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i],
|
||||||
|
execute_if_data_tmask[i], execute_if_data_PC[i],
|
||||||
|
execute_if_data_wb[i], execute_if_data_rd[i]}),
|
||||||
.empty(metadata_queue_emptys[i]),
|
.empty(metadata_queue_emptys[i]),
|
||||||
`UNUSED_PIN(alm_empty),
|
`UNUSED_PIN(alm_empty),
|
||||||
.full(metadata_queue_fulls[i]),
|
.full(metadata_queue_fulls[i]),
|
||||||
@@ -108,11 +108,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin
|
if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin
|
||||||
metadata_deq = 1'b1;
|
metadata_deq = 1'b1;
|
||||||
end
|
end
|
||||||
|
|
||||||
// change rd of the commit data according to state
|
|
||||||
execute_if_data_new_rd =
|
|
||||||
{execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS],
|
|
||||||
(`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))};
|
|
||||||
end
|
end
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
@@ -128,19 +123,18 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
|
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
|
||||||
|
|
||||||
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
assign commit_if.data.uuid = execute_if_data_uuid[0];
|
||||||
wire [COMMIT_DATAW-1:0] commit_if_data = {
|
assign commit_if.data.wid = execute_if_data_wid[0];
|
||||||
// write-back to the correct rd only when eop
|
assign commit_if.data.tmask = execute_if_data_tmask[0];
|
||||||
((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */
|
assign commit_if.data.PC = execute_if_data_PC[0];
|
||||||
wb_data, /* data */
|
assign commit_if.data.wb = (state == 2'b11);
|
||||||
1'b0, /* tensor */
|
// custom rd
|
||||||
1'b0, /* pid */
|
assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state));
|
||||||
1'b1, /* sop */
|
assign commit_if.data.data = wb_data;
|
||||||
(state == 2'b11) /* eop */
|
assign commit_if.data.tensor = (state == 2'b11);
|
||||||
// 1'b1 /* eop */
|
assign commit_if.data.pid = 1'b0;
|
||||||
};
|
assign commit_if.data.sop = 1'b1;
|
||||||
|
assign commit_if.data.eop = (state == 2'b11);
|
||||||
assign commit_if.data = commit_if_data;
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|
||||||
`endif
|
`endif
|
||||||
|
|||||||
Reference in New Issue
Block a user