tensor: Simply metadata queue
Enqueue all different-warp reqs into the queue. There is a slight chance that an HGMMA_WAIT might be blocked from commit when there are multiple different-warp HGMMAs blocking the dequeue end, but it should be uncommon.
This commit is contained in:
@@ -24,65 +24,58 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
- wb
|
- wb
|
||||||
- rd
|
- rd
|
||||||
*/
|
*/
|
||||||
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
|
wire [`UUID_WIDTH-1:0] execute_if_data_uuid;
|
||||||
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
|
wire [`NW_WIDTH-1:0] execute_if_data_wid;
|
||||||
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
|
wire [NUM_LANES-1:0] execute_if_data_tmask;
|
||||||
wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
|
wire [`INST_ALU_BITS-1:0] execute_if_data_op_type;
|
||||||
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
|
wire [`XLEN-1:0] execute_if_data_PC;
|
||||||
wire [`NUM_WARPS-1:0] execute_if_data_wb;
|
wire execute_if_data_wb;
|
||||||
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
|
wire [`NR_BITS-1:0] execute_if_data_rd;
|
||||||
|
|
||||||
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
|
wire metadata_queue_full;
|
||||||
wire [`NUM_WARPS-1:0] metadata_queue_emptys;
|
wire metadata_queue_empty;
|
||||||
// OR not AND; we don't want any warp to be full
|
// OR not AND; we don't want any warp to be full
|
||||||
wire metadata_queue_full = |(metadata_queue_fulls);
|
|
||||||
assign execute_if.ready = !metadata_queue_full;
|
assign execute_if.ready = !metadata_queue_full;
|
||||||
|
|
||||||
`RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
|
|
||||||
("runtime error: WGMMA execute not supported for warps other than 0!"))
|
|
||||||
|
|
||||||
logic metadata_deq;
|
logic metadata_deq;
|
||||||
|
|
||||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
// Metadata queue for commit_if. This simply copies execute_if's
|
||||||
// Metadata queue for commit_if. This simply copies execute_if's
|
// metadata and pops them in conjunction with commit fire.
|
||||||
// metadata and pops them in conjunction with commit fire.
|
//
|
||||||
//
|
// Note both HGMMA and HGMMA_WAIT will be enqueued here, interleaved
|
||||||
// This has to be separated per-warp, as otherwise requests from
|
// between different warps. There is a slight chance that an HGMMA_WAIT
|
||||||
// multiple warps can be enqueued interleaved, which makes it hard to
|
// might be blocked from commit when there are multiple different-warp
|
||||||
// ensure two consecutive dequeues are associated with the same warp for
|
// HGMMAs blocking the dequeue end, so keep an eye on those cases.
|
||||||
// commit. (FIXME: this is not strictly necessary though.)
|
|
||||||
|
|
||||||
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
||||||
wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
|
wire enq = operand_enq_fire;
|
||||||
// FIXME: commit only warp 0
|
wire deq = metadata_deq;
|
||||||
wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
|
|
||||||
|
|
||||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
|
||||||
VX_fifo_queue #(
|
VX_fifo_queue #(
|
||||||
.DATAW(DATAW),
|
.DATAW(DATAW),
|
||||||
.DEPTH(METADATA_QUEUE_DEPTH)
|
.DEPTH(METADATA_QUEUE_DEPTH)
|
||||||
) pending_uops (
|
) pending_uops (
|
||||||
.clk(clk),
|
.clk(clk),
|
||||||
.reset(reset),
|
.reset(reset),
|
||||||
.push(enq),
|
.push(enq),
|
||||||
.pop(deq),
|
.pop(deq),
|
||||||
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
||||||
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
|
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
|
||||||
execute_if.data.wb, execute_if.data.rd}),
|
execute_if.data.wb, execute_if.data.rd}),
|
||||||
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i],
|
.data_out({execute_if_data_uuid, execute_if_data_wid,
|
||||||
execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
|
execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC,
|
||||||
execute_if_data_wb[i], execute_if_data_rd[i]}),
|
execute_if_data_wb, execute_if_data_rd}),
|
||||||
.empty(metadata_queue_emptys[i]),
|
.empty(metadata_queue_empty),
|
||||||
`UNUSED_PIN(alm_empty),
|
`UNUSED_PIN(alm_empty),
|
||||||
.full(metadata_queue_fulls[i]),
|
.full(metadata_queue_full),
|
||||||
`UNUSED_PIN(alm_full),
|
`UNUSED_PIN(alm_full),
|
||||||
`UNUSED_PIN(size)
|
`UNUSED_PIN(size)
|
||||||
);
|
);
|
||||||
end
|
|
||||||
|
|
||||||
// NOTE: this is not an error but tells us if backend doesn't keep up with
|
// NOTE: this is not an error but tells us if backend doesn't keep up with
|
||||||
// HGMMA calls from the kernel
|
// HGMMA calls from the kernel
|
||||||
`RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
|
// `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
|
||||||
|
|
||||||
wire initiate_ready;
|
wire initiate_ready;
|
||||||
wire writeback_valid;
|
wire writeback_valid;
|
||||||
@@ -92,12 +85,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
logic writeback_ready;
|
logic writeback_ready;
|
||||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data;
|
wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data;
|
||||||
|
|
||||||
wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
|
wire metadata_valid = !metadata_queue_empty;
|
||||||
wire hmma_wait = metadata_valid &&
|
wire hmma_wait = metadata_valid &&
|
||||||
(execute_if_data_op_type[0/*FIXME*/] == `INST_TENSOR_HGMMA_WAIT);
|
(execute_if_data_op_type == `INST_TENSOR_HGMMA_WAIT);
|
||||||
// skip HGMMA_WAIT for kickoff
|
// skip HGMMA_WAIT for kickoff
|
||||||
wire initiate_valid = metadata_valid && !hmma_wait;
|
wire initiate_valid = metadata_valid && !hmma_wait;
|
||||||
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid[0/*FIXME*/];
|
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
|
||||||
|
|
||||||
// we're recycling execute_if.op_type as operands_if.op_type which might
|
// we're recycling execute_if.op_type as operands_if.op_type which might
|
||||||
// have a different width; let's be safe
|
// have a different width; let's be safe
|
||||||
@@ -216,12 +209,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
commit_if.data.eop = writeback_last;
|
commit_if.data.eop = writeback_last;
|
||||||
end else begin
|
end else begin
|
||||||
commit_if.valid = metadata_valid;
|
commit_if.valid = metadata_valid;
|
||||||
commit_if.data.uuid = execute_if_data_uuid[0];
|
commit_if.data.uuid = execute_if_data_uuid;
|
||||||
commit_if.data.wid = execute_if_data_wid[0];
|
commit_if.data.wid = execute_if_data_wid;
|
||||||
commit_if.data.tmask = execute_if_data_tmask[0];
|
commit_if.data.tmask = execute_if_data_tmask;
|
||||||
commit_if.data.PC = execute_if_data_PC[0];
|
commit_if.data.PC = execute_if_data_PC;
|
||||||
commit_if.data.wb = execute_if_data_wb[0];
|
commit_if.data.wb = execute_if_data_wb;
|
||||||
commit_if.data.rd = execute_if_data_rd[0];
|
commit_if.data.rd = execute_if_data_rd;
|
||||||
commit_if.data.data = '0; // can be arbitrary as rd is zero
|
commit_if.data.data = '0; // can be arbitrary as rd is zero
|
||||||
commit_if.data.tensor = 1'b0;
|
commit_if.data.tensor = 1'b0;
|
||||||
commit_if.data.pid = 1'b0;
|
commit_if.data.pid = 1'b0;
|
||||||
|
|||||||
Reference in New Issue
Block a user