From 78df981366778e394e4db62bfdc14c916ddc9f62 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 22:01:18 -0700 Subject: [PATCH] tensor: Simply metadata queue Enqueue all different-warp reqs into the queue. There is a slight chance that an HGMMA_WAIT might be blocked from commit when there are multiple different-warp HGMMAs blocking the dequeue end, but it should be uncommon. --- hw/rtl/core/VX_tensor_hopper_core.sv | 107 +++++++++++++-------------- 1 file changed, 50 insertions(+), 57 deletions(-) diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index b03b0b0e..ad79ac1a 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -24,65 +24,58 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( - wb - rd */ - wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; - wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; - wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; - wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type; - wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; - wire [`NUM_WARPS-1:0] execute_if_data_wb; - wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; + wire [`UUID_WIDTH-1:0] execute_if_data_uuid; + wire [`NW_WIDTH-1:0] execute_if_data_wid; + wire [NUM_LANES-1:0] execute_if_data_tmask; + wire [`INST_ALU_BITS-1:0] execute_if_data_op_type; + wire [`XLEN-1:0] execute_if_data_PC; + wire execute_if_data_wb; + wire [`NR_BITS-1:0] execute_if_data_rd; - wire [`NUM_WARPS-1:0] metadata_queue_fulls; - wire [`NUM_WARPS-1:0] metadata_queue_emptys; + wire metadata_queue_full; + wire metadata_queue_empty; // OR not AND; we don't want any warp to be full - wire metadata_queue_full = |(metadata_queue_fulls); assign execute_if.ready = !metadata_queue_full; - `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)), - ("runtime error: WGMMA execute not supported for warps other than 0!")) - logic metadata_deq; - for (genvar i = 0; i < `NUM_WARPS; i++) begin - // Metadata queue for commit_if. This simply copies execute_if's - // metadata and pops them in conjunction with commit fire. - // - // This has to be separated per-warp, as otherwise requests from - // multiple warps can be enqueued interleaved, which makes it hard to - // ensure two consecutive dequeues are associated with the same warp for - // commit. (FIXME: this is not strictly necessary though.) + // Metadata queue for commit_if. This simply copies execute_if's + // metadata and pops them in conjunction with commit fire. + // + // Note both HGMMA and HGMMA_WAIT will be enqueued here, interleaved + // between different warps. There is a slight chance that an HGMMA_WAIT + // might be blocked from commit when there are multiple different-warp + // HGMMAs blocking the dequeue end, so keep an eye on those cases. - wire operand_enq_fire = execute_if.valid && execute_if.ready; - wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i)); - // FIXME: commit only warp 0 - wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); + wire operand_enq_fire = execute_if.valid && execute_if.ready; + wire enq = operand_enq_fire; + wire deq = metadata_deq; - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS; - VX_fifo_queue #( - .DATAW(DATAW), - .DEPTH(METADATA_QUEUE_DEPTH) - ) pending_uops ( - .clk(clk), - .reset(reset), - .push(enq), - .pop(deq), - .data_in({execute_if.data.uuid, execute_if.data.wid, - execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC, - execute_if.data.wb, execute_if.data.rd}), - .data_out({execute_if_data_uuid[i], execute_if_data_wid[i], - execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i], - execute_if_data_wb[i], execute_if_data_rd[i]}), - .empty(metadata_queue_emptys[i]), - `UNUSED_PIN(alm_empty), - .full(metadata_queue_fulls[i]), - `UNUSED_PIN(alm_full), - `UNUSED_PIN(size) - ); - end + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS; + VX_fifo_queue #( + .DATAW(DATAW), + .DEPTH(METADATA_QUEUE_DEPTH) + ) pending_uops ( + .clk(clk), + .reset(reset), + .push(enq), + .pop(deq), + .data_in({execute_if.data.uuid, execute_if.data.wid, + execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC, + execute_if.data.wb, execute_if.data.rd}), + .data_out({execute_if_data_uuid, execute_if_data_wid, + execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC, + execute_if_data_wb, execute_if_data_rd}), + .empty(metadata_queue_empty), + `UNUSED_PIN(alm_empty), + .full(metadata_queue_full), + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) + ); // NOTE: this is not an error but tells us if backend doesn't keep up with // HGMMA calls from the kernel - `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) + // `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) wire initiate_ready; wire writeback_valid; @@ -92,12 +85,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( logic writeback_ready; wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data; - wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/]; + wire metadata_valid = !metadata_queue_empty; wire hmma_wait = metadata_valid && - (execute_if_data_op_type[0/*FIXME*/] == `INST_TENSOR_HGMMA_WAIT); + (execute_if_data_op_type == `INST_TENSOR_HGMMA_WAIT); // skip HGMMA_WAIT for kickoff wire initiate_valid = metadata_valid && !hmma_wait; - wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid[0/*FIXME*/]; + wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid; // we're recycling execute_if.op_type as operands_if.op_type which might // have a different width; let's be safe @@ -216,12 +209,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( commit_if.data.eop = writeback_last; end else begin commit_if.valid = metadata_valid; - commit_if.data.uuid = execute_if_data_uuid[0]; - commit_if.data.wid = execute_if_data_wid[0]; - commit_if.data.tmask = execute_if_data_tmask[0]; - commit_if.data.PC = execute_if_data_PC[0]; - commit_if.data.wb = execute_if_data_wb[0]; - commit_if.data.rd = execute_if_data_rd[0]; + commit_if.data.uuid = execute_if_data_uuid; + commit_if.data.wid = execute_if_data_wid; + commit_if.data.tmask = execute_if_data_tmask; + commit_if.data.PC = execute_if_data_PC; + commit_if.data.wb = execute_if_data_wb; + commit_if.data.rd = execute_if_data_rd; commit_if.data.data = '0; // can be arbitrary as rd is zero commit_if.data.tensor = 1'b0; commit_if.data.pid = 1'b0;