diff --git a/hw/rtl/core/VX_ibuffer.sv b/hw/rtl/core/VX_ibuffer.sv index b8dc2a36..7ed5c64a 100644 --- a/hw/rtl/core/VX_ibuffer.sv +++ b/hw/rtl/core/VX_ibuffer.sv @@ -74,6 +74,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #( assign decode_if.ibuf_pop[i] = uop_sequencer_if[i].valid && uop_sequencer_if[i].ready; `endif + // tensor-core operation is controlled by a single macro-instruction at + // the ISA; internally, the uop_sequencer blitzes micro-ops (counterpart + // to Volta SASS set/step instructions) into the ibuffer upon encountering + // this macro-instruction. this becomes a pass-through for non-tensor-core + // instructions. VX_uop_sequencer uop_sequencer ( .clk(clk), .reset(reset), diff --git a/hw/rtl/core/VX_reduce_unit.sv b/hw/rtl/core/VX_reduce_unit.sv index b3000acb..8522f8d1 100644 --- a/hw/rtl/core/VX_reduce_unit.sv +++ b/hw/rtl/core/VX_reduce_unit.sv @@ -27,6 +27,7 @@ module VX_reduce_ext #( input wire [`INST_RED_BITS-1:0] op_type, output wire [DATAW_OUT-1:0] data_out ); + // recursive binary reduction if (N == 1) begin `UNUSED_VAR(op_type) `UNUSED_VAR(mask) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index a08498d1..546a1da1 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -211,12 +211,11 @@ module VX_tensor_octet #( // half the inputs are buffered, half are not (instead coming straight // from operand bus) unlike the real tensor core. // the banks are only 32 bit rather than 64 bit (a pair of fp32 regs). - // since A and B are supplied by 4 lanes each, we get 4 fp32's at a time - // (8 for C). 
logic [3:0][31:0] A_half; logic [3:0][31:0] B_half; logic [7:0][31:0] C_half; always @(*) begin + // note that not all lanes participate at every step case (step) 2'b00: begin A_half = { A_in[5:4], A_in[1:0] }; @@ -268,7 +267,6 @@ module VX_tensor_octet #( end end - wire stall = result_valid && ~result_ready; assign operands_ready = ~stall; diff --git a/hw/rtl/core/VX_tensor_ucode.vh b/hw/rtl/core/VX_tensor_ucode.vh index 8c3243de..5776aaa8 100644 --- a/hw/rtl/core/VX_tensor_ucode.vh +++ b/hw/rtl/core/VX_tensor_ucode.vh @@ -1,3 +1,4 @@ +// uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3 HMMA_SET0_STEP0_0: begin uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)}; end diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv index b4785372..24b5af3c 100644 --- a/hw/rtl/core/VX_uop_sequencer.sv +++ b/hw/rtl/core/VX_uop_sequencer.sv @@ -119,6 +119,7 @@ module VX_uop_sequencer import VX_gpu_pkg::*; ( uop[UOP_TABLE_WIDTH-UBR_BITS-UPC_BITS-1:0] }; + // passthrough when !use_uop assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid; assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready; assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;