diff --git a/hw/rtl/core/VX_ibuffer.sv b/hw/rtl/core/VX_ibuffer.sv
index b8dc2a36..7ed5c64a 100644
--- a/hw/rtl/core/VX_ibuffer.sv
+++ b/hw/rtl/core/VX_ibuffer.sv
@@ -74,6 +74,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
         assign decode_if.ibuf_pop[i] = uop_sequencer_if[i].valid && uop_sequencer_if[i].ready;
     `endif
 
+        // a tensor-core operation is controlled by a single macro-instruction at
+        // the ISA level; internally, the uop_sequencer blitzes micro-ops (counterpart
+        // to Volta SASS set/step instructions) into the ibuffer upon encountering
+        // this macro-instruction.  the sequencer acts as a pass-through for
+        // non-tensor-core instructions.
         VX_uop_sequencer uop_sequencer (
             .clk(clk),
             .reset(reset),
diff --git a/hw/rtl/core/VX_reduce_unit.sv b/hw/rtl/core/VX_reduce_unit.sv
index b3000acb..8522f8d1 100644
--- a/hw/rtl/core/VX_reduce_unit.sv
+++ b/hw/rtl/core/VX_reduce_unit.sv
@@ -27,6 +27,7 @@ module VX_reduce_ext #(
     input wire [`INST_RED_BITS-1:0]  op_type,
     output wire [DATAW_OUT-1:0]      data_out
 );
+    // recursive binary reduction
     if (N == 1) begin
         `UNUSED_VAR(op_type)
         `UNUSED_VAR(mask)
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index a08498d1..546a1da1 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -211,12 +211,11 @@ module VX_tensor_octet #(
     // half the inputs are buffered, half are not (instead coming straight
     // from operand bus) unlike the real tensor core.
     // the banks are only 32 bit rather than 64 bit (a pair of fp32 regs).
-    // since A and B are supplied by 4 lanes each, we get 4 fp32's at a time
-    // (8 for C).
     logic [3:0][31:0] A_half;
     logic [3:0][31:0] B_half;
     logic [7:0][31:0] C_half;
     always @(*) begin
+        // note that not all lanes participate at every step
         case (step)
             2'b00: begin
                 A_half = { A_in[5:4], A_in[1:0] };
@@ -268,7 +267,6 @@ module VX_tensor_octet #(
         end
     end
 
-    
     wire stall = result_valid && ~result_ready;
     assign operands_ready = ~stall;
 
diff --git a/hw/rtl/core/VX_tensor_ucode.vh b/hw/rtl/core/VX_tensor_ucode.vh
index 8c3243de..5776aaa8 100644
--- a/hw/rtl/core/VX_tensor_ucode.vh
+++ b/hw/rtl/core/VX_tensor_ucode.vh
@@ -1,3 +1,4 @@
+// uop field layout, MSB first: uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
 HMMA_SET0_STEP0_0: begin 
 	uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)}; 
 end 
diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
index b4785372..24b5af3c 100644
--- a/hw/rtl/core/VX_uop_sequencer.sv
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -119,6 +119,7 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
         uop[UOP_TABLE_WIDTH-UBR_BITS-UPC_BITS-1:0]
     };
 
+    // passthrough when !use_uop
     assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid;
     assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready;
     assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;