From 9caafb2d8a153f84e88c8134bb5e6423c6fbd044 Mon Sep 17 00:00:00 2001
From: Hansung Kim
Date: Fri, 31 May 2024 19:17:56 -0700
Subject: [PATCH] tensor: Decode rd of macro-op to designate additional accumulator

This is useful when you want to have the tensor core output to multiple
accumulator registers, e.g. when doing outer product within the RF.

When the rd field of the macro-op equals 1, the rd and rs3 register
indices of the uops issued by the sequencer are offset by 8.
---
 hw/rtl/core/VX_decode.sv        |  6 ++++++
 hw/rtl/core/VX_uop_sequencer.sv | 24 ++++++++++++++++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index 6f4539e7..2ca414cd 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -545,6 +545,12 @@ module VX_decode #(
             `INST_EXT4: begin
                 ex_type = `EX_TENSOR;
                 op_type = `INST_TENSOR_HMMA;
+                // tensor core macroop is encoded as r-type
+                use_rd = 1;
+                `USED_IREG (rd);
+                `USED_IREG (rs1);
+                `USED_IREG (rs2);
+                `USED_IREG (rs3);
             end
         `endif
             default:;
diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
index 24b5af3c..130866de 100644
--- a/hw/rtl/core/VX_uop_sequencer.sv
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -14,10 +14,9 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
     localparam UOP_TABLE_SIZE = 64;
     localparam UPC_BITS = `CLOG2(UOP_TABLE_SIZE);
 
-    localparam NEXT = 2'b00;
-    localparam FINISH = 2'b01;
-
     localparam UBR_BITS = 2;
+    localparam NEXT = UBR_BITS'(2'b00);
+    localparam FINISH = UBR_BITS'(2'b01);
 
     // uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
     localparam UOP_TABLE_WIDTH = UBR_BITS + UPC_BITS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + (`NR_BITS * 4);
@@ -122,7 +121,24 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
     // passthrough when !use_uop
     assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid;
     assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready;
-    assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;
+
+    // The rd field of the incoming macro-op designates the accumulator:
+    // when it equals ALT_ACC_SEL, rd and rs3 of the outgoing uop are
+    // offset by ALT_ACC_REG_OFFSET registers.
+    localparam ALT_ACC_SEL        = `NR_BITS'(1);
+    localparam ALT_ACC_REG_OFFSET = `NR_BITS'(8);
+
+    always @(*) begin
+        ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;
+
+        if (uop_sequencer_if.valid && use_uop &&
+            uop_sequencer_if.data.rd == ALT_ACC_SEL) begin
+            // Blocking assignments: the rd/rs3 values read on the right-hand
+            // side are the ones assigned from ibuffer_output just above.
+            ibuffer_if.data.rd  = ibuffer_if.data.rd  + ALT_ACC_REG_OFFSET;
+            ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + ALT_ACC_REG_OFFSET;
+        end
+    end
 
     always @(posedge clk) begin
         if (uop_start) begin