seemingly working fp32 implementation

2024-03-19 17:56:59 -07:00
parent beb3dce46d
commit 978dd3bdfe
9 changed files with 4450 additions and 0 deletions
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -385,6 +385,11 @@
 `define LATENCY_FCVT 5
 `endif

+// Tensor core HMMA latency: used as the shift-register DEPTH in VX_tensor_dpu (presumably clock cycles - confirm). Defaults to 4 unless LATENCY_HMMA is already defined.
+`ifndef LATENCY_HMMA
+`define LATENCY_HMMA 4
+`endif
+
 // Icache Configurable Knobs //////////////////////////////////////////////////

 // Cache Enable
--- a/hw/rtl/fpu/VX_tensor_core.sv
+++ b/hw/rtl/fpu/VX_tensor_core.sv
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -0,0 +1,35 @@
+`include "VX_fpu_define.vh"
+
+module VX_tensor_dpu #( // Behavioral tensor unit: calls dpi_hmma on the A/B/C tiles and delays {valid, result} through a LATENCY_HMMA-deep VX_shift_register.
+    // No parameters are declared.
+) (
+    input clk,   // clock, forwarded to the shift register
+    input reset, // reset, forwarded to the shift register
+    // Operands: packed arrays of 32-bit elements (presumably fp32, per the commit title - confirm against dpi_hmma).
+    input valid_in, // passed to dpi_hmma as its first argument and pipelined alongside the result
+    input [3:0][1:0][31:0] A_tile, // 4 x 2 elements
+    input [1:0][3:0][31:0] B_tile, // 2 x 4 elements
+    input [3:0][3:0][31:0] C_tile, // 4 x 4 elements (presumably the accumulator input - TODO confirm)
+    // Results: both outputs come straight from the shift register output.
+    output valid_out, // valid_in as it emerges from the shift register
+    output [3:0][3:0][31:0] D_tile // 4 x 4 elements; result_hmma as it emerges from the shift register
+);
+    logic [3:0][3:0][31:0] result_hmma; // undelayed dpi_hmma result, same dimensions as D_tile
+    // dpi_hmma is not declared in this file (presumably a DPI-C import - TODO confirm); its last argument, result_hmma, is used here as the output.
+    always @(*) begin
+        dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma);
+    end
+    // Delay line for {valid, result}. enable is tied to 1'b1, so this module never stalls the shift register.
+    // RESETW is 1 and the valid bit is the MSB of the payload; presumably only that bit is reset - confirm against VX_shift_register.
+    VX_shift_register #(
+        .DATAW  (1 + $bits(D_tile)), // 1 valid bit + every bit of the result tile
+        .DEPTH  (`LATENCY_HMMA),     // macro defaults to 4 in VX_config.vh
+        .RESETW (1)
+    ) shift_reg (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (1'b1),
+        .data_in  ({valid_in, result_hmma}), // valid bit in the MSB, result tile below it
+        .data_out ({valid_out, D_tile})      // unpacked in the same order as data_in
+    );
+endmodule
--- a/hw/rtl/fpu/VX_tensor_tb.sv
+++ b/hw/rtl/fpu/VX_tensor_tb.sv
@@ -0,0 +1,28 @@
+`include "VX_fpu_define.vh"
+
+module VX_tensor_tb( // Thin wrapper: instantiates VX_tensor_dpu and wires every port one-to-one (presumably the top level for a simulation testbench - confirm).
+    input clk,
+    input reset,
+    // Operand side: forwarded unchanged to the VX_tensor_dpu instance.
+    input valid_in,
+    input [3:0][1:0][31:0] A_tile, // 4 x 2 packed array of 32-bit elements
+    input [1:0][3:0][31:0] B_tile, // 2 x 4 packed array of 32-bit elements
+    input [3:0][3:0][31:0] C_tile, // 4 x 4 packed array of 32-bit elements
+    // Result side: driven directly by the VX_tensor_dpu instance.
+    output valid_out,
+    output [3:0][3:0][31:0] D_tile // 4 x 4 packed array of 32-bit elements
+);
+    // The only content of this module: one VX_tensor_dpu instance with no parameter overrides.
+    VX_tensor_dpu #() tensor_core (
+        .clk(clk),
+        .reset(reset),
+        // operands
+        .valid_in(valid_in),
+        .A_tile(A_tile),
+        .B_tile(B_tile),
+        .C_tile(C_tile),
+        // results
+        .valid_out(valid_out),
+        .D_tile(D_tile)
+    );
+endmodule