seemingly working fp32 implementation

This commit is contained in:
joshua
2024-03-19 17:56:59 -07:00
parent beb3dce46d
commit 978dd3bdfe
9 changed files with 4450 additions and 0 deletions

View File

@@ -385,6 +385,11 @@
`define LATENCY_FCVT 5
`endif
// Tensor Core Latency
// Default depth of the delay line in VX_tensor_dpu (passed there as the
// VX_shift_register DEPTH parameter). The `ifndef guard means a value defined
// earlier (e.g. on the tool command line) takes precedence over this default.
// NOTE(review): presumably counted in clock cycles -- confirm against
// VX_shift_register.
`ifndef LATENCY_HMMA
`define LATENCY_HMMA 4
`endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable

View File

View File

@@ -0,0 +1,35 @@
`include "VX_fpu_define.vh"
// VX_tensor_dpu
//
// Tensor-core datapath model backed by a DPI routine. The output tile is
// produced combinationally by dpi_hmma() and then pushed, together with the
// valid flag, through a VX_shift_register whose DEPTH is `LATENCY_HMMA.
//
// Ports:
//   clk, reset : forwarded to the delay line
//   valid_in   : first argument of dpi_hmma(); also delayed to form valid_out
//   A_tile     : 4 x 2 packed array of 32-bit elements
//   B_tile     : 2 x 4 packed array of 32-bit elements
//   C_tile     : 4 x 4 packed array of 32-bit elements
//   valid_out  : valid_in after the delay line
//   D_tile     : 4 x 4 packed array of 32-bit elements, i.e. the dpi_hmma()
//                result after the delay line
//
// NOTE(review): presumably computes D = A * B + C on fp32 elements -- confirm
// against the dpi_hmma implementation, which is not visible here.
module VX_tensor_dpu #(
) (
    input                    clk,
    input                    reset,

    input                    valid_in,
    input  [3:0][1:0][31:0]  A_tile,
    input  [1:0][3:0][31:0]  B_tile,
    input  [3:0][3:0][31:0]  C_tile,

    output                   valid_out,
    output [3:0][3:0][31:0]  D_tile
);
    // Undelayed result tile written by the DPI call.
    logic [3:0][3:0][31:0] hmma_tile;

    // Re-evaluated whenever any of the call's inputs change.
    always @(*) dpi_hmma(valid_in, A_tile, B_tile, C_tile, hmma_tile);

    // Delay line for {valid, tile}. The valid flag sits in the MSB; enable is
    // tied high, so there is no stall input on this unit.
    // NOTE(review): RESETW = 1 presumably resets only that top (valid) bit --
    // confirm against VX_shift_register.
    VX_shift_register #(
        .DEPTH  (`LATENCY_HMMA),
        .RESETW (1),
        .DATAW  (1 + $bits(D_tile))
    ) shift_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (1'b1),
        .data_out ({valid_out, D_tile}),
        .data_in  ({valid_in, hmma_tile})
    );

endmodule

View File

@@ -0,0 +1,28 @@
`include "VX_fpu_define.vh"
// VX_tensor_tb
//
// Thin wrapper around VX_tensor_dpu: every port of this module is wired
// straight through to the identically named port of the single instance
// below. No logic is added here.
// NOTE(review): presumably serves as the simulation top level for the tensor
// unit -- confirm against the test harness.
module VX_tensor_tb (
    input                    clk,
    input                    reset,

    input                    valid_in,
    input  [3:0][1:0][31:0]  A_tile,
    input  [1:0][3:0][31:0]  B_tile,
    input  [3:0][3:0][31:0]  C_tile,

    output                   valid_out,
    output [3:0][3:0][31:0]  D_tile
);

    // Implicit named connections: each `.x` binds instance port x to the
    // local signal of the same name.
    VX_tensor_dpu #() tensor_core (
        .clk,
        .reset,
        .valid_in,
        .A_tile,
        .B_tile,
        .C_tile,
        .valid_out,
        .D_tile
    );

endmodule