tensor: Share B operand buffer between threadgroups

Both threadgroups use the same B fragment, so there is no need to store
it twice in the operand buffer.  To avoid the duplication, move the
operand buffer out of the threadgroups and up to the octet level.
This commit is contained in:
Hansung Kim
2024-07-27 20:32:44 -07:00
parent 7ad3f64528
commit 4e0dcdadac
2 changed files with 123 additions and 109 deletions

View File

@@ -287,7 +287,7 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
end
VX_tensor_reg #(
.N(1)
.DATAW(1)
) staging_subcommit (
.clk(clk),
.reset(reset),
@@ -298,15 +298,15 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
endmodule
module VX_tensor_reg #(
parameter N
parameter DATAW
) (
input clk,
input reset,
input [N-1:0] d,
input [DATAW-1:0] d,
input en,
output [N-1:0] q
output [DATAW-1:0] q
);
logic [N-1:0] data;
logic [DATAW-1:0] data;
always @(posedge clk) begin
if (reset) begin
@@ -436,7 +436,7 @@ module VX_tensor_octet #(
// Staging buffer for the A/B/C half-tiles that will later be assembled
// with the other half-tiles coming in on the input ports.
VX_tensor_reg #(
.N($bits(A_buffer) + $bits(B_buffer) + $bits(C_buffer))
.DATAW($bits(A_buffer) + $bits(B_buffer) + $bits(C_buffer))
) staging_abc (
.clk(clk),
.reset(reset),
@@ -464,7 +464,7 @@ module VX_tensor_octet #(
end
VX_tensor_reg #(
.N($bits(substeps))
.DATAW($bits(substeps))
) staging_substeps (
.clk(clk),
.reset(reset),
@@ -506,7 +506,7 @@ module VX_tensor_octet #(
VX_tensor_dpu #(
.ISW(ISW),
.OCTET(OCTET),
.ISSUE_QUEUE_DEPTH(4 /*@perf: arbtirary*/)
.OPERAND_BUFFER_DEPTH(4 /*@perf: arbtirary*/)
) dpu (
.clk(clk),
.reset(reset),