tensor: spurious assert, doc, remove unused param

This commit is contained in:
Hansung Kim
2024-07-27 20:53:56 -07:00
parent 4e0dcdadac
commit d4d18c2823
2 changed files with 18 additions and 18 deletions

View File

@@ -324,9 +324,6 @@ endmodule
module VX_tensor_octet #( module VX_tensor_octet #(
parameter ISW, parameter ISW,
parameter OCTET, parameter OCTET,
// RESULT_BUFFER_DEPTH = 2 gives good performance by absorbing commit
// backpressure (result_ready), although the value is arbitrary.
// RESULT_BUFFER_DEPTH = 0 eliminates result buffering.
parameter RESULT_BUFFER_DEPTH = 2 parameter RESULT_BUFFER_DEPTH = 2
) ( ) (
input clk, input clk,
@@ -385,8 +382,11 @@ module VX_tensor_octet #(
assign operands_ready = operands_ready_buf; assign operands_ready = operands_ready_buf;
typedef struct { typedef struct {
// single column of A
logic [3:0][31:0] A_half; logic [3:0][31:0] A_half;
// single row of B
logic [3:0][31:0] B_half; logic [3:0][31:0] B_half;
// interleaved elements of C
logic [7:0][31:0] C_half; logic [7:0][31:0] C_half;
} half_t; } half_t;
@@ -477,18 +477,20 @@ module VX_tensor_octet #(
wire hmma_ready; wire hmma_ready;
assign operands_ready_buf = hmma_ready; assign operands_ready_buf = hmma_ready;
// A is 4x2 fp32 matrix // all *_tiles below are row-major
// A is a 4x2 fp32 matrix
wire [3:0][1:0][31:0] A_tile = { wire [3:0][1:0][31:0] A_tile = {
{ halves_buf.A_half[3], A_buffer[operands_wid_buf][3] }, { halves_buf.A_half[3], A_buffer[operands_wid_buf][3] },
{ halves_buf.A_half[2], A_buffer[operands_wid_buf][2] }, { halves_buf.A_half[2], A_buffer[operands_wid_buf][2] },
{ halves_buf.A_half[1], A_buffer[operands_wid_buf][1] }, { halves_buf.A_half[1], A_buffer[operands_wid_buf][1] },
{ halves_buf.A_half[0], A_buffer[operands_wid_buf][0] } { halves_buf.A_half[0], A_buffer[operands_wid_buf][0] }
}; };
// B is 2x4 fp32 matrix // B is a 2x4 fp32 matrix
wire [1:0][3:0][31:0] B_tile = { wire [1:0][3:0][31:0] B_tile = {
halves_buf.B_half, B_buffer[operands_wid_buf] halves_buf.B_half,
B_buffer[operands_wid_buf]
}; };
// C is 4x4 fp32 matrix // C is a 4x4 fp32 matrix
logic [3:0][3:0][31:0] C_tile; logic [3:0][3:0][31:0] C_tile;
wire [3:0][3:0][31:0] D_tile; wire [3:0][3:0][31:0] D_tile;
wire [`NW_WIDTH-1:0] D_wid_dpu; wire [`NW_WIDTH-1:0] D_wid_dpu;
@@ -538,7 +540,10 @@ module VX_tensor_octet #(
// commit/writeback is complete. This decouples the irregular dpu // commit/writeback is complete. This decouples the irregular dpu
// output traffic from the regular, every-2-cycle commit traffic to // output traffic from the regular, every-2-cycle commit traffic to
// ensure the commit pipeline is used more efficiently. // ensure the commit pipeline is used more efficiently.
// FIXME: unnecessary? //
// @perf: RESULT_BUFFER_DEPTH = 2 gives good performance by
// completely dampening commit backpressure (result_ready).
// RESULT_BUFFER_DEPTH = 0 removes the fifo queue altogether.
VX_fifo_queue #( VX_fifo_queue #(
.DATAW ($bits(D_wid) + $bits(D_out)), .DATAW ($bits(D_wid) + $bits(D_out)),
.DEPTH (RESULT_BUFFER_DEPTH) // 2 works well .DEPTH (RESULT_BUFFER_DEPTH) // 2 works well
@@ -556,8 +561,8 @@ module VX_tensor_octet #(
`UNUSED_PIN(size) `UNUSED_PIN(size)
); );
// FIXME: overly strict; this firing doesn't mean a bug // for perf debug
`RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!")) // `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
end else begin end else begin
// XXX: this depends on the assumption that commit stage only asserts // XXX: this depends on the assumption that commit stage only asserts
// result_ready when result_valid is true // result_ready when result_valid is true

View File

@@ -60,12 +60,11 @@ module VX_tensor_dpu #(
wire empty; wire empty;
wire full; wire full;
// sync between operand buffer and wid buffer // sync operand buffer and wid buffer
assign ready_in = !full && !wid_full; assign ready_in = !full && !wid_full;
wire [1:0] threadgroup_valids_out; wire [1:0] threadgroup_valids_out;
wire [1:0] threadgroup_readys_in; wire [1:0] threadgroup_readys_in;
// sync operand queue and wid queue
wire threadgroup_valid_in = !empty; wire threadgroup_valid_in = !empty;
wire threadgroup_fire_in = threadgroup_valid_in && &(threadgroup_readys_in); wire threadgroup_fire_in = threadgroup_valid_in && &(threadgroup_readys_in);
@@ -98,11 +97,9 @@ module VX_tensor_dpu #(
); );
// Split A_tile and C_tile by rows (0-1, 2-3) and parallelize in two // Split A_tile and C_tile by rows (0-1, 2-3) and parallelize in two
// threadgroups // threadgroups; B_tile is shared across the two threadgroups. See Figure
// // 13 of the paper
// B_tile is shared across the two threadgroups; see Figure 13
VX_tensor_threadgroup #( VX_tensor_threadgroup #(
.OPERAND_BUFFER_DEPTH(OPERAND_BUFFER_DEPTH)
) threadgroup_0 ( ) threadgroup_0 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
@@ -116,7 +113,6 @@ module VX_tensor_dpu #(
.D_frag (D_tile[1:0]) .D_frag (D_tile[1:0])
); );
VX_tensor_threadgroup #( VX_tensor_threadgroup #(
.OPERAND_BUFFER_DEPTH(OPERAND_BUFFER_DEPTH)
) threadgroup_1 ( ) threadgroup_1 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
@@ -167,7 +163,6 @@ endmodule
// does (m,n,k) = (2,4,2) matmul compute over 2 cycles. // does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
// see Figure 10(b) of the paper. // see Figure 10(b) of the paper.
module VX_tensor_threadgroup #( module VX_tensor_threadgroup #(
parameter OPERAND_BUFFER_DEPTH
) ( ) (
input clk, input clk,
input reset, input reset,