tensor: spurious assert, doc, remove unused param
This commit is contained in:
@@ -324,9 +324,6 @@ endmodule
|
|||||||
module VX_tensor_octet #(
|
module VX_tensor_octet #(
|
||||||
parameter ISW,
|
parameter ISW,
|
||||||
parameter OCTET,
|
parameter OCTET,
|
||||||
// RESULT_BUFFER_DEPTH = 2 gives good performance by absorbing commit
|
|
||||||
// backpressure (result_ready), although the value is arbitrary.
|
|
||||||
// RESULT_BUFFER_DEPTH = 0 eliminates result buffering.
|
|
||||||
parameter RESULT_BUFFER_DEPTH = 2
|
parameter RESULT_BUFFER_DEPTH = 2
|
||||||
) (
|
) (
|
||||||
input clk,
|
input clk,
|
||||||
@@ -385,8 +382,11 @@ module VX_tensor_octet #(
|
|||||||
assign operands_ready = operands_ready_buf;
|
assign operands_ready = operands_ready_buf;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
// single column of A
|
||||||
logic [3:0][31:0] A_half;
|
logic [3:0][31:0] A_half;
|
||||||
|
// single row of B
|
||||||
logic [3:0][31:0] B_half;
|
logic [3:0][31:0] B_half;
|
||||||
|
// interleaved elements of C
|
||||||
logic [7:0][31:0] C_half;
|
logic [7:0][31:0] C_half;
|
||||||
} half_t;
|
} half_t;
|
||||||
|
|
||||||
@@ -477,18 +477,20 @@ module VX_tensor_octet #(
|
|||||||
wire hmma_ready;
|
wire hmma_ready;
|
||||||
assign operands_ready_buf = hmma_ready;
|
assign operands_ready_buf = hmma_ready;
|
||||||
|
|
||||||
// A is 4x2 fp32 matrix
|
// all *_tiles below are row-major
|
||||||
|
// A is a 4x2 fp32 matrix
|
||||||
wire [3:0][1:0][31:0] A_tile = {
|
wire [3:0][1:0][31:0] A_tile = {
|
||||||
{ halves_buf.A_half[3], A_buffer[operands_wid_buf][3] },
|
{ halves_buf.A_half[3], A_buffer[operands_wid_buf][3] },
|
||||||
{ halves_buf.A_half[2], A_buffer[operands_wid_buf][2] },
|
{ halves_buf.A_half[2], A_buffer[operands_wid_buf][2] },
|
||||||
{ halves_buf.A_half[1], A_buffer[operands_wid_buf][1] },
|
{ halves_buf.A_half[1], A_buffer[operands_wid_buf][1] },
|
||||||
{ halves_buf.A_half[0], A_buffer[operands_wid_buf][0] }
|
{ halves_buf.A_half[0], A_buffer[operands_wid_buf][0] }
|
||||||
};
|
};
|
||||||
// B is 2x4 fp32 matrix
|
// B is a 2x4 fp32 matrix
|
||||||
wire [1:0][3:0][31:0] B_tile = {
|
wire [1:0][3:0][31:0] B_tile = {
|
||||||
halves_buf.B_half, B_buffer[operands_wid_buf]
|
halves_buf.B_half,
|
||||||
|
B_buffer[operands_wid_buf]
|
||||||
};
|
};
|
||||||
// C is 4x4 fp32 matrix
|
// C is a 4x4 fp32 matrix
|
||||||
logic [3:0][3:0][31:0] C_tile;
|
logic [3:0][3:0][31:0] C_tile;
|
||||||
wire [3:0][3:0][31:0] D_tile;
|
wire [3:0][3:0][31:0] D_tile;
|
||||||
wire [`NW_WIDTH-1:0] D_wid_dpu;
|
wire [`NW_WIDTH-1:0] D_wid_dpu;
|
||||||
@@ -538,7 +540,10 @@ module VX_tensor_octet #(
|
|||||||
// commit/writeback is complete. This decouples the irregular dpu
|
// commit/writeback is complete. This decouples the irregular dpu
|
||||||
// output traffic from the regular, every-2-cycle commit traffic to
|
// output traffic from the regular, every-2-cycle commit traffic to
|
||||||
// ensure the commit pipeline is used more efficiently.
|
// ensure the commit pipeline is used more efficiently.
|
||||||
// FIXME: unnecessary?
|
//
|
||||||
|
// @perf: RESULT_BUFFER_DEPTH == 2 gives good performance by
|
||||||
|
// completely dampening commit backpressure (result_ready).
|
||||||
|
// RESULT_BUFFER_DEPTH == 0 removes the fifo queue altogether.
|
||||||
VX_fifo_queue #(
|
VX_fifo_queue #(
|
||||||
.DATAW ($bits(D_wid) + $bits(D_out)),
|
.DATAW ($bits(D_wid) + $bits(D_out)),
|
||||||
.DEPTH (RESULT_BUFFER_DEPTH) // 2 works good
|
.DEPTH (RESULT_BUFFER_DEPTH) // 2 works good
|
||||||
@@ -556,8 +561,8 @@ module VX_tensor_octet #(
|
|||||||
`UNUSED_PIN(size)
|
`UNUSED_PIN(size)
|
||||||
);
|
);
|
||||||
|
|
||||||
// FIXME: overly strict; this firing doesn't mean a bug
|
// for perf debug
|
||||||
`RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
|
// `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
|
||||||
end else begin
|
end else begin
|
||||||
// XXX: this depends on the assumption that commit stage only asserts
|
// XXX: this depends on the assumption that commit stage only asserts
|
||||||
// result_ready when result_valid is true
|
// result_ready when result_valid is true
|
||||||
|
|||||||
@@ -60,12 +60,11 @@ module VX_tensor_dpu #(
|
|||||||
|
|
||||||
wire empty;
|
wire empty;
|
||||||
wire full;
|
wire full;
|
||||||
// sync between operand buffer and wid buffer
|
// sync operand buffer and wid buffer
|
||||||
assign ready_in = !full && !wid_full;
|
assign ready_in = !full && !wid_full;
|
||||||
|
|
||||||
wire [1:0] threadgroup_valids_out;
|
wire [1:0] threadgroup_valids_out;
|
||||||
wire [1:0] threadgroup_readys_in;
|
wire [1:0] threadgroup_readys_in;
|
||||||
// sync operand queue and wid queue
|
|
||||||
wire threadgroup_valid_in = !empty;
|
wire threadgroup_valid_in = !empty;
|
||||||
wire threadgroup_fire_in = threadgroup_valid_in && &(threadgroup_readys_in);
|
wire threadgroup_fire_in = threadgroup_valid_in && &(threadgroup_readys_in);
|
||||||
|
|
||||||
@@ -98,11 +97,9 @@ module VX_tensor_dpu #(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Split A_tile and C_tile by rows (0-1, 2-3) and parallelize in two
|
// Split A_tile and C_tile by rows (0-1, 2-3) and parallelize in two
|
||||||
// threadgroups
|
// threadgroups; B_tile is shared across the two threadgroups. See Figure
|
||||||
//
|
// 13 of the paper.
|
||||||
// B_tile is shared across the two threadgroups; see Figure 13
|
|
||||||
VX_tensor_threadgroup #(
|
VX_tensor_threadgroup #(
|
||||||
.OPERAND_BUFFER_DEPTH(OPERAND_BUFFER_DEPTH)
|
|
||||||
) threadgroup_0 (
|
) threadgroup_0 (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
@@ -116,7 +113,6 @@ module VX_tensor_dpu #(
|
|||||||
.D_frag (D_tile[1:0])
|
.D_frag (D_tile[1:0])
|
||||||
);
|
);
|
||||||
VX_tensor_threadgroup #(
|
VX_tensor_threadgroup #(
|
||||||
.OPERAND_BUFFER_DEPTH(OPERAND_BUFFER_DEPTH)
|
|
||||||
) threadgroup_1 (
|
) threadgroup_1 (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
@@ -167,7 +163,6 @@ endmodule
|
|||||||
// does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
|
// does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
|
||||||
// see Figure 10(b) of the paper.
|
// see Figure 10(b) of the paper.
|
||||||
module VX_tensor_threadgroup #(
|
module VX_tensor_threadgroup #(
|
||||||
parameter OPERAND_BUFFER_DEPTH
|
|
||||||
) (
|
) (
|
||||||
input clk,
|
input clk,
|
||||||
input reset,
|
input reset,
|
||||||
|
|||||||
Reference in New Issue
Block a user