tensor: Document configuring queue depths
This commit is contained in:
@@ -318,8 +318,6 @@ module VX_tensor_octet #(
|
|||||||
output result_valid,
|
output result_valid,
|
||||||
input result_ready
|
input result_ready
|
||||||
);
|
);
|
||||||
localparam ISSUE_QUEUE_DEPTH = 4;
|
|
||||||
|
|
||||||
// 512 bits/octet * 4 octets per warp
|
// 512 bits/octet * 4 octets per warp
|
||||||
logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n;
|
logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n;
|
||||||
logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n;
|
logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n;
|
||||||
@@ -471,7 +469,7 @@ module VX_tensor_octet #(
|
|||||||
VX_tensor_dpu #(
|
VX_tensor_dpu #(
|
||||||
.ISW(ISW),
|
.ISW(ISW),
|
||||||
.OCTET(OCTET),
|
.OCTET(OCTET),
|
||||||
.ISSUE_QUEUE_DEPTH(4)
|
.ISSUE_QUEUE_DEPTH(4 /*@perf: arbitrary*/)
|
||||||
) dpu (
|
) dpu (
|
||||||
.clk(clk),
|
.clk(clk),
|
||||||
.reset(reset),
|
.reset(reset),
|
||||||
@@ -503,10 +501,9 @@ module VX_tensor_octet #(
|
|||||||
// regular, every-2-cycle commit traffic to ensure the commit pipeline is
|
// regular, every-2-cycle commit traffic to ensure the commit pipeline is
|
||||||
// used more efficiently.
|
// used more efficiently.
|
||||||
// FIXME: unnecessary?
|
// FIXME: unnecessary?
|
||||||
// TODO: This is probably oversized.
|
|
||||||
VX_fifo_queue #(
|
VX_fifo_queue #(
|
||||||
.DATAW ($bits(D_wid) + $bits(D_out)),
|
.DATAW ($bits(D_wid) + $bits(D_out)),
|
||||||
.DEPTH (2 /*`LATENCY_HMMA*/)
|
.DEPTH (2 /* arbitrary */)
|
||||||
) output_buffer (
|
) output_buffer (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
|
|||||||
@@ -4,6 +4,9 @@
|
|||||||
module VX_tensor_dpu #(
|
module VX_tensor_dpu #(
|
||||||
parameter ISW,
|
parameter ISW,
|
||||||
parameter OCTET,
|
parameter OCTET,
|
||||||
|
// @perf: has big impact on throughput. A rule of thumb is to set it to
|
||||||
|
// the pipeline length of FEDPs in order to make sure there are enough
|
||||||
|
// entries to fully saturate the pipeline, but this is only a rough estimate
|
||||||
parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA
|
parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA
|
||||||
) (
|
) (
|
||||||
input clk,
|
input clk,
|
||||||
@@ -105,7 +108,9 @@ module VX_tensor_dpu #(
|
|||||||
// need to pass along warp id's to do multithreading
|
// need to pass along warp id's to do multithreading
|
||||||
VX_fifo_queue #(
|
VX_fifo_queue #(
|
||||||
.DATAW ($bits(wid)),
|
.DATAW ($bits(wid)),
|
||||||
.DEPTH (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH)
|
// @perf: seems to require a greater depth than the FEDP issue queues to
|
||||||
|
// avoid stalls.
|
||||||
|
.DEPTH (2 * ISSUE_QUEUE_DEPTH)
|
||||||
) wid_queue (
|
) wid_queue (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
@@ -167,7 +172,7 @@ module VX_tensor_threadgroup #(
|
|||||||
// threadgroups, so we need only 1 queue per octet for B
|
// threadgroups, so we need only 1 queue per octet for B
|
||||||
VX_fifo_queue #(
|
VX_fifo_queue #(
|
||||||
.DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)),
|
.DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)),
|
||||||
.DEPTH (ISSUE_QUEUE_DEPTH)
|
.DEPTH (ISSUE_QUEUE_DEPTH)
|
||||||
) input_buffer (
|
) input_buffer (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
|
|||||||
Reference in New Issue
Block a user