lkg build rollout with 16cores optimization on arria10

This commit is contained in:
Blaise Tine
2021-01-24 16:49:22 -08:00
parent 74a687e395
commit 8775f63ec4
55 changed files with 1021 additions and 868 deletions

View File

@@ -91,96 +91,157 @@ module VX_fifo_queue #(
if (used_r == ADDRW'(ALM_EMPTY+1))
alm_empty_r <= 1;
end
used_r <= used_r + ADDRW'($signed(2'(push) - 2'(pop)));
if (SIZE > 2) begin
used_r <= used_r + ADDRW'($signed(2'(push) - 2'(pop)));
end else begin // (SIZE == 2);
`IGNORE_WARNINGS_BEGIN
used_r <= used_r ^ (push ^ pop);
`IGNORE_WARNINGS_END
end
end
end
if (0 == BUFFERED) begin
if (SIZE == 2) begin
if (0 == BUFFERED) begin
if (FASTRAM) begin
`USE_FAST_BRAM reg [DATAW-1:0] shift_reg [SIZE];
always @(posedge clk) begin
if (push) begin
shift_reg[1] <= shift_reg[0];
shift_reg[0] <= data_in;
end
end
assign data_out = shift_reg[~used_r[0]];
reg [ADDRW-1:0] rd_ptr_r;
reg [ADDRW-1:0] wr_ptr_r;
always @(posedge clk) begin
if (reset) begin
rd_ptr_r <= 0;
wr_ptr_r <= 0;
end else begin
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
rd_ptr_r <= rd_ptr_r + ADDRW'(pop);
end
reg [DATAW-1:0] shift_reg [SIZE];
always @(posedge clk) begin
if (push) begin
shift_reg[1] <= shift_reg[0];
shift_reg[0] <= data_in;
end
end
assign data_out = shift_reg[~used_r[0]];
end
end else begin
reg [DATAW-1:0] data_out_r;
reg [DATAW-1:0] buffer;
always @(posedge clk) begin
if (push) begin
buffer <= data_in;
end
if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin
data_out_r <= data_in;
end else if (pop) begin
data_out_r <= buffer;
end
end
assign data_out = data_out_r;
end
VX_dp_ram #(
.DATAW (DATAW),
.SIZE (SIZE),
.BUFFERED (0),
.RWCHECK (1),
.FASTRAM (FASTRAM)
) dp_ram (
.clk(clk),
.waddr(wr_ptr_r),
.raddr(rd_ptr_r),
.wren(push),
.byteen(1'b1),
.rden(1'b1),
.din(data_in),
.dout(data_out)
);
end else begin
wire [DATAW-1:0] dout;
reg [DATAW-1:0] dout_r;
reg [ADDRW-1:0] wr_ptr_r;
reg [ADDRW-1:0] rd_ptr_r;
reg [ADDRW-1:0] rd_ptr_n_r;
if (0 == BUFFERED) begin
always @(posedge clk) begin
if (reset) begin
wr_ptr_r <= 0;
rd_ptr_r <= 0;
rd_ptr_n_r <= 1;
end else begin
if (push) begin
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
end
if (pop) begin
rd_ptr_r <= rd_ptr_n_r;
if (SIZE > 2) begin
rd_ptr_n_r <= rd_ptr_r + ADDRW'(2);
end else begin // (SIZE == 2);
rd_ptr_n_r <= ~rd_ptr_n_r;
reg [ADDRW-1:0] rd_ptr_r;
reg [ADDRW-1:0] wr_ptr_r;
always @(posedge clk) begin
if (reset) begin
rd_ptr_r <= 0;
wr_ptr_r <= 0;
end else begin
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
rd_ptr_r <= rd_ptr_r + ADDRW'(pop);
end
end
VX_dp_ram #(
.DATAW (DATAW),
.SIZE (SIZE),
.BUFFERED (0),
.RWCHECK (1),
.FASTRAM (FASTRAM)
) dp_ram (
.clk(clk),
.waddr(wr_ptr_r),
.raddr(rd_ptr_r),
.wren(push),
.byteen(1'b1),
.rden(1'b1),
.din(data_in),
.dout(data_out)
);
end else begin
wire [DATAW-1:0] dout;
reg [DATAW-1:0] dout_r;
reg [ADDRW-1:0] wr_ptr_r;
reg [ADDRW-1:0] rd_ptr_r;
reg [ADDRW-1:0] rd_ptr_n_r;
always @(posedge clk) begin
if (reset) begin
wr_ptr_r <= 0;
rd_ptr_r <= 0;
rd_ptr_n_r <= 1;
end else begin
if (push) begin
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
end
if (pop) begin
rd_ptr_r <= rd_ptr_n_r;
if (SIZE > 2) begin
rd_ptr_n_r <= rd_ptr_r + ADDRW'(2);
end else begin // (SIZE == 2);
rd_ptr_n_r <= ~rd_ptr_n_r;
end
end
end
end
end
VX_dp_ram #(
.DATAW (DATAW),
.SIZE (SIZE),
.BUFFERED (0),
.RWCHECK (1),
.FASTRAM (FASTRAM)
) dp_ram (
.clk(clk),
.waddr(wr_ptr_r),
.raddr(rd_ptr_n_r),
.wren(push),
.byteen(1'b1),
.rden(1'b1),
.din(data_in),
.dout(dout)
);
VX_dp_ram #(
.DATAW (DATAW),
.SIZE (SIZE),
.BUFFERED (0),
.RWCHECK (1),
.FASTRAM (FASTRAM)
) dp_ram (
.clk(clk),
.waddr(wr_ptr_r),
.raddr(rd_ptr_n_r),
.wren(push),
.byteen(1'b1),
.rden(1'b1),
.din(data_in),
.dout(dout)
);
always @(posedge clk) begin
if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin
dout_r <= data_in;
end else if (pop) begin
dout_r <= dout;
always @(posedge clk) begin
if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin
dout_r <= data_in;
end else if (pop) begin
dout_r <= dout;
end
end
end
assign data_out = dout_r;
assign data_out = dout_r;
end
end
assign empty = empty_r;

View File

@@ -25,26 +25,16 @@ module VX_fixed_arbiter #(
assign grant_valid = requests[0];
end else begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;
always @(*) begin
grant_index_r = 'x;
grant_onehot_r = 'x;
for (integer i = 0; i < NUM_REQS; ++i) begin
if (requests[i]) begin
grant_index_r = LOG_NUM_REQS'(i);
grant_onehot_r = NUM_REQS'(0);
grant_onehot_r[i] = 1;
break;
end
end
end
VX_priority_encoder #(
.N (NUM_REQS)
) tid_select (
.data_in (requests),
.index (grant_index),
.onehot (grant_onehot),
.valid_out (grant_valid)
);
assign grant_index = grant_index_r;
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
end
endmodule

View File

@@ -28,11 +28,12 @@ module VX_index_buffer #(
wire [ADDRW-1:0] free_index;
VX_priority_encoder #(
.DATAW (SIZE)
.N (SIZE)
) free_slots_encoder (
.data_in (free_slots_n),
.data_out (free_index),
.valid_out (free_valid)
.data_in (free_slots_n),
.index (free_index),
`UNUSED_PIN (onehot),
.valid_out (free_valid)
);
always @(*) begin

View File

@@ -72,11 +72,11 @@ module VX_matrix_arbiter #(
end
VX_onehot_encoder #(
.NUM_REQS(NUM_REQS)
.N (NUM_REQS)
) encoder (
.onehot (grant_unqual),
`UNUSED_PIN (valid),
.binary (grant_index)
.data_in (grant_unqual),
.data_out (grant_index),
`UNUSED_PIN (valid)
);
assign grant_valid = (| requests);

View File

@@ -0,0 +1,73 @@
`include "VX_platform.vh"
// Fast encoder using parallel prefix computation
// Adapted from BaseJump STL: http://bjump.org/index.html
// One-hot to binary encoder.
//
// Parameters:
//   N       - width of the one-hot input vector.
//   REVERSE - 0: data_out is the position i of the set bit.
//             1: data_out is (N-1-i), i.e. positions are counted from the MSB.
//   FAST    - 1: parallel-prefix tree implementation.
//             0: behavioral loop implementation.
// Ports:
//   data_in  - input vector, expected to have at most one bit set (the FAST
//              tree ORs the partial addresses of both halves, so several set
//              bits produce a merged, meaningless index).
//   data_out - encoded index; 'x in the loop implementation when no bit is set.
//   valid    - OR-reduction of data_in.
module VX_onehot_encoder #(
    parameter N       = 1,
    parameter REVERSE = 0,
    parameter FAST    = 1
) (
    input wire [N-1:0]           data_in,
    output wire [`LOG2UP(N)-1:0] data_out,
    output wire                  valid
);
    if (N == 1) begin

        // Degenerate single-input case: the tree below has zero levels for
        // N == 1, which would leave data_out driven only by the 'x base case.
        // The only possible index is 0.
        // NOTE(review): assumes `LOG2UP(1) yields a width of at least 1 - confirm in VX_platform.vh.
        assign data_out = 0;
        assign valid    = data_in;

    end else if (FAST) begin
    `IGNORE_WARNINGS_BEGIN
        localparam levels_lp        = $clog2(N);
        localparam aligned_width_lp = 1 << $clog2(N);

        // addr[l] / v[l] hold, per segment of level l, the partial index and
        // the "any bit set" flag of that segment.
        wire [levels_lp:0][aligned_width_lp-1:0] addr;
        wire [levels_lp:0][aligned_width_lp-1:0] v;

        // base case, also handle padding for non-power of two inputs
        assign v[0]    = REVERSE ? (data_in << (aligned_width_lp - N)) : ((aligned_width_lp)'(data_in));
        assign addr[0] = 'x;

        for (genvar level = 1; level < levels_lp+1; level=level+1) begin
            localparam segments_lp      = 2**(levels_lp-level);
            localparam segment_slot_lp  = aligned_width_lp/segments_lp;
            localparam segment_width_lp = level; // how many bits are needed at each level

            for (genvar segment = 0; segment < segments_lp; segment=segment+1) begin
                // valid flags of the upper (vs[1]) and lower (vs[0]) half of this segment
                wire [1:0] vs = {
                    v[level-1][segment*segment_slot_lp+(segment_slot_lp >> 1)],
                    v[level-1][segment*segment_slot_lp]
                };

                assign v[level][segment*segment_slot_lp] = (| vs);

                if (level == 1) begin
                    assign addr[level][(segment*segment_slot_lp)+:segment_width_lp] = vs[!REVERSE];
                end else begin
                    // new MSB selects the half; lower bits come from whichever half is active
                    assign addr[level][(segment*segment_slot_lp)+:segment_width_lp] = {
                        vs[!REVERSE],
                        addr[level-1][segment*segment_slot_lp+:segment_width_lp-1] | addr[level-1][segment*segment_slot_lp+(segment_slot_lp >> 1)+:segment_width_lp-1]
                    };
                end
            end
        end

        assign data_out = addr[levels_lp][`LOG2UP(N)-1:0];
        assign valid    = v[levels_lp][0];
    `IGNORE_WARNINGS_END
    end else begin

        reg [`LOG2UP(N)-1:0] data_out_r;

        always @(*) begin
            data_out_r = 'x;
            for (integer i = 0; i < N; i++) begin
                if (data_in[i]) begin
                    // honor REVERSE the same way the FAST tree does (index N-1-i)
                    data_out_r = REVERSE ? `LOG2UP(N)'(N-1-i) : `LOG2UP(N)'(i);
                end
            end
        end

        assign data_out = data_out_r;
        assign valid    = (| data_in);

    end
endmodule

View File

@@ -1,28 +0,0 @@
`include "VX_platform.vh"
// One-hot to binary encoder (behavioral loop version).
//   onehot - input vector of N bits.
//   binary - index of the set bit; when several bits are set the highest
//            index wins, and when none is set the value is 'x.
//   valid  - 1 when at least one input bit is set.
module VX_onehot_encoder #(
    parameter N = 6
) (
    input wire [N-1:0]           onehot,
    output wire [`LOG2UP(N)-1:0] binary,
    output wire                  valid
);
    reg [`LOG2UP(N)-1:0] enc_idx;
    reg                  any_hit;

    always @(*) begin
        // defaults: nothing found yet, index is don't-care
        any_hit = 1'b0;
        enc_idx = 'x;
        for (integer k = 0; k < N; k = k + 1) begin
            if (onehot[k]) begin
                any_hit = 1'b1;
                enc_idx = `LOG2UP(N)'(k);
            end
        end
    end

    assign valid  = any_hit;
    assign binary = enc_idx;
endmodule

View File

@@ -1,26 +1,73 @@
`include "VX_platform.vh"
module VX_priority_encoder #(
parameter DATAW = 1,
parameter LDATAW = `LOG2UP(DATAW)
parameter N = 1,
parameter REVERSE = 0,
parameter FAST = 1,
parameter LN = `LOG2UP(N)
) (
input wire [DATAW-1:0] data_in,
output wire [LDATAW-1:0] data_out,
output wire valid_out
);
reg [LDATAW-1:0] data_out_r;
input wire [N-1:0] data_in,
output wire [N-1:0] onehot,
output wire [LN-1:0] index,
output wire valid_out
);
always @(*) begin
data_out_r = 'x;
for (integer i = 0; i < DATAW; i++) begin
if (data_in[i]) begin
data_out_r = LDATAW'(i);
break;
if (N == 1) begin
assign onehot = data_in;
assign index = 0;
assign valid_out = data_in;
end else if (FAST) begin
wire [N-1:0] scan_lo;
VX_scan #(
.N (N),
.OP (2),
.REVERSE (REVERSE)
) scan (
.data_in (data_in),
.data_out (scan_lo)
);
if (REVERSE) begin
assign onehot = scan_lo & {1'b1, (~scan_lo[N-1:1])};
assign valid_out = scan_lo[0];
end else begin
assign onehot = scan_lo & {(~scan_lo[N-2:0]), 1'b1};
assign valid_out = scan_lo[N-1];
end
VX_onehot_encoder #(
.N (N)
) b (
.data_in (onehot),
.data_out (index),
`UNUSED_PIN (valid)
);
end else begin
reg [N-1:0] onehot_r;
reg [LN-1:0] index_r;
always @(*) begin
index_r = 'x;
onehot_r = 0;
for (integer i = 0; i < N; i++) begin
if (data_in[i]) begin
index_r = LN'(i);
onehot_r[i] = 1'b1;
break;
end
end
end
end
assign data_out = data_out_r;
assign valid_out = (| data_in);
assign index = index_r;
assign onehot = onehot_r;
assign valid_out = (| data_in);
end
endmodule

View File

@@ -2,26 +2,45 @@
module VX_reset_relay #(
parameter NUM_NODES = 1,
parameter PASSTHRU = 0
parameter DEPTH = 1,
parameter ASYNC = 0
) (
input wire clk,
input wire reset,
output wire [NUM_NODES-1:0] reset_out
);
if (PASSTHRU == 0) begin
reg [NUM_NODES-1:0] reset_r;
always @(posedge clk) begin
for (integer i = 0; i < NUM_NODES; ++i) begin
reset_r[i] <= reset;
if (DEPTH > 1) begin
`DISABLE_BRAM reg [NUM_NODES-1:0] reset_r [DEPTH-1:0];
if (ASYNC) begin
always @(posedge clk or posedge reset) begin
for (integer i = DEPTH-1; i > 0; --i)
reset_r[i] <= reset_r[i-1];
reset_r[0] <= {NUM_NODES{reset}};
end
end else begin
always @(posedge clk) begin
for (integer i = DEPTH-1; i > 0; --i)
reset_r[i] <= reset_r[i-1];
reset_r[0] <= {NUM_NODES{reset}};
end
end
assign reset_out = reset_r[DEPTH-1];
end else if (DEPTH == 1) begin
reg [NUM_NODES-1:0] reset_r;
if (ASYNC) begin
always @(posedge clk or posedge reset) begin
reset_r <= {NUM_NODES{reset}};
end
end else begin
always @(posedge clk) begin
reset_r <= {NUM_NODES{reset}};
end
end
assign reset_out = reset_r;
end else begin
`UNUSED_VAR (clk)
for (genvar i = 0; i < NUM_NODES; ++i) begin
assign reset_out[i] = reset;
end
assign reset_out = {NUM_NODES{reset}};
end
endmodule

View File

@@ -55,7 +55,8 @@ module VX_rr_arbiter #(
assign grant_index = grant_table[state];
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
assign grant_valid = (| requests);
end
endmodule

60
hw/rtl/libs/VX_scan.v Normal file
View File

@@ -0,0 +1,60 @@
`include "VX_platform.vh"
// Fast parallel scan using Kogge-Stone style prefix tree with configurable operator
// Adapted from BaseJump STL: http://bjump.org/index.html
// Inclusive prefix scan over the bits of data_in.
//
// Parameters:
//   N       - width of the input/output vectors.
//   OP      - bitwise operator applied by the scan: 0: XOR, 1: AND, 2: OR.
//   REVERSE - 0: scan runs from bit 0 upward, data_out[i] = OP(data_in[0..i]);
//             1: scan runs from bit N-1 downward, data_out[i] = OP(data_in[i..N-1]).
// Ports:
//   data_in  - input vector.
//   data_out - scan result, one bit per input position.
module VX_scan #(
    parameter N       = 1,
    parameter OP      = 0,  // 0: XOR, 1: AND, 2: OR
    parameter REVERSE = 0   // 0: LO->HI, 1: HI->LO
) (
    input  wire [N-1:0] data_in,
    output wire [N-1:0] data_out
);
`IGNORE_WARNINGS_BEGIN
    // t[0] is the (possibly bit-reversed) input, t[k] the result after k stages
    wire [$clog2(N):0][N-1:0] t;

    // the stages below always accumulate from the top bit downward,
    // so the LO->HI scan is done on a bit-reversed copy of the input
    if (REVERSE) begin
        assign t[0] = data_in;
    end else begin
        assign t[0] = {<<{data_in}};
    end

    // optimize for the common case of small and-scans
    if ((N == 2) && (OP == 1)) begin
        assign t[$clog2(N)] = {t[0][1], &t[0][1:0]};
    end else if ((N == 3) && (OP == 1)) begin
        assign t[$clog2(N)] = {t[0][2], &t[0][2:1], &t[0][2:0]};
    end else if ((N == 4) && (OP == 1)) begin
        assign t[$clog2(N)] = {t[0][3], &t[0][3:2], &t[0][3:1], &t[0][3:0]};
    end else begin
        // general case
        // Identity element shifted in from the top: all ones for AND, zeros
        // for XOR/OR. Driven exactly once here instead of being re-assigned
        // in every loop iteration, which created multiple drivers on one net.
        wire [N-1:0] fill = (OP == 1) ? {N{1'b1}} : {N{1'b0}};

        for (genvar i = 0; i < $clog2(N); i++) begin
            // stage i combines each bit with the bit 2^i positions above it
            wire [N-1:0] shifted = N'({fill, t[i]} >> (1<<i));
            if (OP == 0) begin
                assign t[i+1] = t[i] ^ shifted;
            end else if (OP == 1) begin
                assign t[i+1] = t[i] & shifted;
            end else if (OP == 2) begin
                assign t[i+1] = t[i] | shifted;
            end
        end
    end

    // undo the input bit reversal for the LO->HI scan
    if (REVERSE) begin
        assign data_out = t[$clog2(N)];
    end else begin
        for (genvar i = 0; i < N; i++) begin
            assign data_out[i] = t[$clog2(N)][N-1-i];
        end
    end
`IGNORE_WARNINGS_END
endmodule

View File

@@ -12,7 +12,7 @@ module VX_shift_register_nr #(
input wire [DATAW-1:0] data_in,
output wire [(NTAPS*DATAW)-1:0] data_out
);
reg [DATAW-1:0] entries [DEPTH-1:0];
`USE_FAST_BRAM reg [DATAW-1:0] entries [DEPTH-1:0];
always @(posedge clk) begin
if (enable) begin
@@ -23,7 +23,7 @@ module VX_shift_register_nr #(
end
for (genvar i = 0; i < NTAPS; ++i) begin
assign data_out [i*DATAW+:DATAW] = entries [ TAPS[i*DEPTHW+:DEPTHW] ];
assign data_out [i*DATAW+:DATAW] = entries [TAPS[i*DEPTHW+:DEPTHW]];
end
endmodule
@@ -41,7 +41,7 @@ module VX_shift_register_wr #(
input wire [DATAW-1:0] data_in,
output wire [(NTAPS*DATAW)-1:0] data_out
);
reg [DEPTH-1:0][DATAW-1:0] entries;
`USE_FAST_BRAM reg [DEPTH-1:0][DATAW-1:0] entries;
if (1 == DEPTH) begin
@@ -69,7 +69,7 @@ module VX_shift_register_wr #(
end
for (genvar i = 0; i < NTAPS; ++i) begin
assign data_out [i*DATAW+:DATAW] = entries [ TAPS[i*DEPTHW+:DEPTHW] ];
assign data_out [i*DATAW+:DATAW] = entries [TAPS[i*DEPTHW+:DEPTHW]];
end
endmodule

View File

@@ -3,7 +3,9 @@
module VX_skid_buffer #(
parameter DATAW = 1,
parameter PASSTHRU = 0,
parameter NOBACKPRESSURE = 0
parameter NOBACKPRESSURE = 0,
parameter BUFFERED = 0,
parameter FASTRAM = 1
) (
input wire clk,
input wire reset,
@@ -49,44 +51,76 @@ module VX_skid_buffer #(
end else begin
reg [DATAW-1:0] data_out_r;
reg [DATAW-1:0] buffer;
reg valid_out_r;
reg use_buffer;
wire push = valid_in && ready_in;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
use_buffer <= 0;
end else begin
if (ready_out) begin
use_buffer <= 0;
if (BUFFERED) begin
reg [DATAW-1:0] data_out_r;
reg [DATAW-1:0] buffer;
reg valid_out_r;
reg use_buffer;
wire push = valid_in && ready_in;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
use_buffer <= 0;
end else begin
if (ready_out) begin
use_buffer <= 0;
end
if (push && valid_out_r && !ready_out) begin
assert(!use_buffer);
use_buffer <= 1;
end
if (!valid_out_r || ready_out) begin
valid_out_r <= valid_in || use_buffer;
end
end
if (push && valid_out_r && !ready_out) begin
assert(!use_buffer);
use_buffer <= 1;
end
always @(posedge clk) begin
if (push) begin
buffer <= data_in;
end
if (!valid_out_r || ready_out) begin
valid_out_r <= valid_in || use_buffer;
data_out_r <= use_buffer ? buffer : data_in;
end
end
assign ready_in = !use_buffer;
assign valid_out = valid_out_r;
assign data_out = data_out_r;
end else begin
wire q_push = valid_in && ready_in;
wire q_pop = valid_out && ready_out;
wire q_empty, q_full;
VX_fifo_queue #(
.DATAW (DATAW),
.SIZE (2),
.BUFFERED (BUFFERED),
.FASTRAM (FASTRAM)
) fifo (
.clk (clk),
.reset (reset),
.push (q_push),
.pop (q_pop),
.data_in (data_in),
.data_out (data_out),
.empty (q_empty),
.alm_full (q_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign ready_in = !q_full;
assign valid_out = !q_empty;
end
always @(posedge clk) begin
if (push) begin
buffer <= data_in;
end
if (!valid_out_r || ready_out) begin
data_out_r <= use_buffer ? buffer : data_in;
end
end
assign ready_in = !use_buffer;
assign valid_out = valid_out_r;
assign data_out = data_out_r;
end
endmodule

View File

@@ -92,7 +92,8 @@ module VX_stream_arbiter #(
VX_skid_buffer #(
.DATAW (DATAW),
.PASSTHRU (!BUFFERED)
.PASSTHRU (!BUFFERED),
.BUFFERED (1)
) out_buffer (
.clk (clk),
.reset (reset),

View File

@@ -40,7 +40,8 @@ module VX_stream_demux #(
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (DATAW),
.PASSTHRU (!BUFFERED)
.PASSTHRU (!BUFFERED),
.BUFFERED (1)
) out_buffer (
.clk (clk),
.reset (reset),