performance refactoring - rebalanced stream buffers accross the device to enforce output buffering rule at compoments boudaries, finally resolved block ram R/W collusion discrepencies,

This commit is contained in:
Blaise Tine
2020-12-19 02:45:06 -08:00
parent 29cd2f5dff
commit 4bbd7bf408
76 changed files with 1313 additions and 1098 deletions

View File

@@ -1,9 +1,10 @@
`include "VX_platform.vh"
module VX_cam_buffer #(
parameter DATAW = 1,
parameter SIZE = 1,
parameter ADDRW = `LOG2UP(SIZE)
parameter DATAW = 1,
parameter SIZE = 1,
parameter FASTRAM = 0,
parameter ADDRW = `LOG2UP(SIZE)
) (
input wire clk,
input wire reset,
@@ -63,8 +64,8 @@ module VX_cam_buffer #(
VX_dp_ram #(
.DATAW(DATAW),
.SIZE(SIZE),
.BUFFERED(0),
.RWCHECK(0)
.RWCHECK(1),
.FASTRAM(FASTRAM)
) data_table (
.clk(clk),
.waddr(write_addr),

View File

@@ -26,9 +26,7 @@ module VX_dp_ram #(
localparam DATA32W = DATAW / 32;
localparam BYTEEN32W = BYTEENW / 4;
//`ifndef QUARTUS
if (FASTRAM) begin
if (FASTRAM) begin
if (BUFFERED) begin
reg [DATAW-1:0] dout_r;
@@ -57,72 +55,36 @@ module VX_dp_ram #(
dout_r <= mem[raddr];
end
end
assign dout = dout_r;
end else begin
`UNUSED_VAR (rden)
if (RWCHECK) begin
if (BYTEENW > 1) begin
`USE_FAST_BRAM reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0];
if (BYTEENW > 1) begin
`USE_FAST_BRAM reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0];
always @(posedge clk) begin
if (wren) begin
for (integer j = 0; j < BYTEEN32W; j++) begin
for (integer i = 0; i < 4; i++) begin
if (byteen[j * 4 + i])
mem[waddr][j][i] <= din[j * 32 + i * 8 +: 8];
end
always @(posedge clk) begin
if (wren) begin
for (integer j = 0; j < BYTEEN32W; j++) begin
for (integer i = 0; i < 4; i++) begin
if (byteen[j * 4 + i])
mem[waddr][j][i] <= din[j * 32 + i * 8 +: 8];
end
end
end
assign dout = mem[raddr];
end else begin
`USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0];
always @(posedge clk) begin
if (wren && byteen)
mem[waddr] <= din;
end
assign dout = mem[raddr];
end
assign dout = mem[raddr];
end else begin
`USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0];
if (BYTEENW > 1) begin
`USE_FAST_BRAM `NO_RW_RAM_CHECK reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0];
always @(posedge clk) begin
if (wren) begin
for (integer j = 0; j < BYTEEN32W; j++) begin
for (integer i = 0; i < 4; i++) begin
if (byteen[j * 4 + i])
mem[waddr][j][i] <= din[j * 32 + i * 8 +: 8];
end
end
end
end
assign dout = mem[raddr];
end else begin
`USE_FAST_BRAM `NO_RW_RAM_CHECK reg [DATAW-1:0] mem [SIZE-1:0];
always @(posedge clk) begin
if (wren && byteen)
mem[waddr] <= din;
end
assign dout = mem[raddr];
end
end
always @(posedge clk) begin
if (wren && byteen)
mem[waddr] <= din;
end
assign dout = mem[raddr];
end
end
end else begin
if (BUFFERED) begin
reg [DATAW-1:0] dout_r;
if (BYTEENW > 1) begin
@@ -150,14 +112,11 @@ module VX_dp_ram #(
dout_r <= mem[raddr];
end
end
assign dout = dout_r;
end else begin
`UNUSED_VAR (rden)
if (RWCHECK) begin
if (BYTEENW > 1) begin
reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0];
@@ -208,96 +167,6 @@ module VX_dp_ram #(
end
end
end
/*`else
localparam OUTDATA_REG_B = BUFFERED ? "CLOCK0" : "UNREGISTERED";
localparam RAM_BLOCK_TYPE = FASTRAM ? "MLAB" : "AUTO";
if (RWCHECK) begin
altsyncram #(
.init_file (),
.operation_mode ("DUAL_PORT"),
.numwords_a (SIZE),
.numwords_b (SIZE),
.widthad_a (ADDRW),
.widthad_b (ADDRW),
.width_a (DATAW),
.width_b (DATAW),
.width_byteena_a(BYTEENW),
.address_reg_b ("CLOCK0"),
.outdata_reg_b (OUTDATA_REG_B),
.ram_block_type (RAM_BLOCK_TYPE)
) mem (
.clocken0 (1'b1),
.clocken1 (),
.clocken2 (),
.clocken3 (),
.clock0 (clk),
.clock1 (),
.address_a (waddr),
.address_b (raddr),
.byteena_a (byteen),
.byteena_b (1'b1),
.wren_a (wren),
.wren_b (1'b0),
.data_a (din),
.data_b (),
.rden_a (),
.rden_b (1'b1),
.q_a (),
.q_b (dout),
.addressstall_a (1'b0),
.addressstall_b (1'b0),
.aclr0 (1'b0),
.aclr1 (1'b0),
.eccstatus ()
);
end else begin
`NO_RW_RAM_CHECK altsyncram #(
.init_file (),
.operation_mode ("DUAL_PORT"),
.numwords_a (SIZE),
.numwords_b (SIZE),
.widthad_a (ADDRW),
.widthad_b (ADDRW),
.width_a (DATAW),
.width_b (DATAW),
.width_byteena_a(BYTEENW),
.outdata_reg_b (OUTDATA_REG_B),
.ram_block_type (RAM_BLOCK_TYPE)
) mem (
.clocken0 (1'b1),
.clocken1 (1'b1),
.clocken2 (1'b1),
.clocken3 (1'b1),
.clock0 (clk),
.clock1 (clk),
.address_a (waddr),
.address_b (raddr),
.byteena_a (byteen),
.byteena_b (1'b1),
.wren_a (wren),
.wren_b (1'b0),
.data_a (din),
.data_b (),
.rden_a (),
.rden_b (1'b1),
.q_a (),
.q_b (dout),
.addressstall_a (1'b0),
.addressstall_b (1'b0),
.aclr0 (1'b0),
.aclr1 (1'b0),
.eccstatus ()
);
end
`endif*/
endmodule
`TRACING_ON

View File

@@ -3,10 +3,10 @@
module VX_generic_queue #(
parameter DATAW = 1,
parameter SIZE = 2,
parameter BUFFERED = 0,
parameter ADDRW = $clog2(SIZE),
parameter SIZEW = $clog2(SIZE+1),
parameter FASTRAM = 0
parameter BUFFERED = 0,
parameter FASTRAM = 1
) (
input wire clk,
input wire reset,
@@ -78,25 +78,22 @@ module VX_generic_queue #(
end;
end
end
used_r <= used_r + ADDRW'(push) - ADDRW'(pop);
used_r <= used_r + (ADDRW'(push) - ADDRW'(pop));
end
end
if (0 == BUFFERED) begin
reg [ADDRW:0] rd_ptr_r;
reg [ADDRW:0] wr_ptr_r;
wire [ADDRW-1:0] rd_ptr_a = rd_ptr_r[ADDRW-1:0];
wire [ADDRW-1:0] wr_ptr_a = wr_ptr_r[ADDRW-1:0];
reg [ADDRW-1:0] rd_ptr_r;
reg [ADDRW-1:0] wr_ptr_r;
always @(posedge clk) begin
if (reset) begin
rd_ptr_r <= 0;
wr_ptr_r <= 0;
end else begin
wr_ptr_r <= wr_ptr_r + (ADDRW+1)'(push);
rd_ptr_r <= rd_ptr_r + (ADDRW+1)'(pop);
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
rd_ptr_r <= rd_ptr_r + ADDRW'(pop);
end
end
@@ -108,8 +105,8 @@ module VX_generic_queue #(
.FASTRAM(FASTRAM)
) dp_ram (
.clk(clk),
.waddr(wr_ptr_a),
.raddr(rd_ptr_a),
.waddr(wr_ptr_r),
.raddr(rd_ptr_r),
.wren(push),
.byteen(1'b1),
.rden(1'b1),
@@ -149,7 +146,7 @@ module VX_generic_queue #(
.DATAW(DATAW),
.SIZE(SIZE),
.BUFFERED(0),
.RWCHECK(0),
.RWCHECK(1),
.FASTRAM(FASTRAM)
) dp_ram (
.clk(clk),
@@ -166,7 +163,7 @@ module VX_generic_queue #(
if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin
dout_r <= data_in;
end else if (pop) begin
dout_r <= dout;
dout_r <= dout; // BRAM R/W collision
end
end
@@ -178,4 +175,4 @@ module VX_generic_queue #(
assign size = {full_r, used_r};
end
endmodule
endmodule

View File

@@ -143,7 +143,7 @@ module VX_scope #(
end
if (stop
|| (waddr >= waddr_end)) begin
|| (waddr == waddr_end)) begin
waddr <= waddr; // keep last address
recording <= 0;
data_valid <= 1;

View File

@@ -1,11 +1,10 @@
`include "VX_platform.vh"
module VX_stream_arbiter #(
parameter NUM_REQS = 1,
parameter DATAW = 1,
parameter TYPE = "R",
parameter IN_BUFFER = 0,
parameter OUT_BUFFER = 0
parameter NUM_REQS = 1,
parameter DATAW = 1,
parameter TYPE = "R",
parameter BUFFERED = 0
) (
input wire clk,
input wire reset,
@@ -22,27 +21,6 @@ module VX_stream_arbiter #(
localparam LOG_NUM_REQS = $clog2(NUM_REQS);
if (NUM_REQS > 1) begin
wire [NUM_REQS-1:0] valid_in_qual;
wire [NUM_REQS-1:0][DATAW-1:0] data_in_qual;
wire [NUM_REQS-1:0] ready_in_qual;
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_skid_buffer #(
.DATAW (DATAW),
.PASSTHRU (!IN_BUFFER)
) req_buffer (
.clk (clk),
.reset (reset),
.valid_in (valid_in[i]),
.data_in (data_in[i]),
.ready_in (ready_in[i]),
.valid_out (valid_in_qual[i]),
.data_out (data_in_qual[i]),
.ready_out (ready_in_qual[i])
);
end
wire sel_enable;
wire sel_valid;
wire [LOG_NUM_REQS-1:0] sel_idx;
@@ -56,7 +34,7 @@ module VX_stream_arbiter #(
) sel_arb (
.clk (clk),
.reset (reset),
.requests (valid_in_qual),
.requests (valid_in),
.enable (sel_enable),
.grant_valid (sel_valid),
.grant_index (sel_idx),
@@ -71,7 +49,7 @@ module VX_stream_arbiter #(
) sel_arb (
.clk (clk),
.reset (reset),
.requests (valid_in_qual),
.requests (valid_in),
.enable (sel_enable),
.grant_valid (sel_valid),
.grant_index (sel_idx),
@@ -86,7 +64,7 @@ module VX_stream_arbiter #(
) sel_arb (
.clk (clk),
.reset (reset),
.requests (valid_in_qual),
.requests (valid_in),
.enable (sel_enable),
.grant_valid (sel_valid),
.grant_index (sel_idx),
@@ -101,47 +79,36 @@ module VX_stream_arbiter #(
) sel_arb (
.clk (clk),
.reset (reset),
.requests (valid_in_qual),
.requests (valid_in),
.enable (sel_enable),
.grant_valid (sel_valid),
.grant_index (sel_idx),
.grant_onehot (sel_1hot)
);
end
end
if (OUT_BUFFER) begin
wire ready_out_unqual;
wire stall = ~ready_out && valid_out;
assign sel_enable = ~stall;
VX_skid_buffer #(
.DATAW (DATAW),
.PASSTHRU (!BUFFERED)
) out_buffer (
.clk (clk),
.reset (reset),
.valid_in (sel_valid),
.data_in (data_in[sel_idx]),
.ready_in (ready_out_unqual),
.valid_out (valid_out),
.data_out (data_out),
.ready_out (ready_out)
);
VX_generic_register #(
.N(1 + DATAW),
.R(1)
) pipe_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (1'b0),
.data_in ({sel_valid, data_in_qual[sel_idx]}),
.data_out ({valid_out, data_out})
);
assign sel_enable = ready_out_unqual;
for (genvar i = 0; i < NUM_REQS; i++) begin
assign ready_in_qual[i] = sel_1hot[i] && ~stall;
end
end else begin
assign sel_enable = ready_out;
assign valid_out = sel_valid;
assign data_out = data_in_qual[sel_idx];
for (genvar i = 0; i < NUM_REQS; i++) begin
assign ready_in_qual[i] = sel_1hot[i] && ready_out;
end
end
for (genvar i = 0; i < NUM_REQS; i++) begin
assign ready_in[i] = sel_1hot[i] && ready_out_unqual;
end
end else begin

View File

@@ -0,0 +1,68 @@
`include "VX_platform.vh"
module VX_stream_demux #(
parameter NUM_REQS = 1,
parameter DATAW = 1,
parameter BUFFERED = 0,
localparam LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
input wire clk,
input wire reset,
input wire [LOG_NUM_REQS-1:0] sel,
input wire valid_in,
input wire [DATAW-1:0] data_in,
output wire ready_in,
output wire [NUM_REQS-1:0] valid_out,
output wire [NUM_REQS-1:0][DATAW-1:0] data_out,
input wire [NUM_REQS-1:0] ready_out
);
if (NUM_REQS > 1) begin
reg [NUM_REQS-1:0] valid_out_unqual;
wire [NUM_REQS-1:0][DATAW-1:0] data_out_unqual;
wire [NUM_REQS-1:0] ready_out_unqual;
always @(*) begin
valid_out_unqual = '0;
valid_out_unqual[sel] = valid_in;
end
for (genvar i = 0; i < NUM_REQS; i++) begin
assign data_out_unqual[i] = data_in;
end
assign ready_in = ready_out_unqual[sel];
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (DATAW),
.PASSTHRU (!BUFFERED)
) out_buffer (
.clk (clk),
.reset (reset),
.valid_in (valid_out_unqual[i]),
.data_in (data_out_unqual[i]),
.ready_in (ready_out_unqual[i]),
.valid_out (valid_out[i]),
.data_out (data_out[i]),
.ready_out (ready_out[i])
);
end
end else begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (sel)
assign valid_out = valid_in;
assign data_out = data_in;
assign ready_in = ready_out;
end
endmodule