From a3a7239b4d875d0d774f9108ae24dffe2d8a90e1 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 23 Jun 2021 01:51:23 -0700 Subject: [PATCH] critical path optimizations --- hw/rtl/VX_ibuffer.v | 37 +++----- hw/rtl/VX_scoreboard.v | 19 +--- hw/rtl/libs/VX_onehot_encoder.v | 144 +++++++++++++++++++++++++++-- hw/rtl/libs/VX_priority_encoder.v | 147 +++++++++++++++++++++++++++--- 4 files changed, 287 insertions(+), 60 deletions(-) diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index b94eead8..ca7524bb 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -97,6 +97,7 @@ module VX_ibuffer #( reg [DATAW-1:0] deq_instr, deq_instr_n; reg [NWARPSW-1:0] num_warps; + // calculate valid table always @(*) begin valid_table_n = valid_table; if (deq_fire) begin @@ -113,11 +114,10 @@ module VX_ibuffer #( deq_valid_n = 1; deq_wid_n = 'x; deq_instr_n = 'x; - for (integer i = 0; i < `NUM_WARPS; i++) begin + for (integer i = `NUM_WARPS-1; i >= 0; --i) begin if (schedule_table[i]) begin - deq_wid_n = `NW_BITS'(i); + deq_wid_n = `NW_BITS'(i); deq_instr_n = q_data_out[i]; - break; end end end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin @@ -130,16 +130,16 @@ module VX_ibuffer #( deq_instr_n = q_data_in; end end - - // do round-robin with multiple active warps + + // do round-robin scheduling with multiple active warps always @(*) begin - schedule_table_n = schedule_table; - for (integer i = 0; i < `NUM_WARPS; i++) begin - if (schedule_table[i]) begin - schedule_table_n[i] = 0; - break; - end + if (1 == $countones(schedule_table) + || (num_warps < 2)) begin + schedule_table_n = valid_table_n; + end else begin + schedule_table_n = schedule_table; end + schedule_table_n[deq_wid_n] = 0; end wire warp_added = enq_fire && q_empty[ibuf_enq_if.wid]; @@ -148,21 +148,12 @@ module VX_ibuffer #( always @(posedge clk) begin if (reset) begin valid_table <= 0; - schedule_table <= 0; deq_valid <= 0; num_warps <= 0; end else begin - valid_table <= valid_table_n; 
- - if (0 == (| schedule_table_n) - || (num_warps < 2)) begin - schedule_table <= valid_table_n; - schedule_table[deq_wid_n] <= 0; - end else begin - schedule_table <= schedule_table_n; - end - - deq_valid <= deq_valid_n; + valid_table <= valid_table_n; + deq_valid <= deq_valid_n; + schedule_table <= schedule_table_n; if (warp_added && !warp_removed) begin num_warps <= num_warps + NWARPSW'(1); diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index cca7c007..11055825 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -12,18 +12,11 @@ module VX_scoreboard #( ); reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs; - reg is_reg_busy; - always @(*) begin - is_reg_busy = 'x; - for (integer i = 0; i < `NUM_WARPS; ++i) begin - if (ibuf_deq_if.wid == `NW_BITS'(i)) begin - is_reg_busy = | (inuse_regs[i] & ibuf_deq_if.used_regs); - end - end - end - assign delay = is_reg_busy; - - wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0); + wire [`NUM_REGS-1:0] deq_inuse_regs = inuse_regs[ibuf_deq_if.wid]; + + assign delay = | (deq_inuse_regs & ibuf_deq_if.used_regs); + + wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && ibuf_deq_if.wb; wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop; @@ -43,8 +36,6 @@ module VX_scoreboard #( end end - wire [`NUM_REGS-1:0] deq_inuse_regs = inuse_regs[ibuf_deq_if.wid]; - `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin diff --git a/hw/rtl/libs/VX_onehot_encoder.v b/hw/rtl/libs/VX_onehot_encoder.v index b60b73e5..e9d71bef 100644 --- a/hw/rtl/libs/VX_onehot_encoder.v +++ b/hw/rtl/libs/VX_onehot_encoder.v @@ -4,8 +4,10 @@ // Adapter from BaseJump STL: http://bjump.org/data_out.html module VX_onehot_encoder #( - parameter N = 1, - parameter LN = `LOG2UP(N) + parameter N = 1, + parameter REVERSE = 0, + parameter FAST = 1, + parameter LN = `LOG2UP(N) ) ( input wire [N-1:0] data_in, output wire [LN-1:0] 
data_out, @@ -18,14 +20,24 @@ module VX_onehot_encoder #( end else if (N == 2) begin - assign data_out = data_in[1]; + assign data_out = data_in[!REVERSE]; assign valid = (| data_in); - end else begin - + end else if (N == 4) begin + reg [LN-1:0] index_r; - if (N == 4) begin + if (REVERSE) begin + always @(*) begin + casez (data_in) + 4'b1000: index_r = LN'(0); + 4'b?100: index_r = LN'(1); + 4'b??10: index_r = LN'(2); + 4'b???1: index_r = LN'(3); + default: index_r = 'x; + endcase + end + end else begin always @(*) begin casez (data_in) 4'b0001: index_r = LN'(0); @@ -35,7 +47,30 @@ module VX_onehot_encoder #( default: index_r = 'x; endcase end - end else if (N == 8) begin + end + + assign data_out = index_r; + assign valid = (| data_in); + + end else if (N == 8) begin + + reg [LN-1:0] index_r; + + if (REVERSE) begin + always @(*) begin + casez (data_in) + 8'b10000000: index_r = LN'(0); + 8'b?1000000: index_r = LN'(1); + 8'b??100000: index_r = LN'(2); + 8'b???10000: index_r = LN'(3); + 8'b????1000: index_r = LN'(4); + 8'b?????100: index_r = LN'(5); + 8'b??????10: index_r = LN'(6); + 8'b???????1: index_r = LN'(7); + default: index_r = 'x; + endcase + end + end else begin always @(*) begin casez (data_in) 8'b00000001: index_r = LN'(0); @@ -49,7 +84,38 @@ module VX_onehot_encoder #( default: index_r = 'x; endcase end - end else if (N == 16) begin + end + + assign data_out = index_r; + assign valid = (| data_in); + + end else if (N == 16) begin + + reg [LN-1:0] index_r; + + if (REVERSE) begin + always @(*) begin + casez (data_in) + 16'b1000000000000000: index_r = LN'(0); + 16'b?100000000000000: index_r = LN'(1); + 16'b??10000000000000: index_r = LN'(2); + 16'b???1000000000000: index_r = LN'(3); + 16'b????100000000000: index_r = LN'(4); + 16'b?????10000000000: index_r = LN'(5); + 16'b??????1000000000: index_r = LN'(6); + 16'b???????100000000: index_r = LN'(7); + 16'b????????10000000: index_r = LN'(8); + 16'b?????????1000000: index_r = LN'(9); + 16'b??????????100000: 
index_r = LN'(10); + 16'b???????????10000: index_r = LN'(11); + 16'b????????????1000: index_r = LN'(12); + 16'b?????????????100: index_r = LN'(13); + 16'b??????????????10: index_r = LN'(14); + 16'b???????????????1: index_r = LN'(15); + default: index_r = 'x; + endcase + end + end else begin always @(*) begin casez (data_in) 16'b0000000000000001: index_r = LN'(0); @@ -71,7 +137,66 @@ module VX_onehot_encoder #( default: index_r = 'x; endcase end - end else begin + end + + assign data_out = index_r; + assign valid = (| data_in); + + end else if (FAST) begin /* log-depth tree encoder; chained with else so it is generated only when no N==1/2/4/8/16 specialization above matched (otherwise data_out/valid would be driven twice) */ + `IGNORE_WARNINGS_BEGIN + localparam levels_lp = $clog2(N); + localparam aligned_width_lp = 1 << $clog2(N); + + wire [levels_lp:0][aligned_width_lp-1:0] addr; + wire [levels_lp:0][aligned_width_lp-1:0] v; + + // base case, also handle padding for non-power of two inputs + assign v[0] = REVERSE ? (data_in << (aligned_width_lp - N)) : ((aligned_width_lp)'(data_in)); + assign addr[0] = 'x; + + for (genvar level = 1; level < levels_lp+1; level=level+1) begin + localparam segments_lp = 2**(levels_lp-level); + localparam segment_slot_lp = aligned_width_lp/segments_lp; + localparam segment_width_lp = level; // how many bits are needed at each level + + for (genvar segment = 0; segment < segments_lp; segment=segment+1) begin + wire [1:0] vs = { + v[level-1][segment*segment_slot_lp+(segment_slot_lp >> 1)], + v[level-1][segment*segment_slot_lp] + }; + + assign v[level][segment*segment_slot_lp] = (| vs); + + if (level == 1) begin + assign addr[level][(segment*segment_slot_lp)+:segment_width_lp] = vs[!REVERSE]; + end else begin + assign addr[level][(segment*segment_slot_lp)+:segment_width_lp] = { + vs[!REVERSE], + addr[level-1][segment*segment_slot_lp+:segment_width_lp-1] | addr[level-1][segment*segment_slot_lp+(segment_slot_lp >> 1)+:segment_width_lp-1] + }; + end + end + end + + assign data_out = addr[levels_lp][`LOG2UP(N)-1:0]; + assign valid = v[levels_lp][0]; + `IGNORE_WARNINGS_END + end else begin + + reg [LN-1:0] index_r; 
+ + if (REVERSE) begin + + always @(*) begin + index_r = 'x; + for (integer i = N-1; i >= 0; --i) begin + if (data_in[i]) begin + index_r = `LOG2UP(N)'(i); + end + end + end + + end else begin always @(*) begin index_r = 'x; for (integer i = 0; i < N; i++) begin @@ -84,7 +209,6 @@ module VX_onehot_encoder #( assign data_out = index_r; assign valid = (| data_in); - end endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_priority_encoder.v b/hw/rtl/libs/VX_priority_encoder.v index a14c180f..32eae452 100644 --- a/hw/rtl/libs/VX_priority_encoder.v +++ b/hw/rtl/libs/VX_priority_encoder.v @@ -1,9 +1,10 @@ `include "VX_platform.vh" module VX_priority_encoder #( - parameter N = 1, - parameter FAST = 1, - parameter LN = `LOG2UP(N) + parameter N = 1, + parameter REVERSE = 0, + parameter FAST = 1, + parameter LN = `LOG2UP(N) ) ( input wire [N-1:0] data_in, output wire [N-1:0] onehot, @@ -19,16 +20,26 @@ module VX_priority_encoder #( end else if (N == 2) begin - assign onehot = {~data_in[0], data_in[0]}; - assign index = ~data_in[0]; + assign onehot = {~data_in[REVERSE], data_in[REVERSE]}; + assign index = ~data_in[REVERSE]; assign valid_out = (| data_in); - end else begin - + end else if (N == 4) begin + reg [LN-1:0] index_r; reg [N-1:0] onehot_r; - if (N == 4) begin + if (REVERSE) begin + always @(*) begin + casez (data_in) + 4'b1???: begin onehot_r = 4'b0001; index_r = LN'(0); end + 4'b01??: begin onehot_r = 4'b0010; index_r = LN'(1); end + 4'b001?: begin onehot_r = 4'b0100; index_r = LN'(2); end + 4'b0001: begin onehot_r = 4'b1000; index_r = LN'(3); end + default: begin onehot_r = 'x; index_r = 'x; end + endcase + end + end else begin always @(*) begin casez (data_in) 4'b???1: begin onehot_r = 4'b0001; index_r = LN'(0); end @@ -38,7 +49,31 @@ module VX_priority_encoder #( default: begin onehot_r = 'x; index_r = 'x; end endcase end - end else if (N == 8) begin + end + + assign index = index_r; + assign onehot = onehot_r; + + end else if (N == 8) begin + + reg 
[LN-1:0] index_r; + reg [N-1:0] onehot_r; + + if (REVERSE) begin + always @(*) begin + casez (data_in) + 8'b1???????: begin onehot_r = 8'b00000001; index_r = LN'(0); end + 8'b01??????: begin onehot_r = 8'b00000010; index_r = LN'(1); end + 8'b001?????: begin onehot_r = 8'b00000100; index_r = LN'(2); end + 8'b0001????: begin onehot_r = 8'b00001000; index_r = LN'(3); end + 8'b00001???: begin onehot_r = 8'b00010000; index_r = LN'(4); end + 8'b000001??: begin onehot_r = 8'b00100000; index_r = LN'(5); end + 8'b0000001?: begin onehot_r = 8'b01000000; index_r = LN'(6); end + 8'b00000001: begin onehot_r = 8'b10000000; index_r = LN'(7); end + default: begin onehot_r = 'x; index_r = 'x; end + endcase + end + end else begin always @(*) begin casez (data_in) 8'b???????1: begin onehot_r = 8'b00000001; index_r = LN'(0); end @@ -52,7 +87,39 @@ module VX_priority_encoder #( default: begin onehot_r = 'x; index_r = 'x; end endcase end - end else if (N == 16) begin + end + + assign index = index_r; + assign onehot = onehot_r; + + end else if (N == 16) begin + + reg [LN-1:0] index_r; + reg [N-1:0] onehot_r; + + if (REVERSE) begin + always @(*) begin + casez (data_in) + 16'b1???????????????: begin onehot_r = 16'b0000000000000001; index_r = LN'(0); end + 16'b01??????????????: begin onehot_r = 16'b0000000000000010; index_r = LN'(1); end + 16'b001?????????????: begin onehot_r = 16'b0000000000000100; index_r = LN'(2); end + 16'b0001????????????: begin onehot_r = 16'b0000000000001000; index_r = LN'(3); end + 16'b00001???????????: begin onehot_r = 16'b0000000000010000; index_r = LN'(4); end + 16'b000001??????????: begin onehot_r = 16'b0000000000100000; index_r = LN'(5); end + 16'b0000001?????????: begin onehot_r = 16'b0000000001000000; index_r = LN'(6); end + 16'b00000001????????: begin onehot_r = 16'b0000000010000000; index_r = LN'(7); end + 16'b000000001???????: begin onehot_r = 16'b0000000100000000; index_r = LN'(8); end + 16'b0000000001??????: begin onehot_r = 16'b0000001000000000; 
index_r = LN'(9); end + 16'b00000000001?????: begin onehot_r = 16'b0000010000000000; index_r = LN'(10); end + 16'b000000000001????: begin onehot_r = 16'b0000100000000000; index_r = LN'(11); end + 16'b0000000000001???: begin onehot_r = 16'b0001000000000000; index_r = LN'(12); end + 16'b00000000000001??: begin onehot_r = 16'b0010000000000000; index_r = LN'(13); end + 16'b000000000000001?: begin onehot_r = 16'b0100000000000000; index_r = LN'(14); end + 16'b0000000000000001: begin onehot_r = 16'b1000000000000000; index_r = LN'(15); end + default: begin onehot_r = 'x; index_r = 'x; end + endcase + end + end else begin always @(*) begin casez (data_in) 16'b???????????????1: begin onehot_r = 16'b0000000000000001; index_r = LN'(0); end @@ -74,6 +141,58 @@ module VX_priority_encoder #( default: begin onehot_r = 'x; index_r = 'x; end endcase end + end + + assign index = index_r; + assign onehot = onehot_r; + + end else if (FAST) begin + + wire [N-1:0] scan_lo; + + VX_scan #( + .N (N), + .OP (2), + .REVERSE (REVERSE) + ) scan ( + .data_in (data_in), + .data_out (scan_lo) + ); + + if (REVERSE) begin + assign onehot = scan_lo & {1'b1, (~scan_lo[N-1:1])}; + assign valid_out = scan_lo[0]; + end else begin + assign onehot = scan_lo & {(~scan_lo[N-2:0]), 1'b1}; + assign valid_out = scan_lo[N-1]; + end + + VX_onehot_encoder #( + .N (N), + .REVERSE (REVERSE) + ) onehot_encoder ( + .data_in (onehot), + .data_out (index), + `UNUSED_PIN (valid) + ); + + end else begin + + reg [LN-1:0] index_r; + reg [N-1:0] onehot_r; + + if (REVERSE) begin + always @(*) begin + index_r = 'x; + onehot_r = 'x; + for (integer i = 0; i < N; ++i) begin + if (data_in[i]) begin + index_r = LN'(i); + onehot_r = 0; + onehot_r[i] = 1'b1; + end + end + end end else begin always @(*) begin index_r = 'x; @@ -86,11 +205,13 @@ module VX_priority_encoder #( end end end - end + end assign index = index_r; - assign onehot = onehot_r; - assign valid_out = (| data_in); + assign onehot = onehot_r; + end + + assign valid_out 
= (| data_in); endmodule \ No newline at end of file