From 3d052e9428372e2ccd70edb24b80b80ffe42157c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 8 Sep 2021 02:26:39 -0700 Subject: [PATCH] fmax optimization bundle (250 MHz). --- hw/rtl/VX_decode.v | 26 ++++++++++----------- hw/rtl/VX_ibuffer.v | 39 +++++++++++++++++-------------- hw/rtl/VX_instr_demux.v | 15 ++++-------- hw/rtl/VX_mem_unit.v | 4 ++-- hw/rtl/VX_scoreboard.v | 26 ++++++++++++++------- hw/rtl/fp_cores/VX_fp_cvt.v | 5 ++-- hw/rtl/fp_cores/VX_fp_ncomp.v | 2 +- hw/rtl/interfaces/VX_decode_if.v | 7 +++--- hw/rtl/interfaces/VX_ibuffer_if.v | 12 ++++++---- 9 files changed, 71 insertions(+), 65 deletions(-) diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 3ba2d9c6..ee810959 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -2,15 +2,14 @@ `include "VX_print_instr.vh" `ifdef EXT_F_ENABLE - `define USED_IREG(r) \ - used_regs[{1'b0, r}] = 1 + `define USED_IREG(r) \ + r``_r = {1'b0, ``r} - `define USED_FREG(r) \ - r``_r[5] = 1; \ - used_regs[{1'b1, r}] = 1 + `define USED_FREG(r) \ + r``_r = {1'b1, ``r} `else `define USED_IREG(r) \ - used_regs[r] = 1 + r``_r = ``r `endif module VX_decode #( @@ -38,7 +37,6 @@ module VX_decode #( reg [31:0] imm; reg use_rd, use_PC, use_imm; reg is_join, is_wstall; - reg [`NUM_REGS-1:0] used_regs; wire [31:0] instr = ifetch_rsp_if.data; wire [6:0] opcode = instr[6:0]; @@ -57,23 +55,24 @@ module VX_decode #( wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0}; wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0}; wire [11:0] jalr_imm = {func7, rs2}; + + `UNUSED_VAR (rs3) always @(*) begin ex_type = 0; op_type = 'x; op_mod = 0; - rd_r = `NR_BITS'(rd); - rs1_r = `NR_BITS'(rs1); - rs2_r = `NR_BITS'(rs2); - rs3_r = `NR_BITS'(rs3); + rd_r = 0; + rs1_r = 0; + rs2_r = 0; + rs3_r = 0; imm = 'x; use_imm = 0; use_PC = 0; use_rd = 0; is_join = 0; is_wstall = 0; - used_regs = 0; case (opcode) `INST_I: begin @@ -399,7 +398,6 @@ module VX_decode #( assign decode_if.imm = imm; assign decode_if.use_PC = use_PC; assign decode_if.use_imm = use_imm; - assign decode_if.used_regs = used_regs; /////////////////////////////////////////////////////////////////////////// @@ -421,7 +419,7 @@ module VX_decode #( print_ex_type(decode_if.ex_type); dpi_trace(", op="); print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); - dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_regs=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.used_regs); + dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm); end end `endif diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 5deb5e72..358a6d2e 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -10,12 +10,12 @@ module VX_ibuffer #( VX_decode_if decode_if, // outputs - VX_ibuffer_if ibuffer_if + VX_ibuffer_if ibuffer_if ); `UNUSED_PARAM (CORE_ID) - localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS; + localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; localparam ADDRW = $clog2(`IBUF_SIZE+1); localparam NWARPSW = $clog2(`NUM_WARPS+1); @@ -35,16 +35,16 @@ module VX_ibuffer #( wire writing = enq_fire && (i == decode_if.wid); wire reading = deq_fire && (i == ibuffer_if.wid); - wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading); + wire going_empty = empty_r[i] || (alm_empty_r[i] && reading); VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`IBUF_SIZE), - .OUTPUT_REG (`IBUF_SIZE > 2) + .OUTPUT_REG (1) ) queue ( .clk (clk), .reset (reset), - .valid_in (writing && !is_head_ptr), + .valid_in (writing && !going_empty), .data_in (q_data_in), .ready_out(reading), .data_out (q_data_prev[i]), @@ -77,7 +77,7 @@ module VX_ibuffer #( used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading))); end - if (writing && is_head_ptr) begin + if (writing && going_empty) begin q_data_out[i] <= q_data_in; end else if (reading) begin q_data_out[i] <= q_data_prev[i]; @@ -173,15 +173,14 @@ module VX_ibuffer #( decode_if.ex_type, decode_if.op_type, decode_if.op_mod, - decode_if.wb, + decode_if.wb, + decode_if.use_PC, + decode_if.use_imm, + decode_if.imm, decode_if.rd, decode_if.rs1, decode_if.rs2, - decode_if.rs3, - decode_if.imm, - decode_if.use_PC, - decode_if.use_imm, - decode_if.used_regs}; + decode_if.rs3}; assign ibuffer_if.valid = deq_valid; assign ibuffer_if.wid = deq_wid; @@ -190,16 +189,20 @@ module VX_ibuffer #( ibuffer_if.ex_type, ibuffer_if.op_type, ibuffer_if.op_mod, - ibuffer_if.wb, + ibuffer_if.wb, + ibuffer_if.use_PC, + ibuffer_if.use_imm, + ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.rs1, ibuffer_if.rs2, - ibuffer_if.rs3, - ibuffer_if.imm, - ibuffer_if.use_PC, - ibuffer_if.use_imm} = deq_instr[DATAW-1:`NUM_REGS]; + ibuffer_if.rs3} = deq_instr; - assign ibuffer_if.used_regs_n = deq_instr_n[`NUM_REGS-1:0]; + // scoreboard forwarding assign ibuffer_if.wid_n = deq_wid_n; + assign ibuffer_if.rd_n = deq_instr_n[3*`NR_BITS +: `NR_BITS]; + assign ibuffer_if.rs1_n = deq_instr_n[2*`NR_BITS +: `NR_BITS]; + assign ibuffer_if.rs2_n = deq_instr_n[1*`NR_BITS +: `NR_BITS]; + assign ibuffer_if.rs3_n = deq_instr_n[0*`NR_BITS +: `NR_BITS]; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 91342ae4..19ff353b 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -42,8 +42,7 @@ module VX_instr_demux ( wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), - .OUTPUT_REG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)) ) alu_buffer ( .clk (clk), .reset (reset), @@ -62,8 +61,7 @@ module VX_instr_demux ( wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), - .OUTPUT_REG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)) ) lsu_buffer ( .clk (clk), .reset (reset), @@ -84,8 +82,7 @@ module VX_instr_demux ( wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), - .OUTPUT_REG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32) ) csr_buffer ( .clk (clk), .reset (reset), @@ -104,8 +101,7 @@ module VX_instr_demux ( wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), - .OUTPUT_REG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) ) fpu_buffer ( .clk (clk), .reset (reset), @@ -127,8 +123,7 @@ module VX_instr_demux ( wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), - .OUTPUT_REG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)) ) gpu_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index a889216e..06065d5d 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -207,7 +207,7 @@ module VX_mem_unit # ( .DATA_SIZE (4), .TAG_IN_WIDTH (`DCORE_TAG_WIDTH), .TYPE ("P"), - .BUFFERED_REQ (2), + .BUFFERED_REQ (1), .BUFFERED_RSP (1) ) smem_arb ( .clk (clk), @@ -319,7 +319,7 @@ module VX_mem_unit # ( .TYPE ("R"), .TAG_SEL_IDX (1), // Skip 0 for NC flag .BUFFERED_REQ (1), - .BUFFERED_RSP (2) + .BUFFERED_RSP (1) ) mem_arb ( .clk (clk), .reset (mem_arb_reset), diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 94304c49..d523a2b2 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -12,10 +12,6 @@ module VX_scoreboard #( ); reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; - reg [`NUM_REGS-1:0] deq_inuse_regs; - - assign delay = (| deq_inuse_regs); - wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb; wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop; @@ -27,7 +23,7 @@ module VX_scoreboard #( end if (release_reg) begin inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0; - end + end end always @(posedge clk) begin @@ -36,8 +32,20 @@ module VX_scoreboard #( end else begin inuse_regs <= inuse_regs_n; end - deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n] & ibuffer_if.used_regs_n; end + + reg deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3; + + always @(posedge clk) begin + deq_inuse_rd <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rd_n]; + deq_inuse_rs1 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs1_n]; + deq_inuse_rs2 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs2_n]; + deq_inuse_rs3 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs3_n]; + end + + assign delay = deq_inuse_rd | deq_inuse_rs1 | deq_inuse_rs2 | deq_inuse_rs3; + + `UNUSED_VAR (writeback_if.PC) reg [31:0] deadlock_ctr; wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); @@ -46,10 +54,10 @@ module VX_scoreboard #( deadlock_ctr <= 0; end else begin `ifdef DBG_PRINT_PIPELINE - if (ibuffer_if.valid && ~ibuffer_if.ready) begin + if (ibuffer_if.valid && ~ibuffer_if.ready) begin dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3); end `endif if (release_reg) begin @@ -61,7 +69,7 @@ module VX_scoreboard #( deadlock_ctr <= deadlock_ctr + 1; assert(deadlock_ctr < deadlock_timeout) else $error("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3); end else if (ibuffer_if.valid && ibuffer_if.ready) begin deadlock_ctr <= 0; end diff --git a/hw/rtl/fp_cores/VX_fp_cvt.v b/hw/rtl/fp_cores/VX_fp_cvt.v index 98cdded2..2362f65c 100644 --- a/hw/rtl/fp_cores/VX_fp_cvt.v +++ b/hw/rtl/fp_cores/VX_fp_cvt.v @@ -85,8 +85,7 @@ module VX_fp_cvt #( assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i]; assign fmt_mantissa = INT_MAN_WIDTH'({fp_clss[i].is_normal, dataa[i][MAN_BITS-1:0]}); assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + - {1'b0, fp_clss[i].is_subnormal} + - (FMT_SHIFT_COMPENSATION - EXP_BIAS); + {1'b0, fp_clss[i].is_subnormal}; assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa; assign input_sign[i] = is_itof ? int_sign : fmt_sign; `IGNORE_WARNINGS_END @@ -144,7 +143,7 @@ module VX_fp_cvt #( assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i]; // Unbias exponent and compensate for shift - wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] - {1'b0, renorm_shamt_s0[i]}; + wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] + (FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]}; wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]}; assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp; diff --git a/hw/rtl/fp_cores/VX_fp_ncomp.v b/hw/rtl/fp_cores/VX_fp_ncomp.v index df6c6b38..49f0da77 100644 --- a/hw/rtl/fp_cores/VX_fp_ncomp.v +++ b/hw/rtl/fp_cores/VX_fp_ncomp.v @@ -100,7 +100,7 @@ module VX_fp_ncomp #( VX_pipe_register #( .DATAW (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_class_t) + 1 + 1)), .RESETW (1), - .DEPTH (0) + .DEPTH (1) ) pipe_reg0 ( .clk (clk), .reset (reset), diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index e92f4592..70ec9fcd 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -13,14 +13,13 @@ interface VX_decode_if (); wire [`INST_OP_BITS-1:0] op_type; wire [`INST_MOD_BITS-1:0] op_mod; wire wb; + wire use_PC; + wire use_imm; + wire [31:0] imm; wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rs1; wire [`NR_BITS-1:0] rs2; wire [`NR_BITS-1:0] rs3; - wire [31:0] imm; - wire use_PC; - wire use_imm; - wire [`NUM_REGS-1:0] used_regs; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_ibuffer_if.v b/hw/rtl/interfaces/VX_ibuffer_if.v index 87b2c15b..c0d0f465 100644 --- a/hw/rtl/interfaces/VX_ibuffer_if.v +++ b/hw/rtl/interfaces/VX_ibuffer_if.v @@ -13,16 +13,20 @@ interface VX_ibuffer_if (); wire [`INST_OP_BITS-1:0] op_type; wire [`INST_MOD_BITS-1:0] op_mod; wire wb; + wire use_PC; + wire use_imm; + wire [31:0] imm; wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rs1; wire [`NR_BITS-1:0] rs2; wire [`NR_BITS-1:0] rs3; - wire [31:0] imm; - wire use_PC; - wire use_imm; wire ready; - wire [`NUM_REGS-1:0] used_regs_n; + // scoreboard forwarding + wire [`NR_BITS-1:0] rd_n; + wire [`NR_BITS-1:0] rs1_n; + wire [`NR_BITS-1:0] rs2_n; + wire [`NR_BITS-1:0] rs3_n; wire [`NW_BITS-1:0] wid_n; endinterface