From ee81e81818069511769b7f99b7a24c39bfa4f1e4 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 25 Aug 2020 02:29:27 -0700 Subject: [PATCH] adding using serial divider to save area cost --- hw/opae/README | 6 +- hw/rtl/VX_csr_data.v | 45 +++++++------ hw/rtl/VX_csr_unit.v | 9 +-- hw/rtl/VX_decode.v | 2 +- hw/rtl/VX_define.vh | 1 - hw/rtl/VX_ibuffer.v | 55 +++++++++------- hw/rtl/VX_mul_unit.v | 113 +++++++++++-------------------- hw/rtl/VX_warp_sched.v | 7 +- hw/rtl/fp_cores/VX_fp_fpga.v | 12 ++-- hw/rtl/libs/VX_serial_div.v | 124 +++++++++++++++++++++++++++++++++++ 10 files changed, 239 insertions(+), 135 deletions(-) create mode 100644 hw/rtl/libs/VX_serial_div.v diff --git a/hw/opae/README b/hw/opae/README index 86c1759a..3596f509 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -102,4 +102,8 @@ make -C top clean && make -C top > top/build.log 2>&1 & # How to calculate the maximum operating frequency? 200 Mhz -> period = 1/200x10^6 = 5ns -if slack = +1.664 -> minimal period = 5-1.664 = 3.336 -> fmax = 1/3.336 = 300 Mhz \ No newline at end of file +if slack = +1.664 -> minimal period = 5-1.664 = 3.336 -> fmax = 1/3.336 = 300 Mhz + + +# build rtlsim from driver tests +make -C ../../rtlsim clean && reset && make -C ../../rtlsim \ No newline at end of file diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index b7f87b47..5ee0831e 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -7,16 +7,16 @@ module VX_csr_data #( input wire reset, VX_cmt_to_csr_if cmt_to_csr_if, - VX_csr_to_issue_if csr_to_issue_if, - - input wire[`NW_BITS-1:0] wid, + VX_csr_to_issue_if csr_to_issue_if, input wire read_enable, input wire[`CSR_ADDR_BITS-1:0] read_addr, + input wire[`NW_BITS-1:0] read_wid, /* warp id that indexes the per-warp CSR arrays on the read path */ output wire[31:0] read_data, input wire write_enable, input wire[`CSR_ADDR_BITS-1:0] write_addr, + input wire[`NW_BITS-1:0] write_wid, /* warp id that indexes the per-warp CSR arrays on the write path */ input wire[`CSR_WIDTH-1:0] write_data ); reg [`CSR_WIDTH-1:0] csr_satp; @@ -33,7 +33,7 @@ module VX_csr_data #( reg [`FFG_BITS-1:0] 
csr_fflags [`NUM_WARPS-1:0]; reg [`FRM_BITS-1:0] csr_frm [`NUM_WARPS-1:0]; - reg [`FRM_BITS+`FFG_BITS-1:0] csr_fcsr [`NUM_WARPS-1:0]; // fflags + frm + reg [`FRM_BITS+`FFG_BITS-1:0] csr_fcsr [`NUM_WARPS-1:0]; // fflags + frm reg [31:0] read_data_r; @@ -46,29 +46,32 @@ module VX_csr_data #( if (write_enable) begin case (write_addr) `CSR_FFLAGS: begin - csr_fcsr[wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; - csr_fflags[wid] <= write_data[`FFG_BITS-1:0]; + csr_fcsr[write_wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; + csr_fflags[write_wid] <= write_data[`FFG_BITS-1:0]; end + `CSR_FRM: begin - csr_fcsr[wid][`FFG_BITS+`FRM_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; - csr_frm[wid] <= write_data[`FRM_BITS-1:0]; + csr_fcsr[write_wid][`FFG_BITS+`FRM_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; + csr_frm[write_wid] <= write_data[`FRM_BITS-1:0]; end + `CSR_FCSR: begin - csr_fcsr[wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; - csr_frm[wid] <= write_data[`FFG_BITS+`FRM_BITS-1:`FFG_BITS]; - csr_fflags[wid] <= write_data[`FFG_BITS-1:0]; + csr_fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; + csr_frm[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:`FFG_BITS]; + csr_fflags[write_wid] <= write_data[`FFG_BITS-1:0]; end - `CSR_SATP: csr_satp <= write_data; + + `CSR_SATP: csr_satp <= write_data; `CSR_MSTATUS: csr_mstatus <= write_data; `CSR_MEDELEG: csr_medeleg <= write_data; `CSR_MIDELEG: csr_mideleg <= write_data; - `CSR_MIE: csr_mie <= write_data; - `CSR_MTVEC: csr_mtvec <= write_data; + `CSR_MIE: csr_mie <= write_data; + `CSR_MTVEC: csr_mtvec <= write_data; - `CSR_MEPC: csr_mepc <= write_data; + `CSR_MEPC: csr_mepc <= write_data; - `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data; + `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data; `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data; default: begin @@ -93,15 +96,15 @@ module VX_csr_data #( always @(*) begin read_data_r = 'x; case (read_addr) - `CSR_FFLAGS : read_data_r = 32'(csr_fflags[wid]); - `CSR_FRM : read_data_r = 
32'(csr_frm[wid]); - `CSR_FCSR : read_data_r = 32'(csr_fcsr[wid]); + `CSR_FFLAGS : read_data_r = 32'(csr_fflags[read_wid]); + `CSR_FRM : read_data_r = 32'(csr_frm[read_wid]); + `CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]); - `CSR_LWID : read_data_r = 32'(wid); + `CSR_LWID : read_data_r = 32'(read_wid); `CSR_LTID , `CSR_GTID , `CSR_MHARTID , - `CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(wid); + `CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid); `CSR_GCID : read_data_r = CORE_ID; `CSR_NT : read_data_r = `NUM_THREADS; `CSR_NW : read_data_r = `NUM_WARPS; diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 5b1c39c0..7b4bfa2c 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -45,14 +45,15 @@ module VX_csr_unit #( .clk (clk), .reset (reset), .cmt_to_csr_if (cmt_to_csr_if), - .csr_to_issue_if (csr_to_issue_if), + .csr_to_issue_if(csr_to_issue_if), .read_enable (csr_pipe_req_if.valid), .read_addr (csr_pipe_req_if.csr_addr), + .read_wid (csr_pipe_req_if.wid), .read_data (csr_read_data), - .write_enable (csr_we_s1), - .write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]), + .write_enable (csr_we_s1), .write_addr (csr_addr_s1), - .wid (csr_pipe_req_if.wid) + .write_wid (csr_pipe_rsp_if.wid), + .write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]) ); wire csr_hazard = (csr_addr_s1 == csr_pipe_req_if.csr_addr) diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 5b53129d..07ac9b0b 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -393,7 +393,7 @@ module VX_decode #( print_ex_type(decode_if.ex_type); $write(", op="); print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); - $write("mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.thread_mask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm); + $write(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, 
rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.thread_mask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm); end end `endif diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index c7f42fb6..5bf9733e 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -35,7 +35,6 @@ /////////////////////////////////////////////////////////////////////////////// -`define LATENCY_IDIV 33 `define LATENCY_IMUL 3 `define LATENCY_FDIV 16 diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index cd3233c3..04eebd6e 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -70,7 +70,7 @@ module VX_ibuffer #( end end - assign q_data_prev[i] = (wr_ptr_r != rd_ptr_r) ? entries[i][rd_ptr_a] : q_data_in; + assign q_data_prev[i] = entries[i][rd_ptr_a]; assign q_full[i] = (size_r[i] == SIZE); assign q_size[i] = size_r[i]; end @@ -83,31 +83,38 @@ module VX_ibuffer #( reg [`NW_BITS-1:0] deq_wid, deq_wid_n; reg deq_valid, deq_valid_n; reg [DATAW-1:0] deq_instr, deq_instr_n; + reg deq_is_size1, deq_is_size1_n; /* registered with the dequeue candidate; when set, a dequeue clears that warp's valid_table and schedule_table bits */ always @(*) begin valid_table_n = valid_table; - if (deq_fire) begin - valid_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1); + if (deq_fire && deq_is_size1) begin + valid_table_n[ibuf_deq_if.wid] = 0; end if (enq_fire) begin valid_table_n[ibuf_enq_if.wid] = 1; end end - always @(*) begin - deq_wid_n = 0; - deq_valid_n = 0; - deq_instr_n = 'x; - schedule_table_n = schedule_table; - if (deq_fire) begin - schedule_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1); + always @(*) begin + deq_valid_n = 0; + deq_wid_n = 'x; + deq_instr_n = 'x; + deq_is_size1_n = 'x; + + schedule_table_n = schedule_table; + if (deq_fire && deq_is_size1) begin + schedule_table_n[ibuf_deq_if.wid] = 0; end + for (integer i = 0; i < `NUM_WARPS; i++) begin - if 
(schedule_table_n[i]) begin - deq_wid_n = `NW_BITS'(i); - deq_valid_n = 1; - deq_instr_n = (deq_fire && (ibuf_deq_if.wid 
== `NW_BITS'(i))) ? q_data_prev[i] : q_data_out[i]; - schedule_table_n[i] = 0; + if (schedule_table_n[i]) begin + deq_valid_n = 1; + deq_wid_n = `NW_BITS'(i); + deq_instr_n = (deq_fire && (ibuf_deq_if.wid == `NW_BITS'(i))) ? q_data_prev[i] : q_data_out[i]; + deq_is_size1_n = (~(enq_fire && ibuf_enq_if.wid == `NW_BITS'(i)) + && (((deq_fire && ibuf_deq_if.wid == `NW_BITS'(i)) && (SIZEW'(2) == q_size[i])) + || (SIZEW'(1) == q_size[i]))); + schedule_table_n[i] = 0; break; end end @@ -123,17 +130,19 @@ module VX_ibuffer #( deq_valid <= 0; num_warps <= 0; end else begin - valid_table <= valid_table_n; - schedule_table <= (| schedule_table_n) ? schedule_table_n : valid_table_n; + valid_table <= valid_table_n; + schedule_table <= (| schedule_table_n) ? schedule_table_n : valid_table_n; if (enq_fire && (0 == num_warps)) begin - deq_valid <= 1; - deq_wid <= ibuf_enq_if.wid; - deq_instr <= q_data_in; + deq_valid <= 1; + deq_wid <= ibuf_enq_if.wid; + deq_instr <= q_data_in; + deq_is_size1 <= 1; end else if (!freeze) begin - deq_valid <= deq_valid_n; - deq_wid <= deq_wid_n; - deq_instr <= deq_instr_n; + deq_valid <= deq_valid_n; + deq_wid <= deq_wid_n; + deq_instr <= deq_instr_n; + deq_is_size1 <= deq_is_size1_n; end if (warp_added && !warp_removed) begin diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 4354e069..770d92e7 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -15,6 +15,7 @@ module VX_mul_unit #( localparam MULQ_BITS = `LOG2UP(`MULQ_SIZE); wire [`MUL_BITS-1:0] alu_op = mul_req_if.op_type; + wire is_div_op = `IS_DIV_OP(alu_op); wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data; wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data; @@ -81,7 +82,7 @@ module VX_mul_unit #( wire [MULQ_BITS-1:0] mul_tag; wire mul_valid_out; - wire mul_fire = mul_req_if.valid && mul_req_if.ready && ~`IS_DIV_OP(alu_op); + wire mul_fire = mul_req_if.valid && mul_req_if.ready && !is_div_op; VX_shift_register #( .DATAW(1 + MULQ_BITS + 1), @@ -96,88 
+97,50 @@ module VX_mul_unit #( /////////////////////////////////////////////////////////////////////////// - wire [`NUM_THREADS-1:0][31:0] div_result; - wire is_div = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU); - wire is_signed_div = (alu_op == `MUL_DIV || alu_op == `MUL_REM); - reg [`NUM_THREADS-1:0] is_div_qual; - wire is_div_out; - wire stall_div; + wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp; - for (genvar i = 0; i < `NUM_THREADS; i++) begin - - reg [31:0] div_in1_qual, div_in2_qual; - reg [32:0] div_in1, div_in2; - wire [31:0] div_result_tmp, rem_result_tmp; - - // handle divide by zero - always @(*) begin - is_div_qual[i] = is_div; - div_in1_qual = alu_in1[i]; - div_in2_qual = alu_in2[i]; - if (0 == alu_in2[i]) begin - div_in2_qual = 1; - if (is_div) begin - div_in1_qual = 32'hFFFFFFFF; // quotient = (0xFFFFFFFF / 1) - end else begin - is_div_qual[i] = 1; // remainder = (in1 / 1) - end - end - end - - // latch divider inputs - always @(posedge clk) begin - if (~stall_div) begin - div_in1 <= {is_signed_div & alu_in1[i][31], div_in1_qual}; - div_in2 <= {is_signed_div & alu_in2[i][31], div_in2_qual}; - end - end - - VX_divide #( - .WIDTHN(33), - .WIDTHD(33), - .WIDTHQ(32), - .WIDTHR(32), - .NSIGNED(1), - .DSIGNED(1), - .PIPELINE(`LATENCY_IDIV) - ) divide ( - .clk(clk), - .reset(reset), - .clk_en(~stall_div), - .numer(div_in1), - .denom(div_in2), - .quotient(div_result_tmp), - .remainder(rem_result_tmp) - ); - - assign div_result[i] = is_div_out ? 
div_result_tmp : rem_result_tmp; - end - - wire [MULQ_BITS-1:0] div_tag; - wire div_valid_out; - - wire div_fire = mul_req_if.valid && mul_req_if.ready && `IS_DIV_OP(alu_op); - - VX_shift_register #( - .DATAW(1 + MULQ_BITS + 1), - .DEPTH(`LATENCY_IDIV + 1) - ) div_shift_reg ( + wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU); + wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM); + wire div_valid_in = mul_req_if.valid && is_div_op; /* NOTE(review): unlike the removed div_fire, this is not qualified by mul_req_if.ready or mulq_full - confirm the divider cannot accept a request that stall_in is still holding upstream */ + wire div_ready_in; + wire div_ready_out; + wire div_valid_out; + wire is_div_out; + wire [MULQ_BITS-1:0] div_tag; + + VX_serial_div #( + .WIDTHN(32), + .WIDTHD(32), + .WIDTHQ(32), + .WIDTHR(32), + .LANES(`NUM_THREADS), + .TAGW(MULQ_BITS + 1) + ) divide ( .clk(clk), .reset(reset), - .enable(~stall_div), - .in({div_fire, tag_in, (| is_div_qual)}), - .out({div_valid_out, div_tag, is_div_out}) + .ready_in(div_ready_in), + .valid_in(div_valid_in), + .signed_mode(is_signed_div), + .tag_in({tag_in, is_div_only}), + .numer(alu_in1), + .denom(alu_in2), + .quotient(div_result_tmp), + .remainder(rem_result_tmp), + .ready_out(div_ready_out), + .valid_out(div_valid_out), + .tag_out({div_tag, is_div_out}) ); - + + wire [`NUM_THREADS-1:0][31:0] div_result = is_div_out ? div_result_tmp : rem_result_tmp; + /////////////////////////////////////////////////////////////////////////// wire arbiter_hazard = mul_valid_out && div_valid_out; assign stall_out = ~mul_commit_if.ready && mul_commit_if.valid; - assign stall_mul = stall_out || mulq_full; - assign stall_div = stall_out || mulq_full - || arbiter_hazard; // arbitration prioritizes MUL - wire stall_in = stall_mul || stall_div; + assign stall_mul = (stall_out && !is_div_op) || mulq_full; + assign div_ready_out = ~stall_out && ~arbiter_hazard; // arbitration prioritizes MUL + wire stall_in = stall_mul || ~div_ready_in; assign valid_out = mul_valid_out || div_valid_out; assign tag_out = mul_valid_out ? 
mul_tag : div_tag; @@ -186,7 +149,7 @@ module VX_mul_unit #( VX_generic_register #( .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) - ) alu_reg ( + ) mul_reg ( .clk (clk), .reset (reset), .stall (stall_out), diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 16d65aa3..93f0dd59 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -210,9 +210,10 @@ module VX_warp_sched #( wire [`NUM_WARPS-1:0] schedule_ready = schedule_table & ~(stalled_warps | total_barrier_stall | fetch_lock); always @(*) begin - schedule_valid = 0; - thread_mask = 'x; - warp_pc = 'x; + schedule_valid = 0; + thread_mask = 'x; + warp_pc = 'x; + warp_to_schedule = 'x; for (integer i = 0; i < `NUM_WARPS; ++i) begin if (schedule_ready[i]) begin schedule_valid = 1; diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index 84a8b49d..3660f897 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -74,7 +74,7 @@ module VX_fp_fpga #( .ready_in (per_core_ready_in[0]), .tag_in (tag_in), .op_type (op_type), - .frm (op_mod), + .frm (frm), .dataa (dataa), .datab (datab), .result (per_core_result[0]), @@ -278,14 +278,14 @@ module VX_fp_fpga #( always @(*) begin per_core_ready_out = 0; - valid_out_r = 0; - has_fflags_r = 0; - result_r = 'x; - tag_out_r = 'x; + valid_out_r = 0; + has_fflags_r = 'x; + result_r = 'x; + tag_out_r = 'x; for (integer i = 0; i < NUM_FPC; i++) begin if (per_core_valid_out[i]) begin per_core_ready_out[i] = 1; - valid_out_r = i; + valid_out_r = 1; has_fflags_r = fpnew_has_fflags && (i == 0); result_r = per_core_result[i]; tag_out_r = per_core_tag_out[i]; diff --git a/hw/rtl/libs/VX_serial_div.v b/hw/rtl/libs/VX_serial_div.v new file mode 100644 index 00000000..94a6a1f7 --- /dev/null +++ b/hw/rtl/libs/VX_serial_div.v @@ -0,0 +1,124 @@ +`include "VX_platform.vh" + +module VX_serial_div #( /* multi-lane serial divider: accepts one request at a time, iterates WIDTHN cycles, then holds the result until it is popped */ + parameter WIDTHN = 1, + parameter WIDTHD = 1, + parameter WIDTHQ = 1, + parameter WIDTHR = 1, + 
parameter LANES = 1, + parameter TAGW = 1 +) ( + input wire clk, + input wire reset, + + input wire valid_in, + output wire ready_in, + input wire [LANES-1:0][WIDTHN-1:0] numer, + input wire [LANES-1:0][WIDTHD-1:0] denom, + input wire signed_mode, + input wire [TAGW-1:0] tag_in, + + output wire [LANES-1:0][WIDTHQ-1:0] quotient, + output wire [LANES-1:0][WIDTHR-1:0] remainder, + input wire ready_out, + output wire valid_out, + output wire [TAGW-1:0] tag_out +); + localparam MIN_ND = (WIDTHN < WIDTHD) ? WIDTHN : WIDTHD; + localparam CNTRW = $clog2(WIDTHN+1); + + reg [LANES-1:0][WIDTHN + MIN_ND:0] working; + reg [LANES-1:0][WIDTHD-1:0] denom_r; + + wire [LANES-1:0][WIDTHN-1:0] numer_qual; + wire [LANES-1:0][WIDTHD-1:0] denom_qual; + wire [LANES-1:0][WIDTHD:0] sub_result; + + reg [LANES-1:0] inv_quot, inv_rem; + + reg [CNTRW-1:0] cntr; + reg is_busy; + + reg [TAGW-1:0] tag_r; + + wire done = ~(| cntr); /* iteration counter has reached zero */ + + wire push = valid_in && ready_in; + wire pop = valid_out && ready_out; + + for (genvar i = 0; i < LANES; ++i) begin + wire negate_numer = signed_mode && numer[i][WIDTHN-1]; + wire negate_denom = signed_mode && denom[i][WIDTHD-1]; + assign numer_qual[i] = (numer[i] ^ {WIDTHN{negate_numer}}) + WIDTHN'(negate_numer); + assign denom_qual[i] = (denom[i] ^ {WIDTHD{negate_denom}}) + WIDTHD'(negate_denom); + assign sub_result[i] = working[i][WIDTHN + MIN_ND : WIDTHN] - denom_r[i]; + end + + always @(posedge clk) begin + if (reset) begin + cntr <= 0; + is_busy <= 0; + end + else begin + if (push) begin + for (integer i = 0; i < LANES; ++i) begin + working[i] <= {{WIDTHD{1'b0}}, numer_qual[i], 1'b0}; + denom_r[i] <= denom_qual[i]; + inv_quot[i] <= (denom[i] != 0) && signed_mode && (numer[i][WIDTHN-1] ^ denom[i][WIDTHD-1]); /* negate the quotient when operand signs differ, never for a zero divisor; sign bits follow the width parameters */ + inv_rem[i] <= signed_mode && numer[i][WIDTHN-1]; /* remainder takes the sign of the numerator */ + end + tag_r <= tag_in; + cntr <= WIDTHN; + is_busy <= 1; + end + else begin + if (!done) begin + for (integer i = 0; i < LANES; ++i) begin + working[i] <= sub_result[i][WIDTHD] ? 
{working[i][WIDTHN+MIN_ND-1:0], 1'b0} : + {sub_result[i][WIDTHD-1:0], working[i][WIDTHN-1:0], 1'b1}; + end + cntr <= cntr - CNTRW'(1); + end + end + if (pop) begin + is_busy <= 0; + end + end + end + + for (genvar i = 0; i < LANES; ++i) begin + assign quotient[i] = (working[i][WIDTHQ-1:0] ^ {WIDTHQ{inv_quot[i]}}) + WIDTHQ'(inv_quot[i]); + assign remainder[i] = (working[i][WIDTHN+WIDTHR:WIDTHN+1] ^ {WIDTHR{inv_rem[i]}}) + WIDTHR'(inv_rem[i]); + end + assign ready_in = !is_busy; /* a new request is accepted only after the previous result has been popped */ + assign tag_out = tag_r; + assign valid_out = is_busy && done; + + /*reg [LANES-1:0][WIDTHQ-1:0] quotient_r; + reg [LANES-1:0][WIDTHR-1:0] remainder_r; + reg [TAGW-1:0] tag_out_r; + reg valid_out_r; + + wire stall_out = !ready_out && valid_out_r; + assign pop = is_busy && done && !stall_out; + + always @(posedge clk) begin + if (reset) begin + valid_out_r <= 0; + end else if (~stall_out) begin + for (integer i = 0; i < LANES; ++i) begin + quotient_r[i] <= (working[i][WIDTHQ-1:0] ^ {WIDTHQ{inv_quot[i]}}) + WIDTHQ'(inv_quot[i]); + remainder_r[i] <= ((working[i][WIDTHN+WIDTHR-1:WIDTHN] >> 1) ^ {WIDTHR{inv_rem[i]}}) + WIDTHR'(inv_rem[i]); + end + tag_out_r <= tag_r; + valid_out_r <= is_busy && done; + end + end + + assign ready_in = !is_busy; + assign quotient = quotient_r; + assign remainder = remainder_r; + assign tag_out = tag_out_r; + assign valid_out = valid_out_r;*/ + +endmodule \ No newline at end of file