From b211b29670ac80f35336c3acee5d88e6d651bf1a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 25 Aug 2020 14:02:35 -0700 Subject: [PATCH] removing pipeline additional registers --- hw/rtl/VX_alu_unit.v | 92 ++++++++++++------------------- hw/rtl/VX_gpu_unit.v | 33 +++++------ hw/rtl/VX_instr_demux.v | 26 +++++---- hw/rtl/VX_scoreboard.v | 6 +- hw/rtl/fp_cores/VX_fp_fpga.v | 2 +- hw/rtl/fp_cores/VX_fp_noncomp.v | 2 +- hw/rtl/interfaces/VX_alu_req_if.v | 1 + hw/rtl/interfaces/VX_gpu_req_if.v | 1 + hw/simulate/testbench.cpp | 2 +- 9 files changed, 74 insertions(+), 91 deletions(-) diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 19b0df48..b813c882 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -19,19 +19,6 @@ module VX_alu_unit #( reg [`NUM_THREADS-1:0][31:0] shift_result; reg [`NUM_THREADS-1:0][31:0] misc_result; - wire valid_r; - wire [`NW_BITS-1:0] wid_r; - wire [`NUM_THREADS-1:0] thread_mask_r; - wire [31:0] curr_PC_r; - wire [`NR_BITS-1:0] rd_r; - wire wb_r; - wire [`NT_BITS-1:0] tid_r; - wire is_sub_r; - wire [`BR_BITS-1:0] br_op_r; - wire is_br_op_r, is_br_op_s; - wire [1:0] alu_op_class_r; - wire [31:0] next_PC_r; - wire is_br_op = alu_req_if.is_br_op; wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op_type); wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.op_type); @@ -47,16 +34,16 @@ module VX_alu_unit #( wire [`NUM_THREADS-1:0][31:0] alu_in2_less = (alu_req_if.rs2_is_imm && ~is_br_op) ? 
{`NUM_THREADS{alu_req_if.imm}} : alu_in2; for (genvar i = 0; i < `NUM_THREADS; i++) begin - always @(posedge clk) begin - add_result[i] <= alu_in1_PC[i] + alu_in2_imm[i]; + always @(*) begin + add_result[i] = alu_in1_PC[i] + alu_in2_imm[i]; end end for (genvar i = 0; i < `NUM_THREADS; i++) begin wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]}; wire [32:0] sub_in2 = {alu_signed & alu_in2_less[i][31], alu_in2_less[i]}; - always @(posedge clk) begin - sub_result[i] <= $signed(sub_in1) - $signed(sub_in2); + always @(*) begin + sub_result[i] = $signed(sub_in1) - $signed(sub_in2); end end @@ -65,79 +52,70 @@ module VX_alu_unit #( `IGNORE_WARNINGS_BEGIN wire [32:0] shift_value = $signed(shift_in1) >>> alu_in2_imm[i][4:0]; `IGNORE_WARNINGS_END - always @(posedge clk) begin - shift_result[i] <= shift_value[31:0]; + always @(*) begin + shift_result[i] = shift_value[31:0]; end end for (genvar i = 0; i < `NUM_THREADS; i++) begin - always @(posedge clk) begin + always @(*) begin case (alu_op) - `ALU_AND: misc_result[i] <= alu_in1[i] & alu_in2_imm[i]; - `ALU_OR: misc_result[i] <= alu_in1[i] | alu_in2_imm[i]; - `ALU_XOR: misc_result[i] <= alu_in1[i] ^ alu_in2_imm[i]; + `ALU_AND: misc_result[i] = alu_in1[i] & alu_in2_imm[i]; + `ALU_OR: misc_result[i] = alu_in1[i] | alu_in2_imm[i]; + `ALU_XOR: misc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; //`ALU_SLL, - default: misc_result[i] <= alu_in1[i] << alu_in2_imm[i][4:0]; + default: misc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0]; endcase end end - - wire [31:0] next_PC = alu_req_if.curr_PC + 4; - - VX_shift_register #( - .DATAW(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `NT_BITS + 1 + 1 + `BR_BITS + 2 + 32), - .DEPTH(1) - ) alu_shift_reg ( - .clk(clk), - .reset(reset), - .enable(alu_req_if.ready), - .in({alu_req_if.valid, alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, is_sub, is_br_op, br_op, alu_op_class, next_PC}), - .out({valid_r, wid_r, thread_mask_r, 
curr_PC_r, rd_r, wb_r, tid_r, is_sub_r, is_br_op_r, br_op_r, alu_op_class_r, next_PC_r}) - ); for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin - case (alu_op_class_r) - 0: alu_result[i] = is_sub_r ? sub_result[i][31:0] : add_result[i]; + case (alu_op_class) + 0: alu_result[i] = is_sub ? sub_result[i][31:0] : add_result[i]; 1: alu_result[i] = {31'b0, sub_result[i][32]}; 2: alu_result[i] = shift_result[i]; default: alu_result[i] = misc_result[i]; endcase end end - - // branch handling - - wire br_neg = `BR_NEG(br_op_r); - wire br_less = `BR_LESS(br_op_r); - wire br_static = `BR_STATIC(br_op_r); - wire is_jal = is_br_op_r && (br_op_r == `BR_JAL || br_op_r == `BR_JALR); - - wire [31:0] br_dest = add_result[tid_r]; - wire [32:0] cmp_result = sub_result[tid_r]; - wire is_less = cmp_result[32]; - wire is_equal = ~(| cmp_result[31:0]); - wire br_taken = ((br_less ? is_less : is_equal) ^ br_neg) | br_static; - wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{next_PC_r}} : alu_result; + wire is_jal = is_br_op && (br_op == `BR_JAL || br_op == `BR_JALR); + wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? 
{`NUM_THREADS{alu_req_if.next_PC}} : alu_result; + + wire [31:0] br_dest = add_result[alu_req_if.tid]; + wire [32:0] cmp_result = sub_result[alu_req_if.tid]; + wire is_less = cmp_result[32]; + wire is_equal = ~(| cmp_result[31:0]); + + wire is_br_op_r, is_less_r, is_equal_r; +`IGNORE_WARNINGS_BEGIN + wire [`BR_BITS-1:0] br_op_r; +`IGNORE_WARNINGS_END // output wire stall_out = ~alu_commit_if.ready && alu_commit_if.valid; VX_generic_register #( - .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + 1 + 32) + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `BR_BITS + 1 + 1 + 32) ) alu_reg ( .clk (clk), .reset (reset), .stall (stall_out), .flush (1'b0), - .in ({valid_r, wid_r, thread_mask_r, curr_PC_r, rd_r, wb_r, alu_jal_result, is_br_op_r, br_taken, br_dest}), - .out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.thread_mask, alu_commit_if.curr_PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_s, branch_ctl_if.taken, branch_ctl_if.dest}) + .in ({alu_req_if.valid, alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), + .out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.thread_mask, alu_commit_if.curr_PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, br_op_r, is_less_r, is_equal_r, branch_ctl_if.dest}) ); - assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_s; + wire br_neg = `BR_NEG(br_op_r); + wire br_less = `BR_LESS(br_op_r); + wire br_static = `BR_STATIC(br_op_r); + wire br_taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static; + + assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_r; assign branch_ctl_if.wid = alu_commit_if.wid; + assign branch_ctl_if.taken = br_taken; // can accept new request? 
assign alu_req_if.ready = ~stall_out; diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 44175a4a..3f39d874 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -13,6 +13,9 @@ module VX_gpu_unit #( VX_warp_ctl_if warp_ctl_if, VX_exu_to_cmt_if gpu_commit_if ); + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + gpu_tmc_t tmc; gpu_wspawn_t wspawn; gpu_barrier_t barrier; @@ -58,7 +61,7 @@ module VX_gpu_unit #( assign split.diverged = (| split_then_mask) && (| split_else_mask); assign split.then_mask = split_then_mask; assign split.else_mask = split_else_mask; - assign split.pc = gpu_req_if.curr_PC + 4; + assign split.pc = gpu_req_if.next_PC; // barrier @@ -68,23 +71,21 @@ module VX_gpu_unit #( // output - wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid; + assign warp_ctl_if.valid = gpu_req_if.valid && gpu_commit_if.ready; + assign warp_ctl_if.wid = gpu_commit_if.wid; + assign warp_ctl_if.tmc = tmc; + assign warp_ctl_if.wspawn = wspawn; + assign warp_ctl_if.split = split; + assign warp_ctl_if.barrier = barrier; - VX_generic_register #( - .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + $bits(gpu_tmc_t) + $bits(gpu_wspawn_t) + $bits(gpu_split_t) + $bits(gpu_barrier_t)) - ) gpu_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (1'b0), - .in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}), - .out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.thread_mask, gpu_commit_if.curr_PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) - ); - - assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready; - assign warp_ctl_if.wid = gpu_commit_if.wid; + assign gpu_commit_if.valid = gpu_req_if.valid; + assign gpu_commit_if.wid = gpu_req_if.wid; + assign gpu_commit_if.thread_mask = gpu_req_if.thread_mask; + assign gpu_commit_if.curr_PC = gpu_req_if.curr_PC; + assign 
gpu_commit_if.rd = gpu_req_if.rd; + assign gpu_commit_if.wb = gpu_req_if.wb; // can accept new request? - assign gpu_req_if.ready = ~stall; + assign gpu_req_if.ready = gpu_commit_if.ready; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 0755b69f..44fb1e07 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -17,12 +17,6 @@ module VX_instr_demux ( VX_fpu_req_if fpu_req_if, VX_gpu_req_if gpu_req_if ); - // ALU unit - - wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU); - wire alu_req_ready; - wire is_br_op = `IS_BR_MOD(execute_if.op_mod); - wire [`NT_BITS-1:0] tid; VX_priority_encoder #( .N(`NUM_THREADS) @@ -32,15 +26,23 @@ module VX_instr_demux ( `UNUSED_PIN (valid_out) ); + wire [31:0] next_PC = execute_if.curr_PC + 4; + + // ALU unit + + wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU); + wire alu_req_ready; + wire is_br_op = `IS_BR_MOD(execute_if.op_mod); + VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS) ) alu_reg ( .clk (clk), .reset (reset), .ready_in (alu_req_ready), .valid_in (alu_req_valid), - .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}), - .data_out ({alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, next_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}), + .data_out ({alu_req_if.wid, 
alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}), .ready_out (alu_req_if.ready), .valid_out (alu_req_if.valid) ); @@ -196,14 +198,14 @@ module VX_instr_demux ( wire gpu_req_ready; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `GPU_BITS + `NR_BITS + 1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1) ) gpu_reg ( .clk (clk), .reset (reset), .ready_in (gpu_req_ready), .valid_in (gpu_req_valid), - .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}), - .data_out ({gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb}), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}), + .data_out ({gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb}), .ready_out (gpu_req_if.ready), .valid_out (gpu_req_if.valid) ); diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 039843e7..1e31973e 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -16,9 +16,9 @@ module VX_scoreboard #( reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0]; reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; - wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs; + wire [`NUM_REGS-1:0] inuse_regs = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs; - assign delay = (| inuse_mask); + assign delay = (| inuse_regs); wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0); @@ -55,7 +55,7 @@ module VX_scoreboard #( if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin $display("%t: core%0d-stall: wid=%0d, PC=%0h, 
rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b, gpr=%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.curr_PC, ibuf_deq_if.rd, ibuf_deq_if.wb, - inuse_mask[ibuf_deq_if.rd], inuse_mask[ibuf_deq_if.rs1], inuse_mask[ibuf_deq_if.rs2], inuse_mask[ibuf_deq_if.rs3], exe_delay, gpr_delay); + inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3], exe_delay, gpr_delay); end end `endif diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index 3660f897..2097ffd1 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -284,7 +284,7 @@ module VX_fp_fpga #( tag_out_r = 'x; for (integer i = 0; i < NUM_FPC; i++) begin if (per_core_valid_out[i]) begin - per_core_ready_out[i] = 1; + per_core_ready_out[i] = ready_out; valid_out_r = 1; has_fflags_r = fpnew_has_fflags && (i == 0); result_r = per_core_result[i]; diff --git a/hw/rtl/fp_cores/VX_fp_noncomp.v b/hw/rtl/fp_cores/VX_fp_noncomp.v index a8b21853..45f7335b 100644 --- a/hw/rtl/fp_cores/VX_fp_noncomp.v +++ b/hw/rtl/fp_cores/VX_fp_noncomp.v @@ -82,7 +82,7 @@ module VX_fp_noncomp #( .o_type(tmp_b_type) ); - wire tmp_a_smaller = (dataa[i] < datab[i]) ^ (tmp_a_sign || tmp_b_sign); + wire tmp_a_smaller = (dataa[i] < datab[i]) ^ (tmp_a_sign || tmp_b_sign); wire tmp_ab_equal = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]); always @(posedge clk) begin diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v index 7c32345b..bb964249 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.v +++ b/hw/rtl/interfaces/VX_alu_req_if.v @@ -10,6 +10,7 @@ interface VX_alu_req_if (); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; + wire [31:0] next_PC; wire [`ALU_BR_BITS-1:0] op_type; wire is_br_op; wire rs1_is_PC; diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 42115cd8..ef55c442 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ 
b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -10,6 +10,7 @@ interface VX_gpu_req_if(); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; + wire [31:0] next_PC; wire [`GPU_BITS-1:0] op_type; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [31:0] rs2_data; diff --git a/hw/simulate/testbench.cpp b/hw/simulate/testbench.cpp index 18c1c887..554974a5 100644 --- a/hw/simulate/testbench.cpp +++ b/hw/simulate/testbench.cpp @@ -3,7 +3,7 @@ #include #include -#define ALL_TESTS +//#define ALL_TESTS int main(int argc, char **argv) { if (argc == 1) {