diff --git a/rtl/Makefile b/rtl/Makefile
index b00b13dc..af234332 100644
--- a/rtl/Makefile
+++ b/rtl/Makefile
@@ -24,7 +24,7 @@ MAKECPP=(cd obj_dir && make -j -f VVortex.mk)
 # -LDFLAGS '-lsystemc'
 VERILATOR:
 	echo "#define VCD_OFF" > simulate/tb_debug.h
-	verilator $(COMP) -cc $(FILE) $(INCLUDE) $(EXE) $(LIB) $(CF) $(LIGHTW)
+	verilator $(COMP) -cc $(FILE) $(INCLUDE) $(EXE) $(LIB) $(CF)
 
 VERILATORnoWarnings:
 	echo "#define VCD_OFF" > simulate/tb_debug.h
diff --git a/rtl/VX_back_end.v b/rtl/VX_back_end.v
index 0a7b73f8..c5f635b2 100644
--- a/rtl/VX_back_end.v
+++ b/rtl/VX_back_end.v
@@ -4,7 +4,7 @@ module VX_back_end (
 
 	input wire schedule_delay,
 	output wire out_mem_delay,
-
+	output wire gpr_stage_delay,
 
 	VX_jal_response_inter VX_jal_rsp,
 	VX_branch_response_inter VX_branch_rsp,
@@ -37,9 +37,6 @@ VX_mem_req_inter VX_exe_mem_req();
 
 VX_mem_req_inter VX_mem_req();
 
-VX_gpr_data_inter VX_gpr_data();
-
-VX_frE_to_bckE_req_inter VX_bckE_req_out();
 
 // LSU input + output
 VX_lsu_req_inter VX_lsu_req();
@@ -63,18 +60,14 @@ VX_gpr_stage VX_gpr_stage(
 	.schedule_delay (schedule_delay),
 	.VX_writeback_inter(VX_writeback_temp),
 	.VX_bckE_req (VX_bckE_req),
-	.VX_bckE_req_out (VX_bckE_req_out),
-	.VX_gpr_data (VX_gpr_data)
-	);
-
-
-VX_inst_multiplex VX_inst_mult(
-	.VX_bckE_req (VX_bckE_req_out),
-	.VX_gpr_data (VX_gpr_data),
+	// New
 	.VX_exec_unit_req(VX_exec_unit_req),
 	.VX_lsu_req (VX_lsu_req),
 	.VX_gpu_inst_req (VX_gpu_inst_req),
-	.VX_csr_req (VX_csr_req)
+	.VX_csr_req (VX_csr_req),
+	// End new
+	.memory_delay (out_mem_delay),
+	.gpr_stage_delay (gpr_stage_delay)
 	);
 
 
diff --git a/rtl/VX_execute_unit.v b/rtl/VX_execute_unit.v
index 99751fc4..c2008098 100644
--- a/rtl/VX_execute_unit.v
+++ b/rtl/VX_execute_unit.v
@@ -58,9 +58,9 @@ module VX_execute_unit (
 
 	endgenerate
 
-	wire [`NW_M1:0] branch_use_index;
-	wire branch_found_valid;
-	VX_priority_encoder choose_alu_result(
+	wire [$clog2(`NT)-1:0] branch_use_index;
+	wire branch_found_valid;
+	VX_generic_priority_encoder #(.N(`NT)) choose_alu_result(
 		.valids(VX_exec_unit_req.valid),
 		.index (branch_use_index),
 		.found (branch_found_valid)
diff --git a/rtl/VX_generic_priority_encoder.v b/rtl/VX_generic_priority_encoder.v
new file mode 100644
index 00000000..aad7e98c
--- /dev/null
+++ b/rtl/VX_generic_priority_encoder.v
@@ -0,0 +1,26 @@
+// Generic N-input priority encoder.
+// index: position of the lowest-numbered set bit of valids (0 when no bit is set).
+// found: 1 when at least one bit of valids is set, otherwise 0.
+module VX_generic_priority_encoder
+	#(
+	parameter N = 1
+	)
+	(
+		input wire[N-1:0] valids,
+		output reg[$clog2(N)-1:0] index,
+		output reg found
+	);
+
+	integer i;
+	always @(*) begin
+		index = 0;
+		found = 0;
+		// Scan all N bits of valids from the top down; the last assignment wins, so the lowest set bit is reported.
+		for (i = N-1; i >= 0; i = i - 1) begin
+			if (valids[i]) begin
+				index = i[$clog2(N)-1:0];
+				found = 1;
+			end
+		end
+	end
+endmodule
\ No newline at end of file
diff --git a/rtl/VX_gpgpu_inst.v b/rtl/VX_gpgpu_inst.v
index c468dc39..d5c0e7b5 100644
--- a/rtl/VX_gpgpu_inst.v
+++ b/rtl/VX_gpgpu_inst.v
@@ -39,7 +39,9 @@ module VX_gpgpu_inst (
 
 	assign VX_warp_ctl.is_barrier = VX_gpu_inst_req.is_barrier && valid_inst;
 	assign VX_warp_ctl.barrier_id = VX_gpu_inst_req.a_reg_data[0];
-	assign VX_warp_ctl.num_warps = VX_gpu_inst_req.rd2 - 1;
+
+	wire[31:0] num_warps_m1 = VX_gpu_inst_req.rd2 - 1;
+	assign VX_warp_ctl.num_warps = num_warps_m1[$clog2(`NW):0];
 
 	assign VX_warp_ctl.wspawn = wspawn;
 	assign VX_warp_ctl.wspawn_pc = wspawn_pc;
@@ -58,11 +60,7 @@ module VX_gpgpu_inst (
 	end
 
 
-	wire[`NW_M1:0] num_valids;
-	VX_one_counter one_counter(
-		.valids (curr_valids),
-		.ones_found(num_valids)
-		);
+	wire[`NW_M1:0] num_valids = $countones(curr_valids);
 
 	assign VX_warp_ctl.is_split = is_split && (num_valids > 1) && (split_new_use_mask != 0) && (split_new_use_mask != {`NT{1'b1}});
 
diff --git a/rtl/VX_gpr_stage.v b/rtl/VX_gpr_stage.v
index 0e50ebe8..db531ca7 100644
--- a/rtl/VX_gpr_stage.v
+++ b/rtl/VX_gpr_stage.v
@@ -1,7 +1,14 @@
+
+`include "VX_define.v"
+
 module VX_gpr_stage (
 	input wire clk,
 	input wire reset,
 	input wire schedule_delay,
+
+	input wire memory_delay,
+	output wire gpr_stage_delay,
+
 	// inputs
 	// Instruction Information
 	VX_frE_to_bckE_req_inter VX_bckE_req,
@@ -12,16 +19,20 @@ module VX_gpr_stage (
 
 
 	// Outputs
-	// Original Request 1 cycle later
-	VX_frE_to_bckE_req_inter VX_bckE_req_out,
-	// Data Read
-	VX_gpr_data_inter VX_gpr_data
+	VX_exec_unit_req_inter VX_exec_unit_req,
+	VX_lsu_req_inter VX_lsu_req,
+	VX_gpu_inst_req_inter VX_gpu_inst_req,
+	VX_csr_req_inter VX_csr_req
 );
 
 
 	wire[31:0] curr_PC = VX_bckE_req.curr_PC;
 	wire[2:0] branchType = VX_bckE_req.branch_type;
 
+	wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE);
+	wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ);
+
+
 	wire jalQual = VX_bckE_req.jalQual;
 
 	VX_gpr_read_inter VX_gpr_read();
@@ -50,28 +61,93 @@ module VX_gpr_stage (
 	// assign VX_bckE_req.is_csr = is_csr;
 	// assign VX_bckE_req_out.csr_mask = (VX_bckE_req.sr_immed == 1'b1) ? {27'h0, VX_bckE_req.rs1} : VX_gpr_data.a_reg_data[0];
 
-	wire zero_temp = 0;
+	// Outputs
+	VX_exec_unit_req_inter VX_exec_unit_req_temp();
+	VX_lsu_req_inter VX_lsu_req_temp();
+	VX_gpu_inst_req_inter VX_gpu_inst_req_temp();
+	VX_csr_req_inter VX_csr_req_temp();
 
-	VX_generic_register #(.N(256)) reg_data
-	(
-		.clk (clk),
-		.reset(reset),
-		.stall(zero_temp),
-		.flush(zero_temp),
-		.in ({VX_gpr_datf.a_reg_data, VX_gpr_datf.b_reg_data}),
-		.out ({VX_gpr_data.a_reg_data, VX_gpr_data.b_reg_data})
-	);
-
-	wire stall = schedule_delay;
-
-
-	VX_d_e_reg gpr_stage_reg(
-		.clk (clk),
-		.reset (reset),
-		.in_branch_stall (stall),
-		.in_freeze (zero_temp),
-		.VX_frE_to_bckE_req(VX_bckE_req),
-		.VX_bckE_req (VX_bckE_req_out)
+	VX_inst_multiplex VX_inst_mult(
+		.VX_bckE_req (VX_bckE_req),
+		.VX_gpr_data (VX_gpr_datf),
+		.VX_exec_unit_req(VX_exec_unit_req_temp),
+		.VX_lsu_req (VX_lsu_req_temp),
+		.VX_gpu_inst_req (VX_gpu_inst_req_temp),
+		.VX_csr_req (VX_csr_req_temp)
 	);
 
+	wire is_lsu = (|VX_lsu_req_temp.valid);
+
+	wire stall_rest = 0;
+	wire flush_rest = schedule_delay;
+
+
+	wire stall_lsu = is_lsu && memory_delay;
+	wire flush_lsu = schedule_delay && !stall_lsu;
+
+
+	assign gpr_stage_delay = stall_lsu;
+
+
+	VX_generic_register #(.N(308)) lsu_reg(
+		.clk (clk),
+		.reset(reset),
+		.stall(stall_lsu),
+		.flush(flush_lsu),
+		.in ({VX_lsu_req_temp.valid, VX_lsu_req_temp.warp_num, VX_lsu_req_temp.store_data, VX_lsu_req_temp.base_address, VX_lsu_req_temp.offset, VX_lsu_req_temp.mem_read, VX_lsu_req_temp.mem_write, VX_lsu_req_temp.rd, VX_lsu_req_temp.wb}),
+		.out ({VX_lsu_req.valid , VX_lsu_req.warp_num , VX_lsu_req.store_data , VX_lsu_req.base_address , VX_lsu_req.offset , VX_lsu_req.mem_read , VX_lsu_req.mem_write , VX_lsu_req.rd , VX_lsu_req.wb })
+	);
+
+	VX_generic_register #(.N(487)) exec_unit_reg(
+		.clk (clk),
+		.reset(reset),
+		.stall(stall_rest),
+		.flush(flush_rest),
+		.in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.a_reg_data, VX_exec_unit_req_temp.b_reg_data, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}),
+		.out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.a_reg_data , VX_exec_unit_req.b_reg_data , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask })
+	);
+
+	VX_generic_register #(.N(203)) gpu_inst_reg(
+		.clk (clk),
+		.reset(reset),
+		.stall(stall_rest),
+		.flush(flush_rest),
+		.in ({VX_gpu_inst_req_temp.valid, VX_gpu_inst_req_temp.warp_num, VX_gpu_inst_req_temp.is_wspawn, VX_gpu_inst_req_temp.is_tmc, VX_gpu_inst_req_temp.is_split, VX_gpu_inst_req_temp.is_barrier, VX_gpu_inst_req_temp.pc_next, VX_gpu_inst_req_temp.a_reg_data, VX_gpu_inst_req_temp.rd2}),
+		.out ({VX_gpu_inst_req.valid , VX_gpu_inst_req.warp_num , VX_gpu_inst_req.is_wspawn , VX_gpu_inst_req.is_tmc , VX_gpu_inst_req.is_split , VX_gpu_inst_req.is_barrier , VX_gpu_inst_req.pc_next , VX_gpu_inst_req.a_reg_data , VX_gpu_inst_req.rd2 })
+	);
+
+	VX_generic_register #(.N(60)) csr_reg(
+		.clk (clk),
+		.reset(reset),
+		.stall(stall_rest),
+		.flush(flush_rest),
+		.in ({VX_csr_req_temp.valid, VX_csr_req_temp.warp_num, VX_csr_req_temp.rd, VX_csr_req_temp.wb, VX_csr_req_temp.is_csr, VX_csr_req_temp.csr_address, VX_csr_req_temp.csr_immed, VX_csr_req_temp.csr_mask}),
+		.out ({VX_csr_req.valid , VX_csr_req.warp_num , VX_csr_req.rd , VX_csr_req.wb , VX_csr_req.is_csr , VX_csr_req.csr_address , VX_csr_req.csr_immed , VX_csr_req.csr_mask })
+	);
+
+
+	// wire zero_temp = 0;
+
+	// VX_generic_register #(.N(256)) reg_data
+	// (
+	// .clk (clk),
+	// .reset(reset),
+	// .stall(zero_temp),
+	// .flush(zero_temp),
+	// .in ({VX_gpr_datf.a_reg_data, VX_gpr_datf.b_reg_data}),
+	// .out ({VX_gpr_data.a_reg_data, VX_gpr_data.b_reg_data})
+	// );
+
+	// wire stall = schedule_delay;
+
+
+	// VX_d_e_reg gpr_stage_reg(
+	// .clk (clk),
+	// .reset (reset),
+	// .in_branch_stall (stall),
+	// .in_freeze (zero_temp),
+	// .VX_frE_to_bckE_req(VX_bckE_req),
+	// .VX_bckE_req (VX_bckE_req_out)
+	// );
+
 endmodule
\ No newline at end of file
diff --git a/rtl/VX_scheduler.v b/rtl/VX_scheduler.v
index 6d1476d8..0a563201 100644
--- a/rtl/VX_scheduler.v
+++ b/rtl/VX_scheduler.v
@@ -6,6 +6,7 @@ module VX_scheduler (
 	input wire clk,
 	input wire reset,
 	input wire memory_delay,
+	input wire gpr_stage_delay,
 	VX_frE_to_bckE_req_inter VX_bckE_req,
 	VX_wb_inter VX_writeback_inter,
 
@@ -28,14 +29,17 @@ module VX_scheduler (
 	wire rs2_rename = rename_table[VX_bckE_req.rs2];
 
 	wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE);
+	wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ);
+
+	wire is_mem = is_store || is_load;
 
 	wire rs1_rename_qual = (rs1_rename && (VX_bckE_req.rs1 != 0));
-	wire rs2_rename_qual = (rs2_rename && (VX_bckE_req.rs2 != 0) && ((VX_bckE_req.rs2_src == `RS2_REG) || is_store));
+	wire rs2_rename_qual = (rs2_rename && (VX_bckE_req.rs2 != 0) && ((VX_bckE_req.rs2_src == `RS2_REG) || is_store)) || (VX_bckE_req.is_barrier) || (VX_bckE_req.is_wspawn);
 
 	wire rename_valid = rs1_rename_qual || rs2_rename_qual ;
 
 
-	assign schedule_delay = (rename_valid) && (|VX_bckE_req.valid) || memory_delay;
+	assign schedule_delay = (rename_valid) && (|VX_bckE_req.valid) || (memory_delay && (is_mem)) || (gpr_stage_delay && is_mem);
 
 
 	integer i;
diff --git a/rtl/VX_warp_scheduler.v b/rtl/VX_warp_scheduler.v
index d5e09584..8c9a6564 100644
--- a/rtl/VX_warp_scheduler.v
+++ b/rtl/VX_warp_scheduler.v
@@ -18,10 +18,10 @@ module VX_warp_scheduler (
 	input wire whalt,
 	input wire[`NW_M1:0] whalt_warp_num,
 
-	input wire is_barrier,
-	input wire[31:0] barrier_id,
-	input wire[`NW_M1:0] num_warps,
-	input wire[`NW_M1:0] barrier_warp_num,
+	input wire is_barrier,
+	input wire[31:0] barrier_id,
+	input wire[$clog2(`NW):0] num_warps,
+	input wire[`NW_M1:0] barrier_warp_num,
 
 	// WSTALL
 	input wire wstall,
@@ -86,7 +86,7 @@ module VX_warp_scheduler (
 	reg[`NW-1:0] barrier_stall_mask[(`NUM_BARRIERS-1):0];
 	wire reached_barrier_limit;
 	wire[`NW-1:0] curr_barrier_mask;
-	wire[($clog2(`NUM_BARRIERS)-1):0] curr_barrier_count;
+	wire[$clog2(`NW):0] curr_barrier_count;
 
 	// wsapwn
 	reg[31:0] use_wsapwn_pc;
@@ -141,41 +141,35 @@ module VX_warp_scheduler (
 				end else begin
 					barrier_stall_mask[barrier_id][barrier_warp_num] <= 1;
 				end
+			end else if (ctm) begin
+				thread_masks[ctm_warp_num] <= ctm_mask;
+				warp_stalled[ctm_warp_num] <= 0;
+			end else if (is_join) begin
+				if (!join_fall) begin
+					warp_pcs[join_warp_num] <= join_pc;
+				end
+				thread_masks[join_warp_num] <= join_tm;
+			end else if (is_split) begin
+				warp_stalled[split_warp_num] <= 0;
+				thread_masks[split_warp_num] <= split_new_mask;
 			end
-
-			if (update_use_wspawn) begin
-				use_wsapwn[warp_to_schedule] <= 0;
-			end
 
-			// Halting warps
+
 			if (whalt) begin
 				warp_active[whalt_warp_num] <= 0;
 				visible_active[whalt_warp_num] <= 0;
 			end
 
-			// Changing thread masks
-			if (ctm) begin
-				thread_masks[ctm_warp_num] <= ctm_mask;
-				warp_stalled[ctm_warp_num] <= 0;
+			if (update_use_wspawn) begin
+				use_wsapwn[warp_to_schedule] <= 0;
 			end
 
+
 			// Stalling the scheduling of warps
 			if (wstall) begin
 				warp_stalled[wstall_warp_num] <= 1;
 				visible_active[wstall_warp_num] <= 0;
 			end
 
-			if (is_split) begin
-				warp_stalled[split_warp_num] <= 0;
-				thread_masks[split_warp_num] <= split_new_mask;
-			end
-
-			if (is_join) begin
-				if (!join_fall) begin
-					warp_pcs[join_warp_num] <= join_pc;
-				end
-				thread_masks[join_warp_num] <= join_tm;
-			end
-
 			// Refilling active warps
 			if (update_visible_active) begin
 				visible_active <= warp_active & (~warp_stalled) & (~total_barrier_stall);
diff --git a/rtl/Vortex.v b/rtl/Vortex.v
index 1ba83983..07d2b14b 100644
--- a/rtl/Vortex.v
+++ b/rtl/Vortex.v
@@ -58,6 +58,7 @@ VX_warp_ctl_inter VX_warp_ctl();
 
 
 wire memory_delay;
+wire gpr_stage_delay;
 wire schedule_delay;
 
 
@@ -78,6 +79,7 @@ VX_scheduler schedule(
 	.clk (clk),
 	.reset (reset),
 	.memory_delay (memory_delay),
+	.gpr_stage_delay (gpr_stage_delay),
 	.VX_bckE_req (VX_bckE_req),
 	.VX_writeback_inter(VX_writeback_inter),
 	.schedule_delay (schedule_delay)
@@ -94,7 +96,8 @@ VX_back_end vx_back_end(
 	.VX_dcache_rsp (VX_dcache_rsp),
 	.VX_dcache_req (VX_dcache_req),
 	.VX_writeback_inter (VX_writeback_inter),
-	.out_mem_delay (memory_delay)
+	.out_mem_delay (memory_delay),
+	.gpr_stage_delay (gpr_stage_delay)
 	);
 
 // VX_csr_handler vx_csr_handler(
diff --git a/rtl/interfaces/VX_warp_ctl_inter.v b/rtl/interfaces/VX_warp_ctl_inter.v
index a6697de9..958f585b 100644
--- a/rtl/interfaces/VX_warp_ctl_inter.v
+++ b/rtl/interfaces/VX_warp_ctl_inter.v
@@ -18,9 +18,9 @@ interface VX_warp_ctl_inter ();
 	wire ebreak;
 
 	// barrier
-	wire is_barrier;
-	wire[31:0] barrier_id;
-	wire[`NW_M1:0] num_warps;
+	wire is_barrier;
+	wire[31:0] barrier_id;
+	wire[$clog2(`NW):0] num_warps;
 
 	wire is_split;
 	wire[`NW_M1:0] split_warp_num;
diff --git a/syn/syn.tcl b/syn/syn.tcl
index 1877bfa8..eb766360 100755
--- a/syn/syn.tcl
+++ b/syn/syn.tcl
@@ -3,7 +3,7 @@ set link_library [concat * sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_
 set symbol_library {}
 set target_library [concat sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_m40c.db]
 
-set verilog_files [ list VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
+set verilog_files [ list VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
 ]
 
 analyze -format sverilog $verilog_files