diff --git a/rtl/VX_decode.v b/rtl/VX_decode.v index 88fc9884..5b179382 100644 --- a/rtl/VX_decode.v +++ b/rtl/VX_decode.v @@ -8,6 +8,7 @@ module VX_decode( // Outputs VX_frE_to_bckE_req_inter VX_frE_to_bckE_req, VX_wstall_inter VX_wstall, + VX_join_inter VX_join, output wire out_ebreak ); @@ -118,6 +119,11 @@ module VX_decode( assign is_split = is_gpgpu && (func3 == 2); // Goes to BE assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE + + assign VX_join.is_join = is_join; + assign VX_join.join_warp_num = in_warp_num; + + assign VX_frE_to_bckE_req.is_wspawn = is_wspawn; assign VX_frE_to_bckE_req.is_tmc = is_tmc; assign VX_frE_to_bckE_req.is_split = is_split; @@ -283,7 +289,7 @@ module VX_decode( assign VX_frE_to_bckE_req.branch_type = temp_branch_type; - assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_join || is_barrier) && (|in_valid); + assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_barrier) && (|in_valid); assign VX_wstall.warp_num = in_warp_num; always @(*) begin diff --git a/rtl/VX_fetch.v b/rtl/VX_fetch.v index 6ef5539b..927e288b 100644 --- a/rtl/VX_fetch.v +++ b/rtl/VX_fetch.v @@ -4,6 +4,7 @@ module VX_fetch ( input wire clk, VX_wstall_inter VX_wstall, + VX_join_inter VX_join, input wire schedule_delay, VX_icache_response_inter icache_response, VX_icache_request_inter icache_request, @@ -42,11 +43,16 @@ module VX_fetch ( .wstall (VX_wstall.wstall), .wstall_warp_num(VX_wstall.warp_num), + // Join + .is_join (VX_join.is_join), + .join_warp_num (VX_join.join_warp_num), + // Split .is_split (VX_warp_ctl.is_split), .split_new_mask (VX_warp_ctl.split_new_mask), .split_later_mask(VX_warp_ctl.split_later_mask), .split_save_pc (VX_warp_ctl.split_save_pc), + .split_warp_num (VX_warp_ctl.warp_num), // JAL .jal (VX_jal_rsp.jal), diff --git a/rtl/VX_front_end.v b/rtl/VX_front_end.v index ce6f7716..196ee71b 100644 --- a/rtl/VX_front_end.v +++ b/rtl/VX_front_end.v @@ -37,10 +37,12 @@ wire real_fetch_ebreak; VX_wstall_inter VX_wstall(); +VX_join_inter VX_join(); VX_fetch vx_fetch( .clk (clk), .VX_wstall (VX_wstall), + .VX_join (VX_join), .schedule_delay (schedule_delay), .VX_jal_rsp (VX_jal_rsp), .icache_response (icache_response_fe), @@ -65,6 +67,7 @@ VX_decode vx_decode( .fd_inst_meta_de (fd_inst_meta_de), .VX_frE_to_bckE_req(VX_frE_to_bckE_req), .VX_wstall (VX_wstall), + .VX_join (VX_join), .out_ebreak (fetch_ebreak) ); diff --git a/rtl/VX_generic_stack.v b/rtl/VX_generic_stack.v index de5f0529..8bbf2ba8 100644 --- a/rtl/VX_generic_stack.v +++ b/rtl/VX_generic_stack.v @@ -5,11 +5,11 @@ module VX_generic_stack ) ( input wire clk, - input wire reset, input wire push, input wire pop, - input wire[WIDTH - 1:0] d, - output reg [WIDTH - 1:0] q, + input reg [WIDTH - 1:0] q1, + input reg [WIDTH - 1:0] q2, + output wire[WIDTH - 1:0] d ); @@ -17,24 +17,22 @@ module VX_generic_stack reg [WIDTH - 1:0] stack [0:(1 << DEPTH) - 1]; always @(posedge clk) begin - if (reset) - ptr <= 0; - else if (push) - ptr <= ptr + 1; + // if (reset) + // ptr <= 0; + // else + if (push) + ptr <= ptr + 2; else if (pop) ptr <= ptr - 1; end always @(posedge clk) begin if (push) begin - if(push) - stack[ptr] <= q; + stack[ptr] <= q1; + stack[ptr+1] <= q2; end end - always @(*) begin - if (pop) - q <= stack[ptr - 1]; - end + assign d = stack[ptr - 1]; endmodule \ No newline at end of file diff --git a/rtl/VX_gpgpu_inst.v b/rtl/VX_gpgpu_inst.v index 694bc630..e9901625 100644 --- a/rtl/VX_gpgpu_inst.v +++ b/rtl/VX_gpgpu_inst.v @@ -32,7 +32,7 @@ module VX_gpgpu_inst ( // VX_gpu_inst_req.pc genvar curr_s_t; for (curr_s_t = 0; curr_s_t < `NT; curr_s_t=curr_s_t+1) begin - wire curr_bool = (VX_gpu_inst_req.a_reg_data == 32'b1); + wire curr_bool = (VX_gpu_inst_req.a_reg_data[curr_s_t] == 32'b1); assign split_new_use_mask[curr_s_t] = VX_gpu_inst_req.valid[curr_s_t] & (curr_bool); assign split_new_later_mask[curr_s_t] = VX_gpu_inst_req.valid[curr_s_t] & (!curr_bool); @@ -43,7 +43,7 @@ module VX_gpgpu_inst ( always @(*) begin num_valids = 0; for (z = 0; z < `NT; z=z+1) begin - if (VX_gpu_inst_req.valid) num_valids = num_valids + 1 + if (VX_gpu_inst_req.valid[z]) num_valids = num_valids + 1; end end @@ -51,6 +51,7 @@ module VX_gpgpu_inst ( assign VX_warp_ctl.split_new_mask = split_new_use_mask; assign VX_warp_ctl.split_later_mask = split_new_later_mask; assign VX_warp_ctl.split_save_pc = VX_gpu_inst_req.pc_next; + assign VX_warp_ctl.split_warp_num = VX_gpu_inst_req.warp_num; // VX_gpu_inst_req.is_wspawn // VX_gpu_inst_req.is_split diff --git a/rtl/VX_warp_scheduler.v b/rtl/VX_warp_scheduler.v index 40e99b5b..e4f2abd7 100644 --- a/rtl/VX_warp_scheduler.v +++ b/rtl/VX_warp_scheduler.v @@ -21,10 +21,15 @@ module VX_warp_scheduler ( input wire[`NW_M1:0] wstall_warp_num, // Split - input wire is_split, - input wire[`NT_M1:0] split_new_mask, - input wire[`NT_M1:0] split_later_mask, - input wire[31:0] split_save_pc, + input wire is_split, + input wire[`NT_M1:0] split_new_mask, + input wire[`NT_M1:0] split_later_mask, + input wire[31:0] split_save_pc, + input wire[`NW_M1:0] split_warp_num, + + // Join + input wire is_join, + input wire[`NW_M1:0] join_warp_num, // JAL input wire jal, @@ -114,9 +119,20 @@ module VX_warp_scheduler ( visible_active[wstall_warp_num] <= 0; end + if (is_split) begin + warp_stalled[split_warp_num] <= 0; + thread_masks[split_warp_num] <= split_new_mask; + end + + if (is_join) begin + if (!join_fall) begin + warp_pcs[join_warp_num] <= join_pc; + end + thread_masks[join_warp_num] <= join_tm; + end + // Refilling active warps - if ((visible_active == 0) && !(stall || wstall || hazard)) begin - // if ((num_active <= 1) && !(globa)) begin + if ((visible_active == 0) && !(stall || wstall || hazard || is_join)) begin visible_active <= warp_active & (~warp_stalled); end @@ -145,8 +161,36 @@ module VX_warp_scheduler ( end end + wire[(1+32+`NT_M1):0] q1 = {1'b1, warp_pcs[split_warp_num], thread_masks[split_warp_num]}; + wire[(1+32+`NT_M1):0] q2 = {1'b0, split_save_pc , split_later_mask}; + wire[(1+32+`NT_M1):0] d; + + wire join_fall; + wire[31:0] join_pc; + wire[`NT_M1:0] join_tm; + + assign {join_fall, join_pc, join_tm} = d; + + + + genvar curr_warp; + for (curr_warp = 0; curr_warp < `NW; curr_warp = curr_warp + 1) begin + wire correct_warp_s = (curr_warp == split_warp_num); + wire correct_warp_j = (curr_warp == join_warp_num); + + wire push = is_split && correct_warp_s; + wire pop = is_join && correct_warp_j; + VX_generic_stack #(.WIDTH(1+32+`NT), .DEPTH($clog2(`NT))) ipdom_stack( + .clk (clk), + .push (push), + .pop (pop), + .d (d), + .q1 (q1), + .q2 (q2) + ); + end // wire should_stall = stall || (jal && (warp_to_schedule == jal_warp_num)) || (branch_dir && (warp_to_schedule == branch_warp_num)); @@ -157,7 +201,7 @@ module VX_warp_scheduler ( assign real_schedule = schedule && !warp_stalled[warp_to_schedule]; - assign global_stall = (stall || wstall || hazard || !real_schedule); + assign global_stall = (stall || wstall || hazard || !real_schedule || is_join); assign warp_pc = warp_pcs[warp_to_schedule]; diff --git a/rtl/interfaces/VX_gpu_inst_req_inter.v b/rtl/interfaces/VX_gpu_inst_req_inter.v index 66b4c191..8ce4b343 100644 --- a/rtl/interfaces/VX_gpu_inst_req_inter.v +++ b/rtl/interfaces/VX_gpu_inst_req_inter.v @@ -13,7 +13,7 @@ interface VX_gpu_inst_req_inter(); wire is_split; wire is_barrier; - wire pc_next; + wire[31:0] pc_next; wire[`NT_M1:0][31:0] a_reg_data; wire[31:0] rd2; diff --git a/rtl/interfaces/VX_join_inter.v b/rtl/interfaces/VX_join_inter.v new file mode 100644 index 00000000..a465bf65 --- /dev/null +++ b/rtl/interfaces/VX_join_inter.v @@ -0,0 +1,17 @@ + +`include "../VX_define.v" + +`ifndef VX_JOIN_INTER + +`define VX_JOIN_INTER + +interface VX_join_inter (); + + wire is_join; + wire[`NW_M1:0] join_warp_num; + + +endinterface + + +`endif \ No newline at end of file diff --git a/rtl/interfaces/VX_warp_ctl_inter.v b/rtl/interfaces/VX_warp_ctl_inter.v index a25d7aa0..cbd5aafc 100644 --- a/rtl/interfaces/VX_warp_ctl_inter.v +++ b/rtl/interfaces/VX_warp_ctl_inter.v @@ -18,6 +18,7 @@ interface VX_warp_ctl_inter (); wire is_split; + wire[`NW_M1:0] split_warp_num; wire[`NT_M1:0] split_new_mask; wire[`NT_M1:0] split_later_mask; wire[31:0] split_save_pc; diff --git a/rtl/results.txt b/rtl/results.txt index ff40d42b..acfac43f 100644 --- a/rtl/results.txt +++ b/rtl/results.txt @@ -3,5 +3,5 @@ # of forwarding stalls: 0 # of branch stalls: 0 # CPI: 2.08333 -# time to simulate: 6.95312e-310 milliseconds +# time to simulate: 0 milliseconds # GRADE: Failed on test: 4294967295 diff --git a/rtl/simulate/tb_debug.h b/rtl/simulate/tb_debug.h index 6aae22b3..711663cc 100644 --- a/rtl/simulate/tb_debug.h +++ b/rtl/simulate/tb_debug.h @@ -1 +1 @@ -#define VCD_OUTPUT +#define VCD_OFF diff --git a/syn/syn.tcl b/syn/syn.tcl index be8d09a1..fb5f6e33 100755 --- a/syn/syn.tcl +++ b/syn/syn.tcl @@ -3,7 +3,7 @@ set link_library [concat * sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_ set symbol_library {} set target_library [concat sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_m40c.db] -set verilog_files [ list VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \ +set verilog_files [ list VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \ ] analyze -format sverilog $verilog_files