diff --git a/kernel/vortex_test.elf b/kernel/vortex_test.elf index 27379902..ad965d57 100755 Binary files a/kernel/vortex_test.elf and b/kernel/vortex_test.elf differ diff --git a/kernel/vx_os/vx_back/vx_back.s b/kernel/vx_os/vx_back/vx_back.s index 1421181d..48656abf 100644 --- a/kernel/vx_os/vx_back/vx_back.s +++ b/kernel/vx_os/vx_back/vx_back.s @@ -8,7 +8,7 @@ _start: # li a0, 4 # la a1, SPAWN -# .word 0x00b5106b +# .word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN) # j SPAWN # nop # nop @@ -21,8 +21,10 @@ _start: # SPAWN: # li a2, 7 # li a0, 0 +# li a1, 4 +# .word 0x00b5406b # barrier a0(barrier id), a1(numWarps) # .word 0x0005006b # tmc a0 - ########################### + ########################## # li a0, 4 # .word 0x0005006b # tmc a0 # csrr a1, 0x20 # read thread IDs diff --git a/rtl/VX_define.v b/rtl/VX_define.v index 344f6129..eb95bddd 100644 --- a/rtl/VX_define.v +++ b/rtl/VX_define.v @@ -10,6 +10,8 @@ // `define ONLY +`define NUM_BARRIERS 4 + `define R_INST 7'd51 `define L_INST 7'd3 `define ALU_INST 7'd19 diff --git a/rtl/VX_fetch.v b/rtl/VX_fetch.v index 62276d2c..0f707061 100644 --- a/rtl/VX_fetch.v +++ b/rtl/VX_fetch.v @@ -30,6 +30,12 @@ module VX_fetch ( .clk (clk), .reset (reset), .stall (pipe_stall), + + .is_barrier (VX_warp_ctl.is_barrier), + .barrier_id (VX_warp_ctl.barrier_id), + .num_warps (VX_warp_ctl.num_warps), + .barrier_warp_num (VX_warp_ctl.warp_num), + // Wspawn .wspawn (VX_warp_ctl.wspawn), .wsapwn_pc (VX_warp_ctl.wspawn_pc), diff --git a/rtl/VX_gpgpu_inst.v b/rtl/VX_gpgpu_inst.v index 9cf8333f..c468dc39 100644 --- a/rtl/VX_gpgpu_inst.v +++ b/rtl/VX_gpgpu_inst.v @@ -37,6 +37,9 @@ module VX_gpgpu_inst ( end + assign VX_warp_ctl.is_barrier = VX_gpu_inst_req.is_barrier && valid_inst; + assign VX_warp_ctl.barrier_id = VX_gpu_inst_req.a_reg_data[0]; + assign VX_warp_ctl.num_warps = VX_gpu_inst_req.rd2 - 1; assign VX_warp_ctl.wspawn = wspawn; assign VX_warp_ctl.wspawn_pc = wspawn_pc; diff --git a/rtl/VX_warp_scheduler.v b/rtl/VX_warp_scheduler.v index d37825c3..d5e09584 100644 --- a/rtl/VX_warp_scheduler.v +++ b/rtl/VX_warp_scheduler.v @@ -18,6 +18,11 @@ module VX_warp_scheduler ( input wire whalt, input wire[`NW_M1:0] whalt_warp_num, + input wire is_barrier, + input wire[31:0] barrier_id, + input wire[`NW_M1:0] num_warps, + input wire[`NW_M1:0] barrier_warp_num, + // WSTALL input wire wstall, input wire[`NW_M1:0] wstall_warp_num, @@ -72,10 +77,16 @@ module VX_warp_scheduler ( reg[`NW-1:0] visible_active; wire[`NW-1:0] use_active; + wire wstall_this_cycle; reg[`NT_M1:0] thread_masks[`NW-1:0]; reg[31:0] warp_pcs[`NW-1:0]; + // barriers + reg[`NW-1:0] barrier_stall_mask[(`NUM_BARRIERS-1):0]; + wire reached_barrier_limit; + wire[`NW-1:0] curr_barrier_mask; + wire[($clog2(`NUM_BARRIERS)-1):0] curr_barrier_count; // wsapwn reg[31:0] use_wsapwn_pc; @@ -91,27 +102,23 @@ module VX_warp_scheduler ( wire[31:0] new_pc; + reg[`NW-1:0] total_barrier_stall; + /* verilator lint_off UNUSED */ wire[`NW_M1:0] num_active; /* verilator lint_on UNUSED */ - reg[1:0] start; - // initial begin - // warp_pcs[0] = (32'h80000000 - 4); - // start = 0; - // warp_active[0] = 1; // Activating first warp - // visible_active[0] = 1; // Activating first warp - // thread_masks[0][0] = 1; // Activating first thread in first warp - // end - integer curr_w_help; always @(posedge clk or posedge reset) begin if (reset) begin - start <= 0; - warp_pcs[0] <= (32'h80000000 - 4); - warp_active[0] <= 1; // Activating first warp - visible_active[0] <= 1; // Activating first warp - thread_masks[0] <= 1; // Activating first thread in first warp + barrier_stall_mask[0] <= 0; + barrier_stall_mask[1] <= 0; + use_wsapwn_pc <= 0; + use_wsapwn <= 0; + warp_pcs[0] <= (32'h80000000 - 4); + warp_active[0] <= 1; // Activating first warp + visible_active[0] <= 1; // Activating first warp + thread_masks[0] <= 1; // Activating first thread in first warp for (curr_w_help = 1; curr_w_help < `NW; curr_w_help=curr_w_help+1) begin warp_pcs[curr_w_help] <= 0; warp_active[curr_w_help] <= 0; // Activating first warp @@ -127,6 +134,15 @@ module VX_warp_scheduler ( use_wsapwn <= wspawn_new_active & (~`NW'b1); end + if (is_barrier) begin + warp_stalled[barrier_warp_num] <= 0; + if (reached_barrier_limit) begin + barrier_stall_mask[barrier_id] <= 0; + end else begin + barrier_stall_mask[barrier_id][barrier_warp_num] <= 1; + end + end + if (update_use_wspawn) begin use_wsapwn[warp_to_schedule] <= 0; end @@ -162,7 +178,7 @@ module VX_warp_scheduler ( // Refilling active warps if (update_visible_active) begin - visible_active <= warp_active & (~warp_stalled); + visible_active <= warp_active & (~warp_stalled) & (~total_barrier_stall); end // Don't change state if stall @@ -185,8 +201,23 @@ module VX_warp_scheduler ( end end + assign curr_barrier_mask = barrier_stall_mask[barrier_id][`NW-1:0]; + assign curr_barrier_count = $countones(curr_barrier_mask); + assign reached_barrier_limit = curr_barrier_count == (num_warps); - assign update_visible_active = ($countones(visible_active) < 1) && !(stall || wstall || hazard || is_join); + assign wstall_this_cycle = wstall && (wstall_warp_num == warp_to_schedule); // Maybe bug + + genvar curr_b; + always @(*) begin + total_barrier_stall = 0; + for (curr_b = 0; curr_b < `NUM_BARRIERS; curr_b=curr_b+1) + begin + total_barrier_stall[`NW-1:0] = total_barrier_stall[`NW-1:0] | barrier_stall_mask[curr_b[($clog2(`NUM_BARRIERS)-1):0]][`NW-1:0]; + end + end + + + assign update_visible_active = ($countones(visible_active) < 1) && !(stall || wstall_this_cycle || hazard || is_join); wire[(1+32+`NT_M1):0] q1 = {1'b1, 32'b0 , thread_masks[split_warp_num]}; wire[(1+32+`NT_M1):0] q2 = {1'b0, split_save_pc , split_later_mask}; @@ -221,9 +252,9 @@ module VX_warp_scheduler ( assign hazard = (should_jal || should_bra) && schedule; - assign real_schedule = schedule && !warp_stalled[warp_to_schedule]; + assign real_schedule = schedule && !warp_stalled[warp_to_schedule] && !total_barrier_stall[warp_to_schedule]; - assign global_stall = (stall || wstall || hazard || !real_schedule || is_join); + assign global_stall = (stall || wstall_this_cycle || hazard || !real_schedule || is_join); wire real_use_wspawn = use_wsapwn[warp_to_schedule]; @@ -237,7 +268,7 @@ module VX_warp_scheduler ( assign new_pc = warp_pc + 4; - assign use_active = (num_active < 1) ? (warp_active & (~warp_stalled)) : visible_active; + assign use_active = (num_active < 1) ? (warp_active & (~warp_stalled) & (~total_barrier_stall)) : visible_active; // Choosing a warp to schedule VX_priority_encoder choose_schedule( diff --git a/rtl/interfaces/VX_gpu_inst_req_inter.v b/rtl/interfaces/VX_gpu_inst_req_inter.v index 8ce4b343..1d24c960 100644 --- a/rtl/interfaces/VX_gpu_inst_req_inter.v +++ b/rtl/interfaces/VX_gpu_inst_req_inter.v @@ -11,6 +11,7 @@ interface VX_gpu_inst_req_inter(); wire is_wspawn; wire is_tmc; wire is_split; + wire is_barrier; wire[31:0] pc_next; diff --git a/rtl/interfaces/VX_warp_ctl_inter.v b/rtl/interfaces/VX_warp_ctl_inter.v index 84979972..a6697de9 100644 --- a/rtl/interfaces/VX_warp_ctl_inter.v +++ b/rtl/interfaces/VX_warp_ctl_inter.v @@ -17,6 +17,10 @@ interface VX_warp_ctl_inter (); wire ebreak; + // barrier + wire is_barrier; + wire[31:0] barrier_id; + wire[`NW_M1:0] num_warps; wire is_split; wire[`NW_M1:0] split_warp_num;