From 8318aff69f70ecd34aab05667effcf525825c2e4 Mon Sep 17 00:00:00 2001 From: wgulian3 Date: Thu, 13 Feb 2020 13:17:46 -0500 Subject: [PATCH] Support exec multi-cycle for div/mul --- rtl/Makefile | 4 ++-- rtl/VX_alu.v | 42 ++++++++++++++++++++++++++++------- rtl/VX_back_end.v | 11 +++++++--- rtl/VX_execute_unit.v | 20 +++++++++++++---- rtl/VX_gpr_stage.v | 16 +++++++++----- rtl/VX_scheduler.v | 11 ++++++++-- rtl/VX_writeback.v | 51 +++++++++++++++---------------------------- rtl/Vortex.v | 3 +++ 8 files changed, 100 insertions(+), 58 deletions(-) diff --git a/rtl/Makefile b/rtl/Makefile index 4f511fdd..8279f1c9 100644 --- a/rtl/Makefile +++ b/rtl/Makefile @@ -3,7 +3,7 @@ all: RUNFILE # /rf2_256x128_wm1/ BaseMEM=../models/memory/cln28hpm -INCLUDE=-I. -Ishared_memory -Icache -I$(BaseMEM)/rf2_128x128_wm1/ -I$(BaseMEM)/rf2_256x128_wm1/ -I$(BaseMEM)/rf2_256x19_wm0/ -I$(BaseMEM)/rf2_32x128_wm1/ -Iinterfaces/ -Ipipe_regs/ -Isimulate +INCLUDE=-I. -Ishared_memory -Icache -I$(BaseMEM)/rf2_128x128_wm1/ -I$(BaseMEM)/rf2_256x128_wm1/ -I$(BaseMEM)/rf2_256x19_wm0/ -I$(BaseMEM)/rf2_32x128_wm1/ -Iinterfaces/ -Ipipe_regs/ -Icompat/ -Isimulate FILE=Vortex.v @@ -49,4 +49,4 @@ w: VERILATORnoWarnings $(MAKECPP) clean: - rm obj_dir/* \ No newline at end of file + rm obj_dir/* diff --git a/rtl/VX_alu.v b/rtl/VX_alu.v index e4e7a86f..633a87ab 100644 --- a/rtl/VX_alu.v +++ b/rtl/VX_alu.v @@ -1,6 +1,8 @@ `include "VX_define.v" module VX_alu( + input wire clk, + input wire reset, input wire[31:0] in_1, input wire[31:0] in_2, input wire in_rs2_src, @@ -8,9 +10,11 @@ module VX_alu( input wire[19:0] in_upper_immed, input wire[4:0] in_alu_op, input wire[31:0] in_curr_PC, - output reg[31:0] out_alu_result + output reg[31:0] out_alu_result, + output reg out_alu_stall ); + localparam div_pipeline_len = 3; `ifdef SYN_FUNC wire which_in2; @@ -25,23 +29,25 @@ module VX_alu( wire[31:0] signed_div_result; wire[31:0] signed_rem_result; + reg [15:0] inst_delay; + reg [15:0] inst_delay_count; + + assign out_alu_stall = inst_delay != 0 || inst_delay_count != 0; assign which_in2 = in_rs2_src == `RS2_IMMED; assign ALU_in1 = in_1; - assign ALU_in2 = which_in2 ? in_itype_immed : in_2; - assign upper_immed = {in_upper_immed, {12{1'b0}}}; VX_divide #( .WIDTHN(32), .WIDTHD(32), .SPEED("HIGHEST"), - .PIPELINE(0) + .PIPELINE(div_pipeline_len) ) unsigned_div ( - .clk(0), + .clock(clk), .aclr(0), .clken(1), // TODO this could be disabled on inactive instructions .numer(ALU_in1), @@ -56,9 +62,9 @@ module VX_alu( .NREP("SIGNED"), .DREP("SIGNED"), .SPEED("HIGHEST"), - .PIPELINE(0) + .PIPELINE(div_pipeline_len) ) signed_div ( - .clk(0), + .clock(clk), .aclr(0), .clken(1), // TODO this could be disabled on inactive instructions .numer(ALU_in1), @@ -101,6 +107,7 @@ module VX_alu( `MULH: out_alu_result = mult_result[63:32]; `MULHSU: out_alu_result = mult_result[63:32]; `MULHU: out_alu_result = mult_result[63:32]; + // TODO profitable to roll these exceptional cases into inst_delay to avoid pipeline when possible? `DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : signed_div_result; `DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : unsigned_div_result; `REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : signed_rem_result; @@ -109,6 +116,25 @@ module VX_alu( endcase // in_alu_op end + always @(*) begin + case(in_alu_op) + `DIV, + `DIVU, + `REM, + `REMU: inst_delay = div_pipeline_len; + default: inst_delay = 0; + endcase // in_alu_op + end + + always @(posedge clk or posedge reset) begin + if (reset) + inst_delay_count <= 0; + else if (inst_delay_count > 0) + inst_delay_count <= inst_delay_count - 1; + else if (inst_delay != 0) + inst_delay_count <= inst_delay - 1; + end + `else wire which_in2; @@ -169,4 +195,4 @@ module VX_alu( end `endif -endmodule \ No newline at end of file +endmodule : VX_alu \ No newline at end of file diff --git a/rtl/VX_back_end.v b/rtl/VX_back_end.v index a58847f3..b9c7b1a7 100644 --- a/rtl/VX_back_end.v +++ b/rtl/VX_back_end.v @@ -6,6 +6,7 @@ module VX_back_end ( input wire schedule_delay, output wire out_mem_delay, + output wire out_exec_delay, output wire gpr_stage_delay, VX_jal_response_inter VX_jal_rsp, VX_branch_response_inter VX_branch_rsp, @@ -32,7 +33,7 @@ assign VX_writeback_inter.wb_warp_num = VX_writeback_temp.wb_warp_num; VX_mw_wb_inter VX_mw_wb(); -wire no_slot_mem; +wire no_slot_mem, no_slot_exec; VX_mem_req_inter VX_exe_mem_req(); @@ -69,6 +70,7 @@ VX_gpr_stage VX_gpr_stage( .VX_csr_req (VX_csr_req), // End new .memory_delay (out_mem_delay), + .exec_delay (out_exec_delay), .gpr_stage_delay (gpr_stage_delay) ); @@ -91,7 +93,9 @@ VX_execute_unit VX_execUnit( .VX_exec_unit_req(VX_exec_unit_req), .VX_inst_exec_wb (VX_inst_exec_wb), .VX_jal_rsp (VX_jal_rsp), - .VX_branch_rsp (VX_branch_rsp) + .VX_branch_rsp (VX_branch_rsp), + .out_delay (out_exec_delay), + .no_slot_exec (no_slot_exec) ); @@ -113,7 +117,8 @@ VX_writeback VX_wb( .VX_csr_wb (VX_csr_wb), .VX_writeback_inter(VX_writeback_temp), - .no_slot_mem (no_slot_mem) + .no_slot_mem (no_slot_mem), + .no_slot_exec (no_slot_exec) ); endmodule \ No newline at end of file diff --git a/rtl/VX_execute_unit.v b/rtl/VX_execute_unit.v index 8c9f4f4d..697c20cb 100644 --- a/rtl/VX_execute_unit.v +++ b/rtl/VX_execute_unit.v @@ -12,7 +12,10 @@ module VX_execute_unit ( // JAL Response VX_jal_response_inter VX_jal_rsp, // Branch Response - VX_branch_response_inter VX_branch_rsp + VX_branch_response_inter VX_branch_rsp, + + input wire no_slot_exec, + output wire out_delay ); @@ -41,10 +44,13 @@ module VX_execute_unit ( wire[`NT_M1:0][31:0] alu_result; + wire[`NT_M1:0] alu_stall; genvar index_out_reg; generate for (index_out_reg = 0; index_out_reg < `NT; index_out_reg = index_out_reg + 1) begin : alu_defs VX_alu vx_alu( + .clk(clk), + .reset(reset), // .in_reg_data (in_reg_data[1:0]), .in_1 (in_a_reg_data[index_out_reg]), .in_2 (in_b_reg_data[index_out_reg]), @@ -53,11 +59,17 @@ module VX_execute_unit ( .in_upper_immed(in_upper_immed), .in_alu_op (in_alu_op), .in_curr_PC (in_curr_PC), - .out_alu_result(alu_result[index_out_reg]) + .out_alu_result(alu_result[index_out_reg]), + .out_alu_stall(alu_stall[index_out_reg]) ); end endgenerate + wire internal_stall; + assign internal_stall = |alu_stall; + + assign out_delay = no_slot_exec || internal_stall; + wire [$clog2(`NT)-1:0] jal_branch_use_index; wire jal_branch_found_valid; @@ -103,7 +115,7 @@ module VX_execute_unit ( // Actual Writeback assign VX_inst_exec_wb.rd = VX_exec_unit_req.rd; assign VX_inst_exec_wb.wb = VX_exec_unit_req.wb; - assign VX_inst_exec_wb.wb_valid = VX_exec_unit_req.valid; + assign VX_inst_exec_wb.wb_valid = VX_exec_unit_req.valid && !internal_stall; assign VX_inst_exec_wb.wb_warp_num = VX_exec_unit_req.warp_num; assign VX_inst_exec_wb.alu_result = VX_exec_unit_req.jal ? duplicate_PC_data : alu_result; @@ -163,4 +175,4 @@ module VX_execute_unit ( // assign out_is_csr = VX_exec_unit_req.is_csr; // assign out_csr_address = VX_exec_unit_req.csr_address; -endmodule \ No newline at end of file +endmodule : VX_execute_unit \ No newline at end of file diff --git a/rtl/VX_gpr_stage.v b/rtl/VX_gpr_stage.v index 3d556a83..386735cf 100644 --- a/rtl/VX_gpr_stage.v +++ b/rtl/VX_gpr_stage.v @@ -7,6 +7,7 @@ module VX_gpr_stage ( input wire schedule_delay, input wire memory_delay, + input wire exec_delay, output wire gpr_stage_delay, // inputs @@ -93,7 +94,10 @@ module VX_gpr_stage ( wire stall_lsu = memory_delay; wire flush_lsu = schedule_delay && !stall_lsu; - assign gpr_stage_delay = stall_lsu; + wire stall_exec = exec_delay; + wire flush_exec = schedule_delay && !stall_exec; + + assign gpr_stage_delay = stall_lsu || stall_exec; `ifdef ASIC wire delayed_lsu_last_cycle; @@ -145,8 +149,8 @@ module VX_gpr_stage ( VX_generic_register #(.N(224 + `NW_M1 + 1 + (`NT))) exec_unit_reg( .clk (clk), .reset(reset), - .stall(stall_rest), - .flush(flush_rest), + .stall(stall_exec), + .flush(flush_exec), .in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}), .out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask }) ); @@ -193,8 +197,8 @@ module VX_gpr_stage ( VX_generic_register #(.N(224 + `NW_M1 + 1 + 65*(`NT))) exec_unit_reg( .clk (clk), .reset(reset), - .stall(stall_rest), - .flush(flush_rest), + .stall(stall_exec), + .flush(flush_exec), .in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.a_reg_data, VX_exec_unit_req_temp.b_reg_data, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}), .out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.a_reg_data , VX_exec_unit_req.b_reg_data , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask }) ); @@ -219,4 +223,4 @@ module VX_gpr_stage ( `endif -endmodule \ No newline at end of file +endmodule : VX_gpr_stage \ No newline at end of file diff --git a/rtl/VX_scheduler.v b/rtl/VX_scheduler.v index ce54db63..da9962a6 100644 --- a/rtl/VX_scheduler.v +++ b/rtl/VX_scheduler.v @@ -6,6 +6,7 @@ module VX_scheduler ( input wire clk, input wire reset, input wire memory_delay, + input wire exec_delay, input wire gpr_stage_delay, VX_frE_to_bckE_req_inter VX_bckE_req, VX_wb_inter VX_writeback_inter, @@ -27,7 +28,11 @@ module VX_scheduler ( wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE); wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ); + // classify our next instruction. wire is_mem = is_store || is_load; + wire is_gpu = (VX_bckE_req.is_wspawn || VX_bckE_req.is_tmc || VX_bckE_req.is_barrier || VX_bckE_req.is_split); + wire is_csr = VX_bckE_req.is_csr; + wire is_exec = !is_mem && !is_gpu && !is_csr; wire rs1_pass = ((valid_wb && (VX_writeback_inter.rd == VX_bckE_req.rs1))); @@ -44,8 +49,10 @@ module VX_scheduler ( wire rename_valid = rs1_rename_qual || rs2_rename_qual ; - - assign schedule_delay = ((rename_valid) && (|VX_bckE_req.valid)) || (memory_delay && (is_mem)) || (gpr_stage_delay && is_mem); + assign schedule_delay = ((rename_valid) && (|VX_bckE_req.valid)) + || (memory_delay && is_mem) + || (gpr_stage_delay && (is_mem || is_exec)) + || (exec_delay && is_exec); integer i; integer w; diff --git a/rtl/VX_writeback.v b/rtl/VX_writeback.v index c9616d43..ab9ebf0c 100644 --- a/rtl/VX_writeback.v +++ b/rtl/VX_writeback.v @@ -14,10 +14,10 @@ module VX_writeback ( // Actual WB to GPR VX_wb_inter VX_writeback_inter, - output wire no_slot_mem + output wire no_slot_mem, + output wire no_slot_exec ); - VX_wb_inter VX_writeback_tempp(); wire exec_wb = (VX_inst_exec_wb.wb != 0) && (|VX_inst_exec_wb.wb_valid); @@ -25,38 +25,39 @@ module VX_writeback ( wire csr_wb = (VX_csr_wb.wb != 0) && (|VX_csr_wb.valid); - assign no_slot_mem = mem_wb && (exec_wb || csr_wb); + assign no_slot_mem = mem_wb && (exec_wb || csr_wb); + assign no_slot_exec = exec_wb && (csr_wb); - assign VX_writeback_tempp.write_data = exec_wb ? VX_inst_exec_wb.alu_result : - csr_wb ? VX_csr_wb.csr_result : + assign VX_writeback_tempp.write_data = csr_wb ? VX_csr_wb.csr_result : + exec_wb ? VX_inst_exec_wb.alu_result : mem_wb ? VX_mem_wb.loaded_data : 0; - assign VX_writeback_tempp.wb_valid = exec_wb ? VX_inst_exec_wb.wb_valid : - csr_wb ? VX_csr_wb.valid : + assign VX_writeback_tempp.wb_valid = csr_wb ? VX_csr_wb.valid : + exec_wb ? VX_inst_exec_wb.wb_valid : mem_wb ? VX_mem_wb.wb_valid : 0; - assign VX_writeback_tempp.rd = exec_wb ? VX_inst_exec_wb.rd : - csr_wb ? VX_csr_wb.rd : + assign VX_writeback_tempp.rd = csr_wb ? VX_csr_wb.rd : + exec_wb ? VX_inst_exec_wb.rd : mem_wb ? VX_mem_wb.rd : 0; - assign VX_writeback_tempp.wb = exec_wb ? VX_inst_exec_wb.wb : - csr_wb ? VX_csr_wb.wb : + assign VX_writeback_tempp.wb = csr_wb ? VX_csr_wb.wb : + exec_wb ? VX_inst_exec_wb.wb : mem_wb ? VX_mem_wb.wb : 0; - assign VX_writeback_tempp.wb_warp_num = exec_wb ? VX_inst_exec_wb.wb_warp_num : - csr_wb ? VX_csr_wb.warp_num : + assign VX_writeback_tempp.wb_warp_num = csr_wb ? VX_csr_wb.warp_num : + exec_wb ? VX_inst_exec_wb.wb_warp_num : mem_wb ? VX_mem_wb.wb_warp_num : 0; - assign VX_writeback_tempp.wb_pc = exec_wb ? VX_inst_exec_wb.exec_wb_pc : - csr_wb ? 32'hdeadbeef : + assign VX_writeback_tempp.wb_pc = csr_wb ? 32'hdeadbeef : + exec_wb ? VX_inst_exec_wb.exec_wb_pc : mem_wb ? VX_mem_wb.mem_wb_pc : 32'hdeadbeef; @@ -65,17 +66,6 @@ module VX_writeback ( wire[`NT-1:0][31:0] use_wb_data; - reg prev_is_mem; - - always @(posedge clk, posedge reset) begin - if (reset) - begin - prev_is_mem = 0; - end begin - prev_is_mem = mem_wb && !no_slot_mem; - end - end - VX_generic_register #(.N(39 + `NW_M1 + 1 + `NT*33)) wb_register( .clk (clk), .reset(reset), @@ -85,14 +75,9 @@ module VX_writeback ( .out ({use_wb_data , VX_writeback_inter.wb_valid, VX_writeback_inter.rd, VX_writeback_inter.wb, VX_writeback_inter.wb_warp_num, VX_writeback_inter.wb_pc}) ); - `ifdef SYN - assign VX_writeback_inter.write_data = prev_is_mem ? VX_writeback_tempp.write_data : use_wb_data; - `else - assign VX_writeback_inter.write_data = use_wb_data; - `endif + assign VX_writeback_inter.write_data = use_wb_data; - -endmodule // VX_writeback +endmodule : VX_writeback // VX_writeback diff --git a/rtl/Vortex.v b/rtl/Vortex.v index d70b88f6..35e62f47 100644 --- a/rtl/Vortex.v +++ b/rtl/Vortex.v @@ -46,6 +46,7 @@ module Vortex wire memory_delay; +wire exec_delay; wire gpr_stage_delay; wire schedule_delay; @@ -179,6 +180,7 @@ VX_scheduler schedule( .clk (clk), .reset (reset), .memory_delay (memory_delay), + .exec_delay (exec_delay), .gpr_stage_delay (gpr_stage_delay), .VX_bckE_req (VX_bckE_req), .VX_writeback_inter(VX_writeback_inter), @@ -197,6 +199,7 @@ VX_back_end vx_back_end( .VX_dcache_req (VX_dcache_req), .VX_writeback_inter (VX_writeback_inter), .out_mem_delay (memory_delay), + .out_exec_delay (exec_delay), .gpr_stage_delay (gpr_stage_delay) );