diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 38fe3d60..60061f46 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -16,12 +16,12 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO -#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 +CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1 +#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DEBUG=1 +#DEBUG=1 #AFU=1 CFLAGS += -fPIC diff --git a/hw/opae/Makefile b/hw/opae/Makefile index eddd6849..339f9d37 100644 --- a/hw/opae/Makefile +++ b/hw/opae/Makefile @@ -4,13 +4,16 @@ FPGA_BUILD_DIR=build_fpga all: ase-1c -ase-1c: setup-ase-1c +sources.txt: + ./gen_sources.sh + +ase-1c: setup-ase-1c sources.txt make -C $(ASE_BUILD_DIR)_1c -ase-2c: setup-ase-2c +ase-2c: setup-ase-2c sources.txt make -C $(ASE_BUILD_DIR)_2c -ase-4c: setup-ase-4c +ase-4c: setup-ase-4c sources.txt make -C $(ASE_BUILD_DIR)_4c setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile @@ -28,13 +31,13 @@ $(ASE_BUILD_DIR)_2c/Makefile: $(ASE_BUILD_DIR)_4c/Makefile: afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c -fpga-1c: setup-fpga-1c +fpga-1c: setup-fpga-1c sources.txt cd $(FPGA_BUILD_DIR)_1c && qsub-synth -fpga-2c: setup-fpga-2c +fpga-2c: setup-fpga-2c sources.txt cd $(FPGA_BUILD_DIR)_2c && qsub-synth -fpga-4c: setup-fpga-4c +fpga-4c: setup-fpga-4c sources.txt cd $(FPGA_BUILD_DIR)_4c && qsub-synth setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf diff --git a/hw/opae/README b/hw/opae/README index 86e6f862..f480cc92 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -60,8 +60,8 @@ qsub-sim make ase # tests -./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -./run_ase.sh build_ase_1c ../../driver/tests/demo/demo +./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n 256 +./run_ase.sh build_ase_1c 
../../driver/tests/demo/demo -n 16 ./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd # modify "vsim_run.tcl" to dump VCD trace diff --git a/hw/opae/gen_sources.sh b/hw/opae/gen_sources.sh new file mode 100755 index 00000000..deb8cdbb --- /dev/null +++ b/hw/opae/gen_sources.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +dir_list='../rtl/libs ../rtl/cache ../rtl/interfaces ../rtl' + +inc_list="" +for dir in $dir_list; do + inc_list="$inc_list -I$dir" +done + +echo "inc_list=$inc_list" + +{ + # read design sources + for dir in $dir_list; do + echo "+incdir+$dir" + for file in $(find $dir -maxdepth 1 -type f \( -name '*.v' -o -name '*.sv' \)) + do + echo $file + done + done +} > sources.txt \ No newline at end of file diff --git a/hw/opae/sources.txt b/hw/opae/sources.txt index 0b448e88..ae14d127 100644 --- a/hw/opae/sources.txt +++ b/hw/opae/sources.txt @@ -1,46 +1,34 @@ -vortex_afu.json - -QI:vortex_afu.qsf - -#+define+SCOPE - -#+define+DBG_PRINT_CORE_ICACHE -#+define+DBG_PRINT_CORE_DCACHE -#+define+DBG_PRINT_CACHE_BANK -#+define+DBG_PRINT_CACHE_SNP -#+define+DBG_PRINT_CACHE_MSRQ -#+define+DBG_PRINT_DRAM -#+define+DBG_PRINT_PIPELINE -#+define+DBG_PRINT_OPAE -#+define+DBG_PRINT_SCOPE - -+incdir+.
-+incdir+../rtl -+incdir+../rtl/interfaces -+incdir+../rtl/pipe_regs -+incdir+../rtl/cache +incdir+../rtl/libs - -../rtl/VX_user_config.vh -../rtl/VX_config.vh -../rtl/VX_define.vh - -../rtl/cache/VX_cache_config.vh -../rtl/cache/VX_cache.v -../rtl/cache/VX_cache_core_rsp_merge.v -../rtl/cache/VX_cache_core_req_bank_sel.v -../rtl/cache/VX_cache_dram_req_arb.v -../rtl/cache/VX_cache_dram_fill_arb.v -../rtl/cache/VX_cache_miss_resrv.v +../rtl/libs/VX_countones.v +../rtl/libs/VX_divide.v +../rtl/libs/VX_fair_arbiter.v +../rtl/libs/VX_fixed_arbiter.v +../rtl/libs/VX_generic_queue.v +../rtl/libs/VX_generic_register.v +../rtl/libs/VX_generic_stack.v +../rtl/libs/VX_index_queue.v +../rtl/libs/VX_matrix_arbiter.v +../rtl/libs/VX_mult.v +../rtl/libs/VX_priority_encoder.v +../rtl/libs/VX_rr_arbiter.v +../rtl/libs/VX_onehot_encooder.v ++incdir+../rtl/cache ../rtl/cache/VX_bank.v ../rtl/cache/VX_bank_core_req_arb.v +../rtl/cache/VX_cache.v +../rtl/cache/VX_cache_core_req_bank_sel.v +../rtl/cache/VX_cache_core_rsp_merge.v +../rtl/cache/VX_cache_dram_fill_arb.v +../rtl/cache/VX_cache_dram_req_arb.v +../rtl/cache/VX_cache_miss_resrv.v +../rtl/cache/VX_prefetcher.v +../rtl/cache/VX_snp_forwarder.v ../rtl/cache/VX_snp_rsp_arb.v ../rtl/cache/VX_tag_data_access.v ../rtl/cache/VX_tag_data_structure.v -../rtl/cache/VX_snp_forwarder.v -../rtl/cache/VX_prefetcher.v - -../rtl/interfaces/VX_branch_rsp_if.v ++incdir+../rtl/interfaces +../rtl/interfaces/VX_alu_req_if.v +../rtl/interfaces/VX_branch_ctl_if.v ../rtl/interfaces/VX_cache_core_req_if.v ../rtl/interfaces/VX_cache_core_rsp_if.v ../rtl/interfaces/VX_cache_dram_req_if.v @@ -48,65 +36,46 @@ QI:vortex_afu.qsf ../rtl/interfaces/VX_cache_snp_req_if.v ../rtl/interfaces/VX_cache_snp_rsp_if.v ../rtl/interfaces/VX_csr_req_if.v +../rtl/interfaces/VX_commit_if.v ../rtl/interfaces/VX_csr_io_req_if.v -../rtl/interfaces/VX_csr_io_rsp_if.v -../rtl/interfaces/VX_exec_unit_req_if.v -../rtl/interfaces/VX_backend_req_if.v 
-../rtl/interfaces/VX_gpr_read_if.v -../rtl/interfaces/VX_gpu_inst_req_if.v -../rtl/interfaces/VX_inst_meta_if.v -../rtl/interfaces/VX_jal_rsp_if.v +../rtl/interfaces/VX_decode_if.v +../rtl/interfaces/VX_gpr_data_if.v +../rtl/interfaces/VX_gpu_req_if.v ../rtl/interfaces/VX_join_if.v ../rtl/interfaces/VX_lsu_req_if.v ../rtl/interfaces/VX_warp_ctl_if.v ../rtl/interfaces/VX_wb_if.v ../rtl/interfaces/VX_wstall_if.v - -../rtl/libs/VX_generic_register.v -../rtl/libs/VX_mult.v -../rtl/libs/VX_divide.v -../rtl/libs/VX_generic_stack.v -../rtl/libs/VX_priority_encoder.v -../rtl/libs/VX_generic_queue.v -../rtl/libs/VX_indexable_queue.v -../rtl/libs/VX_fair_arbiter.v -../rtl/libs/VX_fixed_arbiter.v -../rtl/libs/VX_rr_arbiter.v -../rtl/libs/VX_countones.v -../rtl/libs/VX_scope.v - -../rtl/Vortex.v +../rtl/interfaces/VX_csr_io_rsp_if.v +../rtl/interfaces/VX_ifetch_req_if.v +../rtl/interfaces/VX_ifetch_rsp_if.v +../rtl/interfaces/VX_mul_req_if.v +../rtl/interfaces/VX_perf_cntrs_if.v ++incdir+../rtl +../rtl/VX_alu_unit.v +../rtl/VX_commit.v ../rtl/VX_cluster.v ../rtl/VX_core.v -../rtl/VX_mem_unit.v -../rtl/VX_pipeline.v -../rtl/VX_front_end.v -../rtl/VX_back_end.v -../rtl/VX_fetch.v -../rtl/VX_scheduler.v -../rtl/VX_exec_unit.v -../rtl/VX_warp.v -../rtl/VX_icache_stage.v -../rtl/VX_gpr_wrapper.v -../rtl/VX_gpu_inst.v -../rtl/VX_writeback.v -../rtl/VX_csr_pipe.v ../rtl/VX_csr_data.v ../rtl/VX_csr_arb.v +../rtl/VX_dcache_arb.v +../rtl/VX_decode.v ../rtl/VX_csr_io_arb.v -../rtl/VX_warp_sched.v +../rtl/VX_fetch.v +../rtl/VX_csr_unit.v ../rtl/VX_gpr_ram.v ../rtl/VX_gpr_stage.v -../rtl/VX_alu_unit.v +../rtl/VX_execute.v +../rtl/VX_gpu_unit.v +../rtl/VX_icache_stage.v +../rtl/VX_issue.v ../rtl/VX_lsu_unit.v -../rtl/VX_decode.v -../rtl/VX_inst_multiplex.v -../rtl/VX_dcache_arb.v ../rtl/VX_mem_arb.v -../rtl/VX_f_d_reg.v -../rtl/VX_i_d_reg.v -../rtl/VX_d_e_reg.v - -ccip_interface_reg.sv -ccip_std_afu.sv -vortex_afu.sv \ No newline at end of file +../rtl/VX_mem_unit.v +../rtl/VX_pipeline.v 
+../rtl/VX_scheduler.v +../rtl/VX_issue_mux.v +../rtl/VX_warp_sched.v +../rtl/VX_writeback.v +../rtl/Vortex.v +../rtl/VX_mul_unit.v diff --git a/hw/opae/sources_1c.txt b/hw/opae/sources_1c.txt index 8cbd9b8f..b40f7162 100644 --- a/hw/opae/sources_1c.txt +++ b/hw/opae/sources_1c.txt @@ -1,3 +1,21 @@ +define+NUM_CORES=1 +#+define+SCOPE + +#+define+DBG_PRINT_CORE_ICACHE +#+define+DBG_PRINT_CORE_DCACHE +#+define+DBG_PRINT_CACHE_BANK +#+define+DBG_PRINT_CACHE_SNP +#+define+DBG_PRINT_CACHE_MSRQ +#+define+DBG_PRINT_DRAM +#+define+DBG_PRINT_PIPELINE +#+define+DBG_PRINT_OPAE +#+define+DBG_PRINT_SCOPE + +vortex_afu.json +QI:vortex_afu.qsf +ccip_interface_reg.sv +ccip_std_afu.sv +vortex_afu.sv + C:sources.txt \ No newline at end of file diff --git a/hw/opae/sources_2c.txt b/hw/opae/sources_2c.txt index d32f448f..ca991ef9 100644 --- a/hw/opae/sources_2c.txt +++ b/hw/opae/sources_2c.txt @@ -1,4 +1,10 @@ +define+NUM_CORES=2 +define+L2_ENABLE=0 +vortex_afu.json +QI:vortex_afu.qsf +ccip_interface_reg.sv +ccip_std_afu.sv +vortex_afu.sv + C:sources.txt \ No newline at end of file diff --git a/hw/opae/sources_4c.txt b/hw/opae/sources_4c.txt index 03959c74..6ee3aa06 100644 --- a/hw/opae/sources_4c.txt +++ b/hw/opae/sources_4c.txt @@ -1,4 +1,10 @@ +define+NUM_CORES=4 +define+L2_ENABLE=0 +vortex_afu.json +QI:vortex_afu.qsf +ccip_interface_reg.sv +ccip_std_afu.sv +vortex_afu.sv + C:sources.txt \ No newline at end of file diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index ff4c2b05..55ea3dd4 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -1,4 +1,6 @@ `include "VX_define.vh" +`include "fpnew_pkg.sv" +`include "defs_div_sqrt_mvp.sv" module VX_alu_unit #( parameter CORE_ID = 0 @@ -13,7 +15,7 @@ module VX_alu_unit #( VX_branch_ctl_if branch_ctl_if, VX_commit_if alu_commit_if ); - wire [`NUM_THREADS-1:0][31:0] alu_result; + reg [`NUM_THREADS-1:0][31:0] alu_result; wire [`NUM_THREADS-1:0][32:0] sub_result; wire [`NUM_THREADS-1:0][32:0] shift_result; @@ -99,7 +101,7 
@@ module VX_alu_unit #( ); VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + (`NUM_THREADS * 32)), + .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) ) alu_reg ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index 02ae7aa0..457d8308 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -11,6 +11,7 @@ module VX_commit #( VX_commit_if lsu_commit_if, VX_commit_if mul_commit_if, VX_commit_if csr_commit_if, + VX_commit_if fpu_commit_if, VX_commit_if gpu_commit_if, // outputs @@ -20,9 +21,10 @@ module VX_commit #( wire [`NUM_EXS-1:0] commited_mask; assign commited_mask = {((| alu_commit_if.valid) && alu_commit_if.ready), - ((| lsu_commit_if.valid) && lsu_commit_if.ready), - ((| mul_commit_if.valid) && mul_commit_if.ready), + ((| lsu_commit_if.valid) && lsu_commit_if.ready), ((| csr_commit_if.valid) && csr_commit_if.ready), + ((| mul_commit_if.valid) && mul_commit_if.ready), + ((| fpu_commit_if.valid) && fpu_commit_if.ready), ((| gpu_commit_if.valid) && gpu_commit_if.ready)}; wire [`NE_BITS:0] num_commits; @@ -65,6 +67,7 @@ module VX_commit #( .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), + .fpu_commit_if (fpu_commit_if), .writeback_if (writeback_if) ); @@ -77,11 +80,14 @@ module VX_commit #( if ((| lsu_commit_if.valid) && lsu_commit_if.ready) begin $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=LSU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, lsu_commit_if.warp_num, lsu_commit_if.curr_PC, lsu_commit_if.wb, lsu_commit_if.rd, lsu_commit_if.data); end - if ((| mul_commit_if.valid) && mul_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=MUL, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, mul_commit_if.warp_num, mul_commit_if.curr_PC, mul_commit_if.wb, mul_commit_if.rd, mul_commit_if.data); - end if ((| csr_commit_if.valid) && csr_commit_if.ready) begin $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=CSR, 
wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, csr_commit_if.warp_num, csr_commit_if.curr_PC, csr_commit_if.wb, csr_commit_if.rd, csr_commit_if.data); + end + if ((| mul_commit_if.valid) && mul_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=MUL, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, mul_commit_if.warp_num, mul_commit_if.curr_PC, mul_commit_if.wb, mul_commit_if.rd, mul_commit_if.data); + end + if ((| fpu_commit_if.valid) && fpu_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=FPU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, fpu_commit_if.warp_num, fpu_commit_if.curr_PC, fpu_commit_if.wb, fpu_commit_if.rd, fpu_commit_if.data); end if ((| gpu_commit_if.valid) && gpu_commit_if.ready) begin $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=GPU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, gpu_commit_if.warp_num, gpu_commit_if.curr_PC, gpu_commit_if.wb, gpu_commit_if.rd, gpu_commit_if.data); diff --git a/hw/rtl/VX_csr_arb.v b/hw/rtl/VX_csr_arb.v index 7ed17e52..48dc6124 100644 --- a/hw/rtl/VX_csr_arb.v +++ b/hw/rtl/VX_csr_arb.v @@ -15,40 +15,41 @@ module VX_csr_arb ( VX_commit_if csr_rsp_if, // outputs - VX_csr_io_rsp_if csr_io_rsp_if, - VX_commit_if csr_commit_if + VX_commit_if csr_commit_if, + VX_csr_io_rsp_if csr_io_rsp_if, + + input wire select_io_req, + input wire select_io_rsp ); `UNUSED_VAR (clk) `UNUSED_VAR (reset) - wire core_select = ~(| csr_io_req_if.valid); - // requests - assign csr_req_if.valid = core_select ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}}; - assign csr_req_if.warp_num = core_select ? csr_core_req_if.warp_num : 0; - assign csr_req_if.curr_PC = core_select ? csr_core_req_if.curr_PC : 0; - assign csr_req_if.csr_op = core_select ? csr_core_req_if.csr_op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); - assign csr_req_if.csr_addr = core_select ? csr_core_req_if.csr_addr : csr_io_req_if.addr; - assign csr_req_if.csr_mask = core_select ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? 
csr_io_req_if.data : 32'b0); - assign csr_req_if.rd = core_select ? csr_core_req_if.rd : 0; - assign csr_req_if.wb = core_select ? csr_core_req_if.wb : 0; - assign csr_req_if.is_io = ~core_select; + assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}}; + assign csr_req_if.warp_num = (~select_io_req) ? csr_core_req_if.warp_num : 0; + assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0; + assign csr_req_if.csr_op = (~select_io_req) ? csr_core_req_if.csr_op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); + assign csr_req_if.csr_addr = (~select_io_req) ? csr_core_req_if.csr_addr : csr_io_req_if.addr; + assign csr_req_if.csr_mask = (~select_io_req) ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); + assign csr_req_if.rd = (~select_io_req) ? csr_core_req_if.rd : 0; + assign csr_req_if.wb = (~select_io_req) ? csr_core_req_if.wb : 0; + assign csr_req_if.is_io = select_io_req; - assign csr_core_req_if.ready = csr_req_if.ready && core_select; - assign csr_io_req_if.ready = csr_req_if.ready && ~core_select; + assign csr_core_req_if.ready = csr_req_if.ready && (~select_io_req); + assign csr_io_req_if.ready = csr_req_if.ready && select_io_req; // responses - assign csr_io_rsp_if.valid = csr_rsp_if.valid[0] & csr_rsp_if.is_io; + assign csr_io_rsp_if.valid = csr_rsp_if.valid[0] & select_io_rsp; assign csr_io_rsp_if.data = csr_rsp_if.data[0]; - assign csr_commit_if.valid = csr_rsp_if.valid & {`NUM_THREADS{~csr_rsp_if.is_io}}; - assign csr_commit_if.warp_num = csr_rsp_if.warp_num; - assign csr_commit_if.curr_PC = csr_rsp_if.curr_PC; - assign csr_commit_if.data = csr_rsp_if.data; - assign csr_commit_if.rd = csr_rsp_if.rd; - assign csr_commit_if.wb = csr_rsp_if.wb; + assign csr_commit_if.valid = csr_rsp_if.valid & {`NUM_THREADS{~select_io_rsp}}; + assign csr_commit_if.warp_num = csr_rsp_if.warp_num; + assign csr_commit_if.curr_PC = csr_rsp_if.curr_PC; + assign csr_commit_if.data = 
csr_rsp_if.data; + assign csr_commit_if.rd = csr_rsp_if.rd; + assign csr_commit_if.wb = csr_rsp_if.wb; - assign csr_rsp_if.ready = csr_rsp_if.is_io ? csr_io_rsp_if.ready : csr_commit_if.ready; + assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : csr_commit_if.ready; endmodule diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 15c3d173..96b4364c 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -6,7 +6,8 @@ module VX_csr_unit #( input wire clk, input wire reset, - VX_perf_cntrs_if perf_cntrs_if, + VX_perf_cntrs_if perf_cntrs_if, + VX_fpu_to_csr_if fpu_to_csr_if, VX_csr_io_req_if csr_io_req_if, VX_csr_io_rsp_if csr_io_rsp_if, @@ -17,15 +18,23 @@ module VX_csr_unit #( VX_csr_req_if csr_pipe_req_if(); VX_commit_if csr_pipe_commit_if(); + wire select_io_req = (| csr_io_req_if.valid); + wire select_io_rsp; + VX_csr_arb csr_arb ( .clk (clk), .reset (reset), + .csr_core_req_if (csr_req_if), .csr_io_req_if (csr_io_req_if), .csr_req_if (csr_pipe_req_if), + .csr_rsp_if (csr_pipe_commit_if), .csr_io_rsp_if (csr_io_rsp_if), - .csr_commit_if (csr_commit_if) + .csr_commit_if (csr_commit_if), + + .select_io_req (select_io_req), + .select_io_rsp (select_io_rsp) ); wire [`CSR_ADDR_SIZE-1:0] csr_addr_s2; @@ -68,14 +77,14 @@ module VX_csr_unit #( wire stall = ~csr_pipe_commit_if.ready && (| csr_pipe_commit_if.valid); VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + `CSR_ADDR_SIZE + 1 + 32 + 32) + .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + `CSR_ADDR_SIZE + 1 + 32 + 32) ) csr_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({csr_pipe_req_if.valid, csr_pipe_req_if.warp_num, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data, csr_updated_data}), - .out ({csr_pipe_commit_if.valid, csr_pipe_commit_if.warp_num, csr_pipe_commit_if.curr_PC, csr_pipe_commit_if.rd, csr_pipe_commit_if.wb, csr_addr_s2, 
csr_pipe_commit_if.is_io, csr_read_data_s2, csr_updated_data_s2}) + .in ({csr_pipe_req_if.valid, csr_pipe_req_if.warp_num, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data, csr_updated_data}), + .out ({csr_pipe_commit_if.valid, csr_pipe_commit_if.warp_num, csr_pipe_commit_if.curr_PC, csr_pipe_commit_if.rd, csr_pipe_commit_if.wb, csr_addr_s2, select_io_rsp, csr_read_data_s2, csr_updated_data_s2}) ); genvar i; diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 7d90d21b..f5cc7549 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -19,10 +19,11 @@ module VX_decode #( wire [31:0] instr = ifetch_rsp_if.instr; reg [`ALU_BITS-1:0] alu_op; - reg [`BR_BITS-1:0] br_op; - reg [`MUL_BITS-1:0] mul_op; + reg [`BR_BITS-1:0] br_op; wire [`LSU_BITS-1:0] lsu_op; reg [`CSR_BITS-1:0] csr_op; + reg [`MUL_BITS-1:0] mul_op; + reg [`FPU_BITS-1:0] fpu_op; reg [`GPU_BITS-1:0] gpu_op; reg [19:0] upper_imm; @@ -37,6 +38,7 @@ module VX_decode #( wire [`NR_BITS-1:0] rd = instr[11:7]; wire [`NR_BITS-1:0] rs1 = instr[19:15]; wire [`NR_BITS-1:0] rs2 = instr[24:20]; + wire [`NR_BITS-1:0] rs3 = instr[31:27]; // opcode types wire is_rtype = (opcode == `INST_R); @@ -51,10 +53,9 @@ module VX_decode #( wire is_jals = (opcode == `INST_SYS) && (func3 == 0); wire is_csr = (opcode == `INST_SYS) && (func3 != 0); wire is_gpu = (opcode == `INST_GPU); - wire is_br = (is_btype || is_jal || is_jalr || is_jals); - wire is_mul = is_rtype && (func7 == 7'h1); - + // upper immediate + always @(*) begin case (opcode) `INST_LUI: upper_imm = {func7, rs2, rs1, func3}; @@ -63,20 +64,8 @@ module VX_decode #( endcase end - // JAL - wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0}; - wire [31:0] jal_offset = {{11{jal_imm[20]}}, jal_imm}; - wire [11:0] jalr_imm = {func7, rs2}; - wire [31:0] jalr_offset = {{20{jalr_imm[11]}}, jalr_imm}; - always @(*) begin - case (opcode) - `INST_JAL: jalx_offset = 
jal_offset; - `INST_JALR: jalx_offset = jalr_offset; - default: jalx_offset = 32'd4; - endcase - end - // I-type immediate + wire alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5); wire [11:0] alu_shift_imm = {{7{1'b0}}, rs2}; wire [11:0] alu_imm = alu_shift_i ? alu_shift_imm : u_12; @@ -88,9 +77,26 @@ module VX_decode #( `INST_B: src2_imm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0}; default: src2_imm = 32'hdeadbeef; endcase - end + end + + // JAL + + wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0}; + wire [31:0] jal_offset = {{11{jal_imm[20]}}, jal_imm}; + wire [11:0] jalr_imm = {func7, rs2}; + wire [31:0] jalr_offset = {{20{jalr_imm[11]}}, jalr_imm}; + always @(*) begin + case (opcode) + `INST_JAL: jalx_offset = jal_offset; + `INST_JALR: jalx_offset = jalr_offset; + default: jalx_offset = 32'd4; + endcase + end // BRANCH + + wire is_br = (is_btype || is_jal || is_jalr || is_jals); + always @(*) begin br_op = `BR_EQ; case (opcode) @@ -119,6 +125,7 @@ module VX_decode #( end // ALU + always @(*) begin alu_op = `ALU_OTHER; if (is_lui) begin @@ -140,7 +147,29 @@ module VX_decode #( end end - // MUL + // LSU + + wire is_lsu = (is_ltype || is_stype); + assign lsu_op = {is_stype, func3}; + + // CSR + + wire is_csr_imm = is_csr && (func3[2] == 1); + + always @(*) begin + csr_op = `CSR_OTHER; + case (func3[1:0]) + 2'h1: csr_op = `CSR_RW; + 2'h2: csr_op = `CSR_RS; + 2'h3: csr_op = `CSR_RC; + default:; + endcase + end + + // MUL + + wire is_mul = is_rtype && (func7 == 7'h1); + always @(*) begin mul_op = `MUL_MUL; case (func3) @@ -156,23 +185,50 @@ module VX_decode #( endcase end - // LSU - wire is_lsu = (is_ltype || is_stype); - assign lsu_op = {is_stype, func3}; + // FPU - // CSR - wire is_csr_imm = is_csr && (func3[2] == 1); - always @(*) begin - csr_op = `CSR_OTHER; - case (func3[1:0]) - 2'h1: csr_op = `CSR_RW; - 2'h2: csr_op = `CSR_RS; - 2'h3: csr_op = `CSR_RC; - default:; - endcase + wire is_fl = (opcode == `INST_FL) && 
((func3 == 2)); + wire is_fs = (opcode == `INST_FS) && ((func3 == 2)); + wire is_fci = (opcode == `INST_FCI); + wire is_fmadd = (opcode == `INST_FMADD); + wire is_fmsub = (opcode == `INST_FMSUB); + wire is_fnmsub = (opcode == `INST_FNMSUB); + wire is_fnmadd = (opcode == `INST_FNMADD); + wire is_fr4 = is_fmadd || is_fmsub || is_fnmsub || is_fnmadd; + wire is_fpu = (is_fl || is_fs || is_fci || is_fr4); + + always @(*) begin + fpu_op = `FPU_OTHER; + if (is_fr4) begin + case ({is_fmadd, is_fmsub, is_fnmsub, is_fnmadd}) + 4'b1000: fpu_op = `FPU_MADD; + 4'b0100: fpu_op = `FPU_MSUB; + 4'b0010: fpu_op = `FPU_NMSUB; + 4'b0001: fpu_op = `FPU_NMADD; + default:; + endcase + end + else begin + case (func7) + 7'h00: fpu_op = `FPU_ADD; + 7'h04: fpu_op = `FPU_SUB; + 7'h08: fpu_op = `FPU_MUL; + 7'h0C: fpu_op = `FPU_DIV; + 7'h2C: fpu_op = `FPU_SQRT; + 7'h14: fpu_op = (func3 == 3'h0) ? `FPU_MIN : `FPU_MAX; + 7'h60: fpu_op = (instr[20]) ? `FPU_CVTWUS : `FPU_CVTWS; // doesn't need rs2, and read rs1 from fpReg, WB to intReg + 7'h68: fpu_op = (instr[20]) ? `FPU_CVTSWU : `FPU_CVTSW; // doesn't need rs2, and read rs1 from intReg + 7'h70: fpu_op = (func3 == 3'h0) ? `FPU_MVXW : `FPU_CLASS; // both wb to intReg + 7'h78: fpu_op = `FPU_MVWX; + 7'h50: fpu_op = `FPU_CMP; // wb to intReg + 7'h10: fpu_op = (func3[1]) ? `FPU_SGNJX : ((func3[0]) ? `FPU_SGNJN : `FPU_SGNJ); + default:; + endcase + end end // GPU + always @(*) begin gpu_op = `GPU_OTHER; case (func3) @@ -195,23 +251,23 @@ module VX_decode #( assign decode_tmp_if.ex_type = is_lsu ? `EX_LSU : is_csr ? `EX_CSR : is_mul ? `EX_MUL : - is_gpu ? `EX_GPU : - is_br ? `EX_ALU : - (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : - `EX_NOP; + is_fpu ? `EX_FPU : + is_gpu ? `EX_GPU : + is_br ? `EX_ALU : + (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : + `EX_NOP; assign decode_tmp_if.instr_op = is_lsu ? `OP_BITS'(lsu_op) : is_csr ? `OP_BITS'(csr_op) : is_mul ? `OP_BITS'(mul_op) : - is_gpu ? `OP_BITS'(gpu_op) : - is_br ? 
`OP_BITS'({1'b1, br_op}) : - (is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) : - 0; + is_fpu ? `OP_BITS'(fpu_op) : + is_gpu ? `OP_BITS'(gpu_op) : + is_br ? `OP_BITS'({1'b1, br_op}) : + (is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) : + 0; assign decode_tmp_if.rd = rd; - assign decode_tmp_if.rs1 = is_lui ? `NR_BITS'(0) : rs1; - assign decode_tmp_if.rs2 = rs2; assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} : @@ -220,20 +276,22 @@ module VX_decode #( src2_imm; assign decode_tmp_if.rs1_is_PC = is_auipc; - - assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm; + assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm; assign decode_tmp_if.use_rs1 = (decode_tmp_if.rs1 != 0) && (is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu); assign decode_tmp_if.use_rs2 = (decode_tmp_if.rs2 != 0) && (is_btype || is_stype || is_rtype || (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN))); + + assign decode_tmp_if.rs1_is_fp = (is_fci && ((func7 != 7'h68) && (fpu_op != `FPU_MVWX)) || is_fr4); + assign decode_tmp_if.rs2_is_fp = is_fs || (is_fci && ((func7 != 7'h60) && (func7 != 7'h68)) || is_fr4); + assign decode_tmp_if.rs3 = rs3; + assign decode_tmp_if.use_rs3 = is_fr4; + assign decode_tmp_if.frm = func3; - assign decode_tmp_if.wb = (rd == 0) ? `WB_NO : // disable writeback to r0 - (is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU : - (is_jal || is_jalr || is_jals) ? `WB_JAL : - is_ltype ? 
`WB_MEM : - `WB_NO; + assign decode_tmp_if.wb = (is_fpu && (is_fl || is_fci || is_fr4)) /* NOTE(review): is_fci was gated by (func7 != 7'h50) || (func7 != 7'h70) || (func7 != 7'h60), which is always true; if those func7 values were meant to be excluded, use && and confirm how their results are written back */ + || (~is_fpu && (rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype)); assign join_if.is_join = in_valid && is_gpu && (gpu_op == `GPU_JOIN); assign join_if.warp_num = ifetch_rsp_if.warp_num; @@ -241,17 +299,17 @@ module VX_decode #( assign wstall_if.wstall = in_valid && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR))); assign wstall_if.warp_num = ifetch_rsp_if.warp_num; - wire stall = ~decode_if.ready && (| decode_if.valid); - + wire stall = ~decode_if.ready && (| decode_if.valid); + VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + `WB_BITS) + .N(`NUM_THREADS + `NW_BITS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + `FRM_BITS) ) decode_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb}), - .out ({decode_if.valid, decode_if.warp_num, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb}) + .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, 
decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.frm}), + .out ({decode_if.valid, decode_if.warp_num, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.frm}) ); assign ifetch_rsp_if.ready = ~stall; @@ -263,9 +321,7 @@ module VX_decode #( print_ex_type(decode_tmp_if.ex_type); $write(", op="); print_instr_op(decode_tmp_if.ex_type, decode_tmp_if.instr_op); - $write(", wb="); - print_wb(decode_tmp_if.wb); - $write(", rd=%0d, rs1=%0d, rs2=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b\n", decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2); + $write(", wb=%b, rd=%0d, rs1=%0d, rs2=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b\n", decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2); // trap unsupported instructions assert(~(~stall && (decode_tmp_if.ex_type == `EX_ALU) && `ALU_OP(decode_tmp_if.instr_op) == `ALU_OTHER)); diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index b1b8759d..8751847e 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -76,7 +76,7 @@ `define CSR_WIDTH 12 -`define DIV_LATENCY 2 +`define DIV_LATENCY 21 `define MUL_LATENCY 2 @@ -390,6 +390,8 @@ /////////////////////////////////////////////////////////////////////////////// + + task print_ex_type; input [`EX_BITS-1:0] ex; begin diff --git a/hw/rtl/VX_execute.v 
b/hw/rtl/VX_execute.v index adbd7c30..086227bb 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -19,12 +19,13 @@ module VX_execute #( // perf VX_perf_cntrs_if perf_cntrs_if, - + // inputs VX_alu_req_if alu_req_if, VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, VX_mul_req_if mul_req_if, + VX_fpu_req_if fpu_req_if, VX_gpu_req_if gpu_req_if, // outputs @@ -34,10 +35,13 @@ module VX_execute #( VX_commit_if lsu_commit_if, VX_commit_if csr_commit_if, VX_commit_if mul_commit_if, + VX_commit_if fpu_commit_if, VX_commit_if gpu_commit_if, output wire ebreak ); + VX_fpu_to_csr_if fpu_to_csr_if(); + VX_fpu_from_csr_if fpu_from_csr_if(); VX_alu_unit #( .CORE_ID(CORE_ID) @@ -67,6 +71,7 @@ module VX_execute #( .clk (clk), .reset (reset), .perf_cntrs_if (perf_cntrs_if), + .fpu_to_csr_if (fpu_to_csr_if), .csr_io_req_if (csr_io_req_if), .csr_io_rsp_if (csr_io_rsp_if), .csr_req_if (csr_req_if), @@ -82,6 +87,17 @@ module VX_execute #( .mul_commit_if (mul_commit_if) ); + VX_fpu_unit #( + .CORE_ID(CORE_ID) + ) fpu_unit ( + .clk (clk), + .reset (reset), + .fpu_req_if (fpu_req_if), + .fpu_from_csr_if(fpu_from_csr_if), + .fpu_to_csr_if (fpu_to_csr_if), + .fpu_commit_if (fpu_commit_if) + ); + VX_gpu_unit #( .CORE_ID(CORE_ID) ) gpu_unit ( diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v new file mode 100644 index 00000000..770dc53f --- /dev/null +++ b/hw/rtl/VX_fpu_unit.v @@ -0,0 +1,140 @@ +`include "VX_define.vh" + +module VX_fpu_unit #( + parameter CORE_ID = 0 +) ( + // inputs + input wire clk, + input wire reset, + + // inputs + VX_fpu_req_if fpu_req_if, + VX_fpu_from_csr_if fpu_from_csr_if, + + // outputs + VX_commit_if fpu_commit_if, + VX_fpu_to_csr_if fpu_to_csr_if +); + localparam FOP_BITS = fpnew_pkg::OP_BITS; + localparam FMTF_BITS = $clog2(fpnew_pkg::NUM_FP_FORMATS); + localparam FMTI_BITS = $clog2(fpnew_pkg::NUM_INT_FORMATS); + + localparam int FPU_DPATHW = `NUM_THREADS * 32; + + localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{ + Width: FPU_DPATHW, 
+ EnableVectors: 1, + EnableNanBox: 1, + FpFmtMask: 5'b10000, + IntFmtMask: 4'b0010 + }; + + localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{ + PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0}, // ADDMUL + '{default: `LATENCY_FDIVSQRT}, // DIVSQRT + '{default: `LATENCY_FNONCOMP}, // NONCOMP + '{default: `LATENCY_FCONV}}, // CONV + UnitTypes:'{'{default: fpnew_pkg::PARALLEL}, // ADDMUL + '{default: fpnew_pkg::MERGED}, // DIVSQRT + '{default: fpnew_pkg::PARALLEL}, // NONCOMP + '{default: fpnew_pkg::MERGED}}, // CONV + PipeConfig: fpnew_pkg::DISTRIBUTED + }; + + wire fpu_in_ready; + wire fpu_in_valid; + wire fpu_out_ready; + wire fpu_out_valid; + + wire [2:0][`NUM_THREADS-1:0][31:0] fpu_operands; + + wire [FMTF_BITS-1:0] fpu_src_fmt = fpnew_pkg::FP32; + wire [FMTF_BITS-1:0] fpu_dst_fmt = fpnew_pkg::FP32; + wire [FMTI_BITS-1:0] fpu_int_fmt = fpnew_pkg::INT32; + + assign fpu_in_valid = (| fpu_req_if.valid); + assign fpu_operands[0] = fpu_req_if.rs1_data; + assign fpu_operands[1] = fpu_req_if.rs2_data; + assign fpu_operands[2] = fpu_req_if.rs3_data; + assign fpu_req_if.ready = fpu_in_ready; + + wire [`NUM_THREADS-1:0][31:0] fpu_result; + fpnew_pkg::status_t fpu_status; + + reg [FOP_BITS-1:0] fpu_op; + reg [`FRM_BITS-1:0] fpu_rnd; + reg fpu_op_mod; + + always @(*) begin + fpu_op = fpnew_pkg::SGNJ; + fpu_op_mod = 0; + fpu_rnd = fpu_req_if.frm; + case (fpu_req_if.fpu_op) + `FPU_ADD: fpu_op = fpnew_pkg::ADD; + `FPU_SUB: begin fpu_op = fpnew_pkg::ADD; fpu_op_mod = 1; end + `FPU_MUL: fpu_op = fpnew_pkg::MUL; + `FPU_DIV: fpu_op = fpnew_pkg::DIV; + `FPU_SQRT: fpu_op = fpnew_pkg::SQRT; + `FPU_MADD: fpu_op = fpnew_pkg::FMADD; + `FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end + `FPU_NMSUB: fpu_op = fpnew_pkg::FNMSUB; + `FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end + `FPU_SGNJ: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; end + `FPU_SGNJN: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; end + `FPU_SGNJX: begin fpu_op = 
fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; end + `FPU_MIN: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end + `FPU_MAX: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end + `FPU_CVTWS: fpu_op = fpnew_pkg::F2I; + `FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end // float -> unsigned int: F2I with op_mod=1 (ADD with op_mod=1 would subtract) + `FPU_CVTSW: fpu_op = fpnew_pkg::I2F; + `FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end + `FPU_MVXW: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; end + `FPU_MVWX: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; end + `FPU_CLASS: fpu_op = fpnew_pkg::CLASSIFY; + `FPU_CMP: fpu_op = fpnew_pkg::CMP; + default:; + endcase + end + + fpnew_top #( + .Features (FPU_FEATURES), + .Implementation (FPU_IMPLEMENTATION), + .TagType (logic) + ) fpnew_core ( + .clk_i (clk), + .rst_ni (1'b1), + .operands_i (fpu_operands), + .rnd_mode_i (fpu_rnd), + .op_i (fpu_op), + .op_mod_i (fpu_op_mod), + .src_fmt_i (fpu_src_fmt), + .dst_fmt_i (fpu_dst_fmt), + .int_fmt_i (fpu_int_fmt), + .vectorial_op_i (1'b1), + .tag_i (1'b0), + .in_valid_i (fpu_in_valid), + .in_ready_o (fpu_in_ready), + .flush_i (reset), + .result_o (fpu_result), + .status_o (fpu_status), + `UNUSED_PIN (tag_o), + .out_valid_o (fpu_out_valid), + .out_ready_i (fpu_out_ready), + `UNUSED_PIN (busy_o) + ); + + assign fpu_commit_if.valid = fpu_req_if.valid & {`NUM_THREADS{fpu_out_valid}}; + assign fpu_commit_if.data = fpu_result; + assign fpu_commit_if.wb = fpu_req_if.wb; + assign fpu_commit_if.rd = fpu_req_if.rd; + assign fpu_out_ready = fpu_commit_if.ready; + + assign fpu_to_csr_if.valid = fpu_out_valid; + assign fpu_to_csr_if.warp_num = fpu_req_if.warp_num; + assign fpu_to_csr_if.fflags_NV = fpu_status.NV; + assign fpu_to_csr_if.fflags_DZ = fpu_status.DZ; + assign fpu_to_csr_if.fflags_OF = fpu_status.OF; + assign fpu_to_csr_if.fflags_UF = fpu_status.UF; + assign fpu_to_csr_if.fflags_NX = fpu_status.NX; + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v new file mode 100644 
index 00000000..bacc36a5 --- /dev/null +++ b/hw/rtl/VX_gpr_fp_ctrl.v @@ -0,0 +1,94 @@ +`include "VX_define.vh" + +// control module to support multi-cycle read for fp register + +module VX_gpr_fp_ctrl ( + input wire clk, + input wire reset, + + VX_decode_if decode_if, + + input wire [`NUM_THREADS-1:0][31:0] rs1_int_data, + input wire [`NUM_THREADS-1:0][31:0] rs2_int_data, + input wire [`NUM_THREADS-1:0][31:0] rs1_fp_data, + input wire [`NUM_THREADS-1:0][31:0] rs2_fp_data, + + // outputs + output wire [`NR_BITS-1:0] raddr1, + output wire [`NR_BITS-1:0] raddr2, + + VX_gpr_data_if gpr_data_if, + + input wire schedule_delay, + output wire gpr_delay +); + // param + localparam GPR_DELAY_WID = 1; + reg [GPR_DELAY_WID-1:0] multi_cyc_state; + + reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data; + reg [`NUM_THREADS-1:0][31:0] tmp_rs2_data; + reg [`NUM_THREADS-1:0][31:0] rs1_data; + reg [`NUM_THREADS-1:0][31:0] rs2_data; + reg [`NUM_THREADS-1:0][31:0] rs3_data; + + always @(posedge clk) begin + if (reset) begin + multi_cyc_state <= 0; + end else if (!schedule_delay) begin + multi_cyc_state <= decode_if.use_rs3 && (multi_cyc_state == 0); + end else begin + multi_cyc_state <= 0; + end + end + + // select rs1 data + always @(posedge clk) begin + if (reset) begin + tmp_rs1_data <= 0; + end else begin + if (decode_if.rs1_is_fp) begin + tmp_rs1_data <= rs1_fp_data; + end else begin + tmp_rs1_data <= decode_if.rs1_is_PC ? {`NUM_THREADS{decode_if.curr_PC}} : rs1_int_data; + end + end + end + + // select rs2 data + always @(posedge clk) begin + if(reset) begin + tmp_rs2_data <= 0; + end else begin + if (decode_if.rs2_is_fp) begin + tmp_rs2_data <= rs2_fp_data; + end else begin + tmp_rs2_data <= decode_if.rs2_is_imm ? {`NUM_THREADS{decode_if.imm}} : rs2_int_data; + end + end + end + + // outputs + + assign gpr_delay = (multi_cyc_state == 0) && decode_if.use_rs3; + + assign raddr1 = multi_cyc_state ? 
decode_if.rs3 : decode_if.rs1 ; + assign raddr2 = decode_if.rs2; + + always @(*) begin + if (decode_if.use_rs3) begin + rs1_data = tmp_rs1_data; + rs2_data = tmp_rs2_data; + rs3_data = rs1_fp_data; + end else begin + rs1_data = decode_if.rs1_is_fp ? rs1_fp_data : rs1_int_data; + rs2_data = decode_if.rs2_is_fp ? rs2_fp_data : rs2_int_data; + rs3_data = {`NUM_THREADS{32'h8000_0000}}; // default value: -0 in single fp + end + end + + assign gpr_data_if.rs1_data = rs1_data; + assign gpr_data_if.rs2_data = rs2_data; + assign gpr_data_if.rs3_data = rs3_data; + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index a56e7c67..01c9c281 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -4,42 +4,76 @@ module VX_gpr_stage #( parameter CORE_ID = 0 ) ( input wire clk, + input wire reset, // inputs VX_wb_if writeback_if, - VX_decode_if decode_if, + VX_decode_if decode_if, // outputs - VX_gpr_data_if gpr_data_if + VX_gpr_data_if gpr_data_if, + + input wire schedule_delay, + output wire gpr_delay ); - wire [`NUM_THREADS-1:0][31:0] rs1_data_all [`NUM_WARPS-1:0]; - wire [`NUM_THREADS-1:0][31:0] rs2_data_all [`NUM_WARPS-1:0]; - wire [`NUM_THREADS-1:0][31:0] rs1_PC; - wire [`NUM_THREADS-1:0][31:0] rs2_imm; + + wire [`NUM_THREADS-1:0][31:0] rs1_int_data [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs2_int_data [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs1_fp_data [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs2_fp_data [`NUM_WARPS-1:0]; wire [`NUM_THREADS-1:0] we [`NUM_WARPS-1:0]; + wire [`NR_BITS-1:0] raddr1; + wire [`NR_BITS-1:0] raddr2; + genvar i; - for (i = 0; i < `NUM_THREADS; i++) begin - assign rs1_PC[i] = decode_if.curr_PC; - assign rs2_imm[i] = decode_if.imm; - end - - assign gpr_data_if.rs1_data = decode_if.rs1_is_PC ? rs1_PC : rs1_data_all[decode_if.warp_num]; - assign gpr_data_if.rs2_data = decode_if.rs2_is_imm ? 
rs2_imm : rs2_data_all[decode_if.warp_num]; - for (i = 0; i < `NUM_WARPS; i++) begin assign we[i] = writeback_if.valid & {`NUM_THREADS{(i == writeback_if.warp_num)}}; - VX_gpr_ram gpr_ram ( + + // Int GPRs + VX_gpr_ram gpr_int_ram ( .clk (clk), - .we (we[i]), + .we (we[i] & {`NUM_THREADS{~writeback_if.is_fp}}), .waddr (writeback_if.rd), .wdata (writeback_if.data), - .rs1 (decode_if.rs1), - .rs2 (decode_if.rs2), - .rs1_data (rs1_data_all[i]), - .rs2_data (rs2_data_all[i]) + .rs1 (raddr1), + .rs2 (raddr2), + .rs1_data (rs1_int_data[i]), + .rs2_data (rs2_int_data[i]) ); + + // FP GPRs + VX_gpr_ram gpr_fp_ram ( + .clk (clk), + .we (we[i] & {`NUM_THREADS{writeback_if.is_fp}}), + .waddr (writeback_if.rd), + .wdata (writeback_if.data), + .rs1 (raddr1), + .rs2 (raddr2), + .rs1_data (rs1_fp_data[i]), + .rs2_data (rs2_fp_data[i]) + ); + + // controller for multi-cycle read + VX_gpr_fp_ctrl VX_gpr_fp_ctrl ( + .clk (clk), + .reset (reset), + + //inputs + .decode_if (decode_if), + .rs1_int_data (rs1_int_data[i]), + .rs2_int_data (rs2_int_data[i]), + .rs1_fp_data (rs1_fp_data[i]), + .rs2_fp_data (rs2_fp_data[i]), + + // outputs + .raddr1 (raddr1), + .raddr2 (raddr2), + .gpr_data_if (gpr_data_if), + .schedule_delay (schedule_delay), + .gpr_delay (gpr_delay) + ); end assign writeback_if.ready = 1'b1; diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index f3bcb726..9730fdd2 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -79,7 +79,7 @@ module VX_gpu_unit #( assign gpu_commit_if.valid = gpu_req_if.valid; assign gpu_commit_if.warp_num = gpu_req_if.warp_num; assign gpu_commit_if.curr_PC = gpu_req_if.curr_PC; - assign gpu_commit_if.wb = `WB_NO; + assign gpu_commit_if.wb = 0; assign gpu_commit_if.rd = 0; assign gpu_commit_if.data = 0; diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 737e7b60..261a0927 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -13,16 +13,19 @@ module VX_issue #( VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, 
VX_mul_req_if mul_req_if, + VX_fpu_req_if fpu_req_if, VX_gpu_req_if gpu_req_if ); VX_gpr_data_if gpr_data_if(); wire schedule_delay; + wire gpr_delay; - wire alu_busy = ~alu_req_if.ready/* && (| alu_req_if.valid)*/; - wire lsu_busy = ~lsu_req_if.ready/* && (| lsu_req_if.valid)*/; - wire csr_busy = ~csr_req_if.ready/* && (| csr_req_if.valid)*/; - wire mul_busy = ~mul_req_if.ready/* && (| mul_req_if.valid)*/; - wire gpu_busy = ~gpu_req_if.ready/* && (| gpu_req_if.valid)*/; + wire alu_busy = ~alu_req_if.ready; + wire lsu_busy = ~lsu_req_if.ready; + wire csr_busy = ~csr_req_if.ready; + wire mul_busy = ~mul_req_if.ready; + wire fpu_busy = ~fpu_req_if.ready; // FPU back-pressure must track the FPU request port, not the multiplier's + wire gpu_busy = ~gpu_req_if.ready; VX_scheduler #( .CORE_ID(CORE_ID) @@ -31,10 +34,12 @@ module VX_issue #( .reset (reset), .decode_if (decode_if), .writeback_if (writeback_if), + .gpr_busy (gpr_delay), .alu_busy (alu_busy), .lsu_busy (lsu_busy), .csr_busy (csr_busy), .mul_busy (mul_busy), + .fpu_busy (fpu_busy), .gpu_busy (gpu_busy), .schedule_delay (schedule_delay), `UNUSED_PIN (is_empty) @@ -43,16 +48,20 @@ module VX_issue #( VX_gpr_stage #( .CORE_ID(CORE_ID) ) gpr_stage ( - .clk (clk), + .clk (clk), + .reset (reset), .decode_if (decode_if), .writeback_if (writeback_if), - .gpr_data_if (gpr_data_if) + .gpr_data_if (gpr_data_if), + .schedule_delay (schedule_delay), + .gpr_delay (gpr_delay) ); VX_alu_req_if alu_req_tmp_if(); VX_lsu_req_if lsu_req_tmp_if(); VX_csr_req_if csr_req_tmp_if(); VX_mul_req_if mul_req_tmp_if(); + VX_fpu_req_if fpu_req_tmp_if(); VX_gpu_req_if gpu_req_tmp_if(); VX_issue_mux issue_mux ( @@ -62,6 +71,7 @@ module VX_issue #( .lsu_req_if (lsu_req_tmp_if), .csr_req_if (csr_req_tmp_if), .mul_req_if (mul_req_tmp_if), + .fpu_req_if (fpu_req_tmp_if), .gpu_req_if (gpu_req_tmp_if) ); @@ -69,16 +79,18 @@ module VX_issue #( wire stall_lsu = ~lsu_req_if.ready || schedule_delay; wire stall_csr = ~csr_req_if.ready || schedule_delay; wire stall_mul = ~mul_req_if.ready || schedule_delay; + wire stall_fpu = 
~fpu_req_if.ready || schedule_delay; wire stall_gpu = ~gpu_req_if.ready || schedule_delay; wire flush_alu = alu_req_if.ready && schedule_delay; wire flush_lsu = lsu_req_if.ready && schedule_delay; wire flush_csr = csr_req_if.ready && schedule_delay; wire flush_mul = mul_req_if.ready && schedule_delay; + wire flush_fpu = fpu_req_if.ready && schedule_delay; wire flush_gpu = gpu_req_if.ready && schedule_delay; VX_generic_register #( - .N(`NUM_THREADS +`NW_BITS + 32 + `ALU_BITS + `WB_BITS + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 32) + .N(`NUM_THREADS +`NW_BITS + 32 + `ALU_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 32) ) alu_reg ( .clk (clk), .reset (reset), @@ -89,7 +101,7 @@ module VX_issue #( ); VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + 1 + `BYTEEN_BITS + `WB_BITS + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32) + .N(`NUM_THREADS + `NW_BITS + 32 + 1 + `BYTEEN_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32) ) lsu_reg ( .clk (clk), .reset (reset), @@ -100,7 +112,7 @@ module VX_issue #( ); VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `CSR_BITS + `WB_BITS + `NR_BITS + `CSR_ADDR_SIZE + 32 + 1) + .N(`NUM_THREADS + `NW_BITS + 32 + `CSR_BITS + 1 + `NR_BITS + `CSR_ADDR_SIZE + 32 + 1) ) csr_reg ( .clk (clk), .reset (reset), @@ -110,8 +122,8 @@ module VX_issue #( .out ({csr_req_if.valid, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.csr_op, csr_req_if.wb, csr_req_if.rd, csr_req_if.csr_addr, csr_req_if.csr_mask, csr_req_if.is_io}) ); - VX_generic_register #( - .N(`NUM_THREADS +`NW_BITS + 32 + `MUL_BITS + `WB_BITS + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) + VX_generic_register #( + .N(`NUM_THREADS +`NW_BITS + 32 + `MUL_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) ) mul_reg ( .clk (clk), .reset (reset), @@ -121,6 +133,17 @@ module VX_issue #( .out ({mul_req_if.valid, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.mul_op, 
mul_req_if.wb, mul_req_if.rd, mul_req_if.rs1_data, mul_req_if.rs2_data}) ); + VX_generic_register #( + .N(`NUM_THREADS +`NW_BITS + 32 + `FPU_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + `FRM_BITS) + ) fpu_reg ( + .clk (clk), + .reset (reset), + .stall (stall_fpu), + .flush (flush_fpu), + .in ({fpu_req_tmp_if.valid, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data, fpu_req_tmp_if.frm}), + .out ({fpu_req_if.valid, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.fpu_op, fpu_req_if.wb, fpu_req_if.rd, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data, fpu_req_if.frm}) + ); + VX_generic_register #( .N(`NUM_THREADS + `NW_BITS + 32 + `GPU_BITS + (`NUM_THREADS * 32) + 32 + 32) ) gpu_reg ( @@ -140,6 +163,9 @@ module VX_issue #( if ((| mul_req_tmp_if.valid) && ~stall_mul) begin $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, op=%0d, wb=%d, rd=%0d, rs1=%0h, rs2=%0h", $time, CORE_ID, mul_req_tmp_if.warp_num, mul_req_tmp_if.curr_PC, mul_req_tmp_if.mul_op, mul_req_tmp_if.wb, mul_req_tmp_if.rd, mul_req_tmp_if.rs1_data, mul_req_tmp_if.rs2_data); end + if ((| fpu_req_tmp_if.valid) && ~stall_fpu) begin + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=FPU, op=%0d, wb=%d, rd=%0d, rs1=%0h, rs2=%0h", $time, CORE_ID, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data); + end if ((| lsu_req_tmp_if.valid) && ~stall_lsu) begin $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=LSU, rw=%b, wb=%0d, rd=%0d, byteen=%b, baddr=%0h, offset=%0h", $time, CORE_ID, lsu_req_tmp_if.warp_num, lsu_req_tmp_if.curr_PC, lsu_req_tmp_if.rw, lsu_req_tmp_if.rd, lsu_req_tmp_if.wb, lsu_req_tmp_if.byteen, lsu_req_tmp_if.base_addr, lsu_req_tmp_if.offset); end diff --git a/hw/rtl/VX_issue_mux.v 
b/hw/rtl/VX_issue_mux.v index e55d7986..68ec6159 100644 --- a/hw/rtl/VX_issue_mux.v +++ b/hw/rtl/VX_issue_mux.v @@ -10,6 +10,7 @@ module VX_issue_mux ( VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, VX_mul_req_if mul_req_if, + VX_fpu_req_if fpu_req_if, VX_gpu_req_if gpu_req_if ); @@ -17,6 +18,7 @@ module VX_issue_mux ( wire[`NUM_THREADS-1:0] is_lsu = {`NUM_THREADS{decode_if.ex_type == `EX_LSU}}; wire[`NUM_THREADS-1:0] is_csr = {`NUM_THREADS{decode_if.ex_type == `EX_CSR}}; wire[`NUM_THREADS-1:0] is_mul = {`NUM_THREADS{decode_if.ex_type == `EX_MUL}}; + wire[`NUM_THREADS-1:0] is_fpu = {`NUM_THREADS{decode_if.ex_type == `EX_FPU}}; wire[`NUM_THREADS-1:0] is_gpu = {`NUM_THREADS{decode_if.ex_type == `EX_GPU}}; // ALU unit @@ -64,6 +66,18 @@ module VX_issue_mux ( assign mul_req_if.rd = decode_if.rd; assign mul_req_if.wb = decode_if.wb; + // FPU unit + assign fpu_req_if.valid = decode_if.valid & is_fpu; + assign fpu_req_if.warp_num = decode_if.warp_num; + assign fpu_req_if.curr_PC = decode_if.curr_PC; + assign fpu_req_if.fpu_op = `FPU_OP(decode_if.instr_op); + assign fpu_req_if.rs1_data = gpr_data_if.rs1_data; + assign fpu_req_if.rs2_data = gpr_data_if.rs2_data; + assign fpu_req_if.rs3_data = gpr_data_if.rs3_data; + assign fpu_req_if.frm = decode_if.frm; + assign fpu_req_if.rd = decode_if.rd; + assign fpu_req_if.wb = decode_if.wb; + // GPU unit assign gpu_req_if.valid = decode_if.valid & is_gpu; assign gpu_req_if.warp_num = decode_if.warp_num; diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index fb0fe514..886e17bd 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -28,8 +28,9 @@ module VX_lsu_unit #( wire [`BYTEEN_BITS-1:0] mem_byteen; wire [`NR_BITS-1:0] use_rd; wire [`NW_BITS-1:0] use_warp_num; - wire [`WB_BITS-1:0] use_wb; + wire use_wb; wire [31:0] use_pc; + wire mrq_full; genvar i; @@ -68,7 +69,7 @@ module VX_lsu_unit #( `IGNORE_WARNINGS_END VX_generic_register #( - .N(`NUM_THREADS + (`NUM_THREADS * 32) + `BYTEEN_BITS + 1 + (`NUM_THREADS * 
(30 + 2 + 4 + 32)) + `NR_BITS + `NW_BITS + `WB_BITS + 32) + .N(`NUM_THREADS + (`NUM_THREADS * 32) + `BYTEEN_BITS + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + `NW_BITS + 1 + 32) ) mem_req_reg ( .clk (clk), .reset (reset), @@ -83,8 +84,7 @@ module VX_lsu_unit #( wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_write_addr, dbg_mrq_write_addr; wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset; wire [`BYTEEN_BITS-1:0] core_rsp_mem_read; - wire mrq_full; - + wire mrq_push = (| dcache_req_if.valid) && dcache_req_if.ready && (0 == use_req_rw); // only push read requests @@ -97,7 +97,7 @@ module VX_lsu_unit #( wire mrq_pop = mrq_pop_part && (0 == mem_rsp_mask_upd); VX_index_queue #( - .DATAW (`LOG2UP(`DCREQ_SIZE) + 32 + `WB_BITS + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS), + .DATAW (`LOG2UP(`DCREQ_SIZE) + 32 + 1 + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS), .SIZE (`DCREQ_SIZE) ) mem_req_queue ( .clk (clk), diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index f6a6976d..4670c224 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -12,7 +12,7 @@ module VX_mul_unit #( // Outputs VX_commit_if mul_commit_if ); - wire [`NUM_THREADS-1:0][31:0] alu_result; + reg [`NUM_THREADS-1:0][31:0] alu_result; wire [`NUM_THREADS-1:0][63:0] mul_result; wire [`NUM_THREADS-1:0][31:0] div_result; wire [`NUM_THREADS-1:0][31:0] rem_result; @@ -36,7 +36,7 @@ module VX_mul_unit #( .WIDTHB(33), .WIDTHP(64), .SIGNED(1), - .PIPELINE(`MUL_LATENCY) + .PIPELINE(`LATENCY_IMUL) ) multiplier ( .clk(clk), .reset(reset), @@ -52,7 +52,7 @@ module VX_mul_unit #( .WIDTHR(32), .NSIGNED(1), .DSIGNED(1), - .PIPELINE(`DIV_LATENCY) + .PIPELINE(`LATENCY_IDIV) ) sdiv ( .clk(clk), .reset(reset), @@ -77,9 +77,11 @@ module VX_mul_unit #( end end + wire stall; + reg result_avail; reg [4:0] pending_ctr; - wire [4:0] instr_delay = `IS_DIV_OP(alu_op) ? `DIV_LATENCY : `MUL_LATENCY; + wire [4:0] instr_delay = `IS_DIV_OP(alu_op) ? 
`LATENCY_IDIV : `LATENCY_IMUL; always @(posedge clk) begin if (reset) begin @@ -104,13 +106,13 @@ module VX_mul_unit #( wire pipeline_stall = ~result_avail && (| mul_req_if.valid); - wire stall = (~mul_commit_if.ready && (| mul_commit_if.valid)) - || pipeline_stall; + assign stall = (~mul_commit_if.ready && (| mul_commit_if.valid)) + || pipeline_stall; wire flush = mul_commit_if.ready && pipeline_stall; VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + (`NUM_THREADS * 32)), + .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) ) mul_reg ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index a548e0db..ea4e2e92 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -110,6 +110,7 @@ module VX_pipeline #( VX_lsu_req_if lsu_req_if(); VX_csr_req_if csr_req_if(); VX_mul_req_if mul_req_if(); + VX_fpu_req_if fpu_req_if(); VX_gpu_req_if gpu_req_if(); VX_wb_if writeback_if(); VX_wstall_if wstall_if(); @@ -118,6 +119,7 @@ module VX_pipeline #( VX_commit_if lsu_commit_if(); VX_commit_if csr_commit_if(); VX_commit_if mul_commit_if(); + VX_commit_if fpu_commit_if(); VX_commit_if gpu_commit_if(); VX_fetch #( @@ -159,6 +161,7 @@ module VX_pipeline #( .lsu_req_if (lsu_req_if), .csr_req_if (csr_req_if), .mul_req_if (mul_req_if), + .fpu_req_if (fpu_req_if), .gpu_req_if (gpu_req_if) ); @@ -181,6 +184,7 @@ module VX_pipeline #( .lsu_req_if (lsu_req_if), .csr_req_if (csr_req_if), .mul_req_if (mul_req_if), + .fpu_req_if (fpu_req_if), .gpu_req_if (gpu_req_if), .warp_ctl_if (warp_ctl_if), @@ -189,6 +193,7 @@ module VX_pipeline #( .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), + .fpu_commit_if (fpu_commit_if), .gpu_commit_if (gpu_commit_if), .ebreak (ebreak) @@ -204,6 +209,7 @@ module VX_pipeline #( .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), + .fpu_commit_if (fpu_commit_if), .gpu_commit_if 
(gpu_commit_if), .writeback_if (writeback_if), diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v index 1d9f3149..ec49f26a 100644 --- a/hw/rtl/VX_scheduler.v +++ b/hw/rtl/VX_scheduler.v @@ -8,37 +8,43 @@ module VX_scheduler #( VX_decode_if decode_if, VX_wb_if writeback_if, + input wire gpr_busy, input wire alu_busy, input wire lsu_busy, input wire csr_busy, input wire mul_busy, + input wire fpu_busy, input wire gpu_busy, output wire schedule_delay, output wire is_empty ); - localparam CTVW = `CLOG2(`NUM_WARPS * 32 + 1); + localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); - reg [31:0][`NUM_THREADS-1:0] rename_table[`NUM_WARPS-1:0]; + reg [`NUM_REGS-1:0][`NUM_THREADS-1:0] rename_table [`NUM_WARPS-1:0]; + reg [`NUM_REGS-1:0] busy_table [`NUM_WARPS-1:0]; reg [CTVW-1:0] count_valid; - wire rs1_rename = (rename_table[decode_if.warp_num][decode_if.rs1] != 0); - wire rs2_rename = (rename_table[decode_if.warp_num][decode_if.rs2] != 0); - wire rd_rename = (rename_table[decode_if.warp_num][decode_if.rd ] != 0); + wire rs1_rename = busy_table[decode_if.warp_num][decode_if.rs1]; + wire rs2_rename = busy_table[decode_if.warp_num][decode_if.rs2]; + wire rs3_rename = busy_table[decode_if.warp_num][decode_if.rs3]; + wire rd_rename = busy_table[decode_if.warp_num][decode_if.rd]; - wire rs1_rename_qual = (rs1_rename) && (decode_if.use_rs1); - wire rs2_rename_qual = (rs2_rename) && (decode_if.use_rs2); - wire rd_rename_qual = (rd_rename) && (decode_if.wb != 0); + wire rs1_rename_qual = rs1_rename && decode_if.use_rs1; + wire rs2_rename_qual = rs2_rename && decode_if.use_rs2; + wire rs3_rename_qual = rs3_rename && decode_if.use_rs3; + wire rd_rename_qual = rd_rename && decode_if.wb; - wire rename_valid = (| decode_if.valid) && (rs1_rename_qual || rs2_rename_qual || rd_rename_qual); + wire rename_valid = (rs1_rename_qual || rs2_rename_qual || rs3_rename_qual || rd_rename_qual); - wire ex_stalled = (| decode_if.valid) - && ((alu_busy && (decode_if.ex_type == `EX_ALU)) + wire 
ex_stalled = ((gpr_busy) + || (alu_busy && (decode_if.ex_type == `EX_ALU)) || (lsu_busy && (decode_if.ex_type == `EX_LSU)) || (csr_busy && (decode_if.ex_type == `EX_CSR)) || (mul_busy && (decode_if.ex_type == `EX_MUL)) + || (fpu_busy && (decode_if.ex_type == `EX_FPU)) || (gpu_busy && (decode_if.ex_type == `EX_GPU))); - wire stall = ex_stalled || rename_valid; + wire stall = (ex_stalled || rename_valid) && (| decode_if.valid); wire acquire_rd = (| decode_if.valid) && (decode_if.wb != 0) && ~stall; @@ -49,23 +55,25 @@ module VX_scheduler #( reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == valid_wb_new_mask))) ? (count_valid + 1) : (~acquire_rd && (release_rd && (0 == valid_wb_new_mask))) ? (count_valid - 1) : count_valid; - integer i, w; - - always @(posedge clk) begin + always @(posedge clk) begin if (reset) begin + integer i, w; for (w = 0; w < `NUM_WARPS; w++) begin - for (i = 0; i < 32; i++) begin + for (i = 0; i < `NUM_REGS; i++) begin rename_table[w][i] <= 0; + busy_table[w][i] <= 0; end - end - count_valid <= 0; + end + count_valid <= 0; end else begin if (acquire_rd) begin rename_table[decode_if.warp_num][decode_if.rd] <= decode_if.valid; + busy_table[decode_if.warp_num][decode_if.rd] <= 1; end if (release_rd) begin assert(rename_table[writeback_if.warp_num][writeback_if.rd] != 0); rename_table[writeback_if.warp_num][writeback_if.rd] <= valid_wb_new_mask; + busy_table[writeback_if.warp_num][writeback_if.rd] <= (| valid_wb_new_mask); end count_valid <= count_valid_next; end @@ -80,7 +88,7 @@ module VX_scheduler #( `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (stall) begin - $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, rename=%b%b%b, alu=%b, lsu=%b, csr=%b, mul=%b, gpu=%b", $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, rd_rename_qual, rs1_rename_qual, rs2_rename_qual, alu_busy, lsu_busy, csr_busy, mul_busy, gpu_busy); + $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, rename=%b%b%b, alu=%b, lsu=%b, 
csr=%b, mul=%b, fpu=%b, gpu=%b", $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, rd_rename_qual, rs1_rename_qual, rs2_rename_qual, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy); end end `endif diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index a0040a5b..571244d8 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -18,7 +18,6 @@ module VX_warp_sched #( ); wire update_use_wspawn; wire update_visible_active; - wire scheduled_warp; wire [(1+32+`NUM_THREADS-1):0] ipdom[`NUM_WARPS-1:0]; diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 66797e94..4b724470 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -9,17 +9,19 @@ module VX_writeback #( // inputs VX_commit_if alu_commit_if, VX_commit_if lsu_commit_if, - VX_commit_if mul_commit_if, + VX_commit_if mul_commit_if, + VX_commit_if fpu_commit_if, VX_commit_if csr_commit_if, // outputs VX_wb_if writeback_if ); - wire lsu_valid = (| lsu_commit_if.valid) && (lsu_commit_if.wb != `WB_NO); - wire mul_valid = (| mul_commit_if.valid) && (mul_commit_if.wb != `WB_NO); - wire alu_valid = (| alu_commit_if.valid) && (alu_commit_if.wb != `WB_NO); - wire csr_valid = (| csr_commit_if.valid) && (csr_commit_if.wb != `WB_NO); + wire alu_valid = (| alu_commit_if.valid) && alu_commit_if.wb; + wire lsu_valid = (| lsu_commit_if.valid) && lsu_commit_if.wb; + wire csr_valid = (| csr_commit_if.valid) && csr_commit_if.wb; + wire mul_valid = (| mul_commit_if.valid) && mul_commit_if.wb; + wire fpu_valid = (| fpu_commit_if.valid) && fpu_commit_if.wb; VX_wb_if writeback_tmp_if(); @@ -47,23 +49,26 @@ module VX_writeback #( csr_valid ? 
csr_commit_if.rd : 0; + assign writeback_tmp_if.is_fp = fpu_valid && fpu_commit_if.ready; + wire stall = ~writeback_if.ready && (| writeback_if.valid); VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + `NR_BITS + (`NUM_THREADS * 32)) + .N(`NUM_THREADS + `NW_BITS + `NR_BITS + (`NUM_THREADS * 32) + 1) ) wb_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.rd, writeback_tmp_if.data}), - .out ({writeback_if.valid, writeback_if.warp_num, writeback_if.rd, writeback_if.data}) + .in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.rd, writeback_tmp_if.data, writeback_tmp_if.is_fp}), + .out ({writeback_if.valid, writeback_if.warp_num, writeback_if.rd, writeback_if.data, writeback_if.is_fp}) ); assign lsu_commit_if.ready = !stall; - assign mul_commit_if.ready = !stall && !lsu_valid; - assign alu_commit_if.ready = !stall && !lsu_valid && !mul_valid; - assign csr_commit_if.ready = !stall && !lsu_valid && !mul_valid && !alu_valid; + assign fpu_commit_if.ready = !stall && !lsu_valid; + assign mul_commit_if.ready = !stall && !lsu_valid && !fpu_valid; + assign alu_commit_if.ready = !stall && !lsu_valid && !fpu_valid && !mul_valid; + assign csr_commit_if.ready = !stall && !lsu_valid && !fpu_valid && !mul_valid && !alu_valid; // special workaround to control RISC-V benchmarks termination on Verilator reg [31:0] last_data_wb /* verilator public */; diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index dfbf0e4a..a606c15f 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -139,54 +139,54 @@ module Vortex ( end else begin - wire per_cluster_dram_req_valid [`NUM_CLUSTERS-1:0]; - wire per_cluster_dram_req_rw [`NUM_CLUSTERS-1:0]; - wire [`L2DRAM_BYTEEN_WIDTH-1:0] per_cluster_dram_req_byteen [`NUM_CLUSTERS-1:0]; - wire [`L2DRAM_ADDR_WIDTH-1:0] per_cluster_dram_req_addr [`NUM_CLUSTERS-1:0]; - wire [`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_req_data [`NUM_CLUSTERS-1:0]; - wire 
[`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_req_tag [`NUM_CLUSTERS-1:0]; - wire l3_core_req_ready; + wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid; + wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] per_cluster_dram_req_byteen; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] per_cluster_dram_req_addr; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_req_data; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_req_tag; + wire l3_core_req_ready; - wire per_cluster_dram_rsp_valid [`NUM_CLUSTERS-1:0]; - wire [`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_rsp_data [`NUM_CLUSTERS-1:0]; - wire [`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_rsp_tag [`NUM_CLUSTERS-1:0]; - wire per_cluster_dram_rsp_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_valid; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_rsp_data; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_rsp_tag; + wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready; - wire per_cluster_snp_req_valid [`NUM_CLUSTERS-1:0]; - wire [`L2DRAM_ADDR_WIDTH-1:0] per_cluster_snp_req_addr [`NUM_CLUSTERS-1:0]; - wire per_cluster_snp_req_invalidate [`NUM_CLUSTERS-1:0]; - wire [`L2SNP_TAG_WIDTH-1:0] per_cluster_snp_req_tag [`NUM_CLUSTERS-1:0]; - wire per_cluster_snp_req_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_snp_req_valid; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] per_cluster_snp_req_addr; + wire [`NUM_CLUSTERS-1:0] per_cluster_snp_req_invalidate; + wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] per_cluster_snp_req_tag; + wire [`NUM_CLUSTERS-1:0] per_cluster_snp_req_ready; - wire per_cluster_snp_rsp_valid [`NUM_CLUSTERS-1:0]; - wire [`L2SNP_TAG_WIDTH-1:0] per_cluster_snp_rsp_tag [`NUM_CLUSTERS-1:0]; - wire per_cluster_snp_rsp_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_snp_rsp_valid; + wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] 
per_cluster_snp_rsp_tag; + wire [`NUM_CLUSTERS-1:0] per_cluster_snp_rsp_ready; - wire per_cluster_io_req_valid [`NUM_CLUSTERS-1:0]; - wire per_cluster_io_req_rw [`NUM_CLUSTERS-1:0]; - wire [3:0] per_cluster_io_req_byteen [`NUM_CLUSTERS-1:0]; - wire [29:0] per_cluster_io_req_addr [`NUM_CLUSTERS-1:0]; - wire [31:0] per_cluster_io_req_data [`NUM_CLUSTERS-1:0]; - wire [`L2CORE_TAG_WIDTH-1:0] per_cluster_io_req_tag [`NUM_CLUSTERS-1:0]; - wire per_cluster_io_req_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_valid; + wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_rw; + wire [`NUM_CLUSTERS-1:0][3:0] per_cluster_io_req_byteen; + wire [`NUM_CLUSTERS-1:0][29:0] per_cluster_io_req_addr; + wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_io_req_data; + wire [`NUM_CLUSTERS-1:0][`L2CORE_TAG_WIDTH-1:0] per_cluster_io_req_tag; + wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_ready; - wire per_cluster_io_rsp_valid [`NUM_CLUSTERS-1:0]; - wire [`L2CORE_TAG_WIDTH-1:0] per_cluster_io_rsp_tag [`NUM_CLUSTERS-1:0]; - wire [31:0] per_cluster_io_rsp_data [`NUM_CLUSTERS-1:0]; - wire per_cluster_io_rsp_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_io_rsp_valid; + wire [`NUM_CLUSTERS-1:0][`L2CORE_TAG_WIDTH-1:0] per_cluster_io_rsp_tag; + wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_io_rsp_data; + wire [`NUM_CLUSTERS-1:0] per_cluster_io_rsp_ready; - wire per_cluster_csr_io_req_valid [`NUM_CLUSTERS-1:0]; - wire [11:0] per_cluster_csr_io_req_addr [`NUM_CLUSTERS-1:0]; - wire per_cluster_csr_io_req_rw [`NUM_CLUSTERS-1:0]; - wire [31:0] per_cluster_csr_io_req_data [`NUM_CLUSTERS-1:0]; - wire per_cluster_csr_io_req_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_csr_io_req_valid; + wire [`NUM_CLUSTERS-1:0][11:0] per_cluster_csr_io_req_addr; + wire [`NUM_CLUSTERS-1:0] per_cluster_csr_io_req_rw; + wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_csr_io_req_data; + wire [`NUM_CLUSTERS-1:0] per_cluster_csr_io_req_ready; - wire per_cluster_csr_io_rsp_valid 
[`NUM_CLUSTERS-1:0]; - wire [31:0] per_cluster_csr_io_rsp_data [`NUM_CLUSTERS-1:0]; - wire per_cluster_csr_io_rsp_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_csr_io_rsp_valid; + wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_csr_io_rsp_data; + wire [`NUM_CLUSTERS-1:0] per_cluster_csr_io_rsp_ready; - wire per_cluster_busy [`NUM_CLUSTERS-1:0]; - wire per_cluster_ebreak [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] per_cluster_busy; + wire [`NUM_CLUSTERS-1:0] per_cluster_ebreak; wire [`CLOG2(`NUM_CLUSTERS)-1:0] csr_io_request_id = `CLOG2(`NUM_CLUSTERS)'(csr_io_req_coreid >> `CLOG2(`NUM_CLUSTERS)); wire [`NC_BITS-1:0] per_cluster_csr_io_req_coreid = `NC_BITS'(csr_io_req_coreid); @@ -336,27 +336,27 @@ module Vortex ( // L3 Cache /////////////////////////////////////////////////////////// - wire l3_core_req_valid [`L3NUM_REQUESTS-1:0]; - wire l3_core_req_rw [`L3NUM_REQUESTS-1:0]; - wire [`L2DRAM_BYTEEN_WIDTH-1:0] l3_core_req_byteen [`L3NUM_REQUESTS-1:0]; - wire [`L2DRAM_ADDR_WIDTH-1:0] l3_core_req_addr [`L3NUM_REQUESTS-1:0]; - wire [`L2DRAM_LINE_WIDTH-1:0] l3_core_req_data [`L3NUM_REQUESTS-1:0]; - wire [`L2DRAM_TAG_WIDTH-1:0] l3_core_req_tag [`L3NUM_REQUESTS-1:0]; + wire [`L3NUM_REQUESTS-1:0] l3_core_req_valid; + wire [`L3NUM_REQUESTS-1:0] l3_core_req_rw; + wire [`L3NUM_REQUESTS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] l3_core_req_byteen; + wire [`L3NUM_REQUESTS-1:0][`L2DRAM_ADDR_WIDTH-1:0] l3_core_req_addr; + wire [`L3NUM_REQUESTS-1:0][`L2DRAM_LINE_WIDTH-1:0] l3_core_req_data; + wire [`L3NUM_REQUESTS-1:0][`L2DRAM_TAG_WIDTH-1:0] l3_core_req_tag; - wire l3_core_rsp_valid [`L3NUM_REQUESTS-1:0]; - wire [`L2DRAM_LINE_WIDTH-1:0] l3_core_rsp_data [`L3NUM_REQUESTS-1:0]; - wire [`L2DRAM_TAG_WIDTH-1:0] l3_core_rsp_tag [`L3NUM_REQUESTS-1:0]; - wire l3_core_rsp_ready; + wire [`L3NUM_REQUESTS-1:0] l3_core_rsp_valid; + wire [`L3NUM_REQUESTS-1:0][`L2DRAM_LINE_WIDTH-1:0] l3_core_rsp_data; + wire [`L3NUM_REQUESTS-1:0][`L2DRAM_TAG_WIDTH-1:0] l3_core_rsp_tag; + wire 
l3_core_rsp_ready; - wire l3_snp_fwdout_valid [`NUM_CLUSTERS-1:0]; - wire [`L2DRAM_ADDR_WIDTH-1:0] l3_snp_fwdout_addr [`NUM_CLUSTERS-1:0]; - wire l3_snp_fwdout_invalidate [`NUM_CLUSTERS-1:0]; - wire [`L2SNP_TAG_WIDTH-1:0] l3_snp_fwdout_tag [`NUM_CLUSTERS-1:0]; - wire l3_snp_fwdout_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] l3_snp_fwdout_valid; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] l3_snp_fwdout_addr; + wire [`NUM_CLUSTERS-1:0] l3_snp_fwdout_invalidate; + wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] l3_snp_fwdout_tag; + wire [`NUM_CLUSTERS-1:0] l3_snp_fwdout_ready; - wire l3_snp_fwdin_valid [`NUM_CLUSTERS-1:0]; - wire [`L2SNP_TAG_WIDTH-1:0] l3_snp_fwdin_tag [`NUM_CLUSTERS-1:0]; - wire l3_snp_fwdin_ready [`NUM_CLUSTERS-1:0]; + wire [`NUM_CLUSTERS-1:0] l3_snp_fwdin_valid; + wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] l3_snp_fwdin_tag; + wire [`NUM_CLUSTERS-1:0] l3_snp_fwdin_ready; for (i = 0; i < `L3NUM_REQUESTS; i++) begin // Core Request diff --git a/hw/rtl/fp_cores/fpu_div_sqrt_mvp b/hw/rtl/fp_cores/fpu_div_sqrt_mvp new file mode 160000 index 00000000..d9a27f3c --- /dev/null +++ b/hw/rtl/fp_cores/fpu_div_sqrt_mvp @@ -0,0 +1 @@ +Subproject commit d9a27f3c4ea72fcc246219c26896360dd61f2806 diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v index 999e0420..98dca661 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.v +++ b/hw/rtl/interfaces/VX_alu_req_if.v @@ -11,7 +11,7 @@ interface VX_alu_req_if (); wire [`ALU_BITS-1:0] alu_op; - wire [`WB_BITS-1:0] wb; + wire wb; wire [`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] rs1_data; diff --git a/hw/rtl/interfaces/VX_commit_if.v b/hw/rtl/interfaces/VX_commit_if.v index 6e969c77..457add5e 100644 --- a/hw/rtl/interfaces/VX_commit_if.v +++ b/hw/rtl/interfaces/VX_commit_if.v @@ -10,8 +10,7 @@ interface VX_commit_if (); wire [31:0] curr_PC; wire [`NUM_THREADS-1:0][31:0] data; wire [`NR_BITS-1:0] rd; - wire [`WB_BITS-1:0] wb; - wire is_io; + wire wb; wire ready; endinterface 
diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index 2956416a..e585ad5c 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -15,7 +15,7 @@ interface VX_csr_req_if (); wire [31:0] csr_mask; wire [`NR_BITS-1:0] rd; - wire [`WB_BITS-1:0] wb; + wire wb; wire is_io; wire ready; diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index e4b99dc6..12a45198 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -19,12 +19,19 @@ interface VX_decode_if (); wire [31:0] imm; wire rs1_is_PC; - wire rs2_is_imm; + wire rs2_is_imm; wire use_rs1; wire use_rs2; - wire [`WB_BITS-1:0] wb; + // FP states + wire [`NR_BITS-1:0] rs3; + wire use_rs3; + wire rs1_is_fp; + wire rs2_is_fp; + wire [`FRM_BITS-1:0] frm; + + wire wb; wire ready; diff --git a/hw/rtl/interfaces/VX_fpu_from_csr_if.v b/hw/rtl/interfaces/VX_fpu_from_csr_if.v new file mode 100644 index 00000000..9cf03d37 --- /dev/null +++ b/hw/rtl/interfaces/VX_fpu_from_csr_if.v @@ -0,0 +1,16 @@ +`ifndef VX_FPU_FROM_CSR_IF +`define VX_FPU_FROM_CSR_IF + +`include "VX_define.vh" + +interface VX_fpu_from_csr_if (); + +`IGNORE_WARNINGS_BEGIN + + wire [`NUM_WARPS-1:0][`FRM_BITS-1:0] frm; + +`IGNORE_WARNINGS_END + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v new file mode 100644 index 00000000..c35f83d3 --- /dev/null +++ b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -0,0 +1,26 @@ +`ifndef VX_FPU_REQ_IF +`define VX_FPU_REQ_IF + +`include "VX_define.vh" + +interface VX_fpu_req_if (); + + wire [`NUM_THREADS-1:0] valid; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; + + wire [`FPU_BITS-1:0] fpu_op; + wire [`FRM_BITS-1:0] frm; + + wire wb; + wire [`NR_BITS-1:0] rd; + + wire [`NUM_THREADS-1:0][31:0] rs1_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs3_data; + + wire ready; + +endinterface + 
+`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_fpu_to_csr_if.v b/hw/rtl/interfaces/VX_fpu_to_csr_if.v new file mode 100644 index 00000000..6d57da1d --- /dev/null +++ b/hw/rtl/interfaces/VX_fpu_to_csr_if.v @@ -0,0 +1,23 @@ +`ifndef VX_FPU_TO_CSR_IF +`define VX_FPU_TO_CSR_IF + +`include "VX_define.vh" + +interface VX_fpu_to_csr_if (); + +`IGNORE_WARNINGS_BEGIN + wire valid; + + wire [`NW_BITS-1:0] warp_num; + + wire fflags_NV; + wire fflags_DZ; + wire fflags_OF; + wire fflags_UF; + wire fflags_NX; + +`IGNORE_WARNINGS_END + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_gpr_data_if.v b/hw/rtl/interfaces/VX_gpr_data_if.v index fb58e6f7..e3a1a311 100644 --- a/hw/rtl/interfaces/VX_gpr_data_if.v +++ b/hw/rtl/interfaces/VX_gpr_data_if.v @@ -7,6 +7,7 @@ interface VX_gpr_data_if (); wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs3_data; endinterface diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index ce673140..a4e0aeed 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -12,7 +12,7 @@ interface VX_lsu_req_if (); wire rw; wire [`BYTEEN_BITS-1:0] byteen; - wire [`WB_BITS-1:0] wb; + wire wb; wire [`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] store_data; diff --git a/hw/rtl/interfaces/VX_mul_req_if.v b/hw/rtl/interfaces/VX_mul_req_if.v index 01c0a621..708dba86 100644 --- a/hw/rtl/interfaces/VX_mul_req_if.v +++ b/hw/rtl/interfaces/VX_mul_req_if.v @@ -11,7 +11,7 @@ interface VX_mul_req_if (); wire [`MUL_BITS-1:0] mul_op; - wire [`WB_BITS-1:0] wb; + wire wb; wire [`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] rs1_data; diff --git a/hw/rtl/interfaces/VX_wb_if.v b/hw/rtl/interfaces/VX_wb_if.v index e363c564..9d7aaa7a 100644 --- a/hw/rtl/interfaces/VX_wb_if.v +++ b/hw/rtl/interfaces/VX_wb_if.v @@ -9,6 +9,7 @@ interface VX_wb_if (); wire [`NW_BITS-1:0] warp_num; wire 
[`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] data; + wire is_fp; wire ready; endinterface diff --git a/hw/rtl/tex_unit/VX_tex_mgr.v b/hw/rtl/tex_unit/VX_tex_mgr.v index 42184037..0452e00f 100644 --- a/hw/rtl/tex_unit/VX_tex_mgr.v +++ b/hw/rtl/tex_unit/VX_tex_mgr.v @@ -2,7 +2,7 @@ module VX_tex_mgr ( input wire clk, - input wire reset, + input wire reset ); //-- diff --git a/hw/rtl/tex_unit/VX_tex_unit.v b/hw/rtl/tex_unit/VX_tex_unit.v index b7eef8d8..f400ad63 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.v +++ b/hw/rtl/tex_unit/VX_tex_unit.v @@ -11,7 +11,7 @@ module VX_tex_unit #( parameter MAXAMW = 2, parameter TAGW = 16, - parameter NUMCRQS = 32, + parameter NUMCRQS = 32 ) ( input wire clk, input wire reset, diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 213712a0..3f9f240e 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -11,7 +11,7 @@ double sc_time_stamp() { Simulator::Simulator() { // force random values for unitialized signals - Verilated::randReset(2); + Verilated::randReset(1); // Turn off assertion before reset Verilated::assertOn(false); @@ -105,9 +105,8 @@ void Simulator::eval_dram_bus() { if (!dram_rsp_active_) { if (dequeue_index != -1) { vortex_->dram_rsp_valid = 1; - memcpy((uint8_t*)vortex_->dram_rsp_data, dram_rsp_vec_[dequeue_index].data, GLOBAL_BLOCK_SIZE); - vortex_->dram_rsp_tag = dram_rsp_vec_[dequeue_index].tag; - free(dram_rsp_vec_[dequeue_index].data); + memcpy((uint8_t*)vortex_->dram_rsp_data, dram_rsp_vec_[dequeue_index].block.data(), GLOBAL_BLOCK_SIZE); + vortex_->dram_rsp_tag = dram_rsp_vec_[dequeue_index].tag; dram_rsp_vec_.erase(dram_rsp_vec_.begin() + dequeue_index); dram_rsp_active_ = true; } else { @@ -141,9 +140,8 @@ void Simulator::eval_dram_bus() { } else { dram_req_t dram_req; dram_req.cycles_left = DRAM_LATENCY; - dram_req.data = (uint8_t*)malloc(GLOBAL_BLOCK_SIZE); dram_req.tag = vortex_->dram_req_tag; - ram_->read(vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE, GLOBAL_BLOCK_SIZE, 
dram_req.data); + ram_->read(vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE, GLOBAL_BLOCK_SIZE, dram_req.block.data()); dram_rsp_vec_.push_back(dram_req); } } diff --git a/hw/simulate/simulator.h b/hw/simulate/simulator.h index 297e2121..7fb6370a 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -21,7 +21,7 @@ typedef struct { int cycles_left; - uint8_t *data; + std::array block; unsigned tag; } dram_req_t; diff --git a/hw/simulate/verilator.vlt b/hw/simulate/verilator.vlt new file mode 100644 index 00000000..7e8fdd49 --- /dev/null +++ b/hw/simulate/verilator.vlt @@ -0,0 +1,11 @@ +`verilator_config + +lint_off -rule BLKANDNBLK -file "../rtl/fp_cores/fpnew/*" +lint_off -rule UNOPTFLAT -file "../rtl/fp_cores/fpnew/*" +lint_off -rule WIDTH -file "../rtl/fp_cores/fpnew/*" +lint_off -rule UNUSED -file "../rtl/fp_cores/fpnew/*" +lint_off -rule LITENDIAN -file "../rtl/fp_cores/fpnew/*" +lint_off -rule IMPORTSTAR -file "../rtl/fp_cores/fpnew/*" +lint_off -rule PINCONNECTEMPTY -file "../rtl/fp_cores/fpnew/*" + +//lint_off -rule CASEINCOMPLETE -file "../rtl/fp_cores/fpnew/*" diff --git a/hw/syn/yosys/synth.sh b/hw/syn/yosys/synth.sh index 621866f1..544bbad0 100755 --- a/hw/syn/yosys/synth.sh +++ b/hw/syn/yosys/synth.sh @@ -12,7 +12,7 @@ echo "inc_list=$inc_list" { # read design sources for dir in $dir_list; do - for file in $(find $dir -name '*.v' -o -name '*.sv' -type f) + for file in $(find $dir -maxdepth 1 -name '*.v' -o -name '*.sv' -type f) do echo "read_verilog -sv $inc_list $file" done