diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 60061f46..38fe3d60 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -16,12 +16,12 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO -CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 +#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1 +CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -#DEBUG=1 +DEBUG=1 #AFU=1 CFLAGS += -fPIC diff --git a/driver/rtlsim/simulator.cpp b/driver/rtlsim/simulator.cpp index a0b3ad5d..6ff72603 100644 --- a/driver/rtlsim/simulator.cpp +++ b/driver/rtlsim/simulator.cpp @@ -19,6 +19,7 @@ Simulator::Simulator() { #ifdef VCD_OUTPUT Verilated::traceEverOn(true); trace_ = new VerilatedVcdC; + trace_->set_time_unit("1ns"); vortex_->trace(trace_, 99); trace_->open("trace.vcd"); #endif diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 5f13fd77..278ee190 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -1017,8 +1017,8 @@ localparam SCOPE_SR_DEPTH = 2; wire scope_changed = (scope_icache_req_valid && scope_icache_req_ready) || (scope_icache_rsp_valid && scope_icache_rsp_ready) - || ((| scope_dcache_req_valid) && scope_dcache_req_ready) - || ((| scope_dcache_rsp_valid) && scope_dcache_rsp_ready) + || (scope_dcache_req_valid && scope_dcache_req_ready) + || (scope_dcache_rsp_valid && scope_dcache_rsp_ready) || (scope_dram_req_valid && scope_dram_req_ready) || (scope_dram_rsp_valid && scope_dram_rsp_ready) || (scope_snp_req_valid && scope_snp_req_ready) diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 15d029fe..13e7b2a1 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -53,7 +53,7 @@ module VX_alu_unit #( VX_priority_encoder #( .N(`NUM_THREADS) ) choose_alu_result ( - .data_in (alu_req_if.valid), + .data_in (alu_req_if.thread_mask), .data_out (br_result_index), `UNUSED_PIN (valid_out) ); @@ -81,11 +81,11 @@ module VX_alu_unit #( wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset); wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR); - wire is_br_valid = `IS_BR_OP(alu_op) && (| alu_req_if.valid); + wire is_br_valid = `IS_BR_OP(alu_op) && alu_req_if.valid; wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result; - wire stall = ~alu_commit_if.ready && (| alu_commit_if.valid); + wire stall = ~alu_commit_if.ready && alu_commit_if.valid; VX_generic_register #( .N(1 + `NW_BITS + 1 + 32) @@ -99,14 +99,14 @@ module VX_alu_unit #( ); VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) + .N(1 + `ISTAG_BITS + (`NUM_THREADS * 32)) ) alu_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result}), - .out ({alu_commit_if.valid, alu_commit_if.warp_num, alu_commit_if.curr_PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data}) + .in ({alu_req_if.valid, alu_req_if.issue_tag, alu_jal_result}), + .out ({alu_commit_if.valid, alu_commit_if.issue_tag, alu_commit_if.data}) ); assign alu_req_if.ready = ~stall; diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index c4e4d294..f8f842db 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -11,21 +11,22 @@ module VX_commit #( VX_commit_if lsu_commit_if, VX_commit_if mul_commit_if, VX_commit_if csr_commit_if, - VX_commit_fp_if fpu_commit_if, + VX_commit_if fpu_commit_if, VX_commit_if gpu_commit_if, // outputs + VX_commit_is_if commit_is_if, VX_wb_if writeback_if, VX_perf_cntrs_if perf_cntrs_if ); wire [`NUM_EXS-1:0] commited_mask; - assign commited_mask = {((| alu_commit_if.valid) && alu_commit_if.ready), - ((| lsu_commit_if.valid) && lsu_commit_if.ready), - ((| csr_commit_if.valid) && csr_commit_if.ready), - ((| mul_commit_if.valid) && mul_commit_if.ready), - ((| fpu_commit_if.valid) && fpu_commit_if.ready), - ((| gpu_commit_if.valid) && gpu_commit_if.ready)}; + assign commited_mask = {(alu_commit_if.valid && alu_commit_if.ready), + (lsu_commit_if.valid && lsu_commit_if.ready), + (csr_commit_if.valid && csr_commit_if.ready), + (mul_commit_if.valid && mul_commit_if.ready), + (fpu_commit_if.valid && fpu_commit_if.ready), + (gpu_commit_if.valid && gpu_commit_if.ready)}; wire [`NE_BITS:0] num_commits; @@ -55,6 +56,20 @@ module VX_commit #( assign perf_cntrs_if.total_cycles = total_cycles; assign perf_cntrs_if.total_instrs = total_instrs; + assign commit_is_if.alu_valid = alu_commit_if.valid && alu_commit_if.ready; + assign commit_is_if.lsu_valid = lsu_commit_if.valid && lsu_commit_if.ready; + assign commit_is_if.csr_valid = csr_commit_if.valid && csr_commit_if.ready; + assign commit_is_if.mul_valid = mul_commit_if.valid && mul_commit_if.ready; + assign commit_is_if.fpu_valid = fpu_commit_if.valid && fpu_commit_if.ready; + assign commit_is_if.gpu_valid = gpu_commit_if.valid && gpu_commit_if.ready; + + assign commit_is_if.alu_tag = alu_commit_if.issue_tag; + assign commit_is_if.lsu_tag = lsu_commit_if.issue_tag; + assign commit_is_if.csr_tag = csr_commit_if.issue_tag; + assign commit_is_if.mul_tag = mul_commit_if.issue_tag; + assign commit_is_if.fpu_tag = fpu_commit_if.issue_tag; + assign commit_is_if.gpu_tag = gpu_commit_if.issue_tag; + assign gpu_commit_if.ready = 1'b1; // doesn't writeback VX_writeback #( @@ -67,30 +82,31 @@ module VX_commit #( .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), - .fpu_commit_if (fpu_commit_if), + .fpu_commit_if (fpu_commit_if), + .commit_is_if (commit_is_if), .writeback_if (writeback_if) - ); + ); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if ((| alu_commit_if.valid) && alu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=ALU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, alu_commit_if.warp_num, alu_commit_if.curr_PC, alu_commit_if.wb, alu_commit_if.rd, alu_commit_if.data); + if (alu_commit_if.valid && alu_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, commit_is_if.alu_data.warp_num, commit_is_if.alu_data.curr_PC, alu_commit_if.issue_tag, commit_is_if.alu_data.thread_mask, commit_is_if.alu_data.wb, commit_is_if.alu_data.rd, alu_commit_if.data); end - if ((| lsu_commit_if.valid) && lsu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=LSU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, lsu_commit_if.warp_num, lsu_commit_if.curr_PC, lsu_commit_if.wb, lsu_commit_if.rd, lsu_commit_if.data); + if (lsu_commit_if.valid && lsu_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, commit_is_if.lsu_data.warp_num, commit_is_if.lsu_data.curr_PC, lsu_commit_if.issue_tag, commit_is_if.lsu_data.thread_mask, commit_is_if.lsu_data.wb, commit_is_if.lsu_data.rd, lsu_commit_if.data); end - if ((| csr_commit_if.valid) && csr_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=CSR, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, csr_commit_if.warp_num, csr_commit_if.curr_PC, csr_commit_if.wb, csr_commit_if.rd, csr_commit_if.data); + if (csr_commit_if.valid && csr_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, commit_is_if.csr_data.warp_num, commit_is_if.csr_data.curr_PC, csr_commit_if.issue_tag, commit_is_if.csr_data.thread_mask, commit_is_if.csr_data.wb, commit_is_if.csr_data.rd, csr_commit_if.data); end - if ((| mul_commit_if.valid) && mul_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=MUL, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, mul_commit_if.warp_num, mul_commit_if.curr_PC, mul_commit_if.wb, mul_commit_if.rd, mul_commit_if.data); + if (mul_commit_if.valid && mul_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, commit_is_if.mul_data.warp_num, commit_is_if.mul_data.curr_PC, mul_commit_if.issue_tag, commit_is_if.mul_data.thread_mask, commit_is_if.mul_data.wb, commit_is_if.mul_data.rd, mul_commit_if.data); end - if ((| fpu_commit_if.valid) && fpu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=FPU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, fpu_commit_if.warp_num, fpu_commit_if.curr_PC, fpu_commit_if.wb, fpu_commit_if.rd, fpu_commit_if.data); + if (fpu_commit_if.valid && fpu_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, commit_is_if.fpu_data.warp_num, commit_is_if.fpu_data.curr_PC, fpu_commit_if.issue_tag, commit_is_if.fpu_data.thread_mask, commit_is_if.fpu_data.wb, commit_is_if.fpu_data.rd, fpu_commit_if.data); end - if ((| gpu_commit_if.valid) && gpu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=GPU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, gpu_commit_if.warp_num, gpu_commit_if.curr_PC, gpu_commit_if.wb, gpu_commit_if.rd, gpu_commit_if.data); + if (gpu_commit_if.valid && gpu_commit_if.ready) begin + $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, commit_is_if.gpu_data.warp_num, commit_is_if.gpu_data.curr_PC, gpu_commit_if.issue_tag, commit_is_if.gpu_data.thread_mask, commit_is_if.gpu_data.wb, commit_is_if.gpu_data.rd, gpu_commit_if.data); end end `endif diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 95d23af3..bba64e50 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -39,10 +39,6 @@ `define SHARED_MEM_BASE_ADDR 32'h6FFFF000 `endif -`ifndef STACK_BASE_ADDR -`define STACK_BASE_ADDR 20'h6FFFF -`endif - `ifndef IO_BUS_BASE_ADDR `define IO_BUS_BASE_ADDR 32'hFFFFFF00 `endif @@ -59,13 +55,9 @@ `define L3_ENABLE (`NUM_CLUSTERS > 1) `endif -`ifndef EXT_M_ENABLE -`define EXT_M_ENABLE 1 -`endif +`define EXT_M_ENABLE -`ifndef EXT_F_ENABLE -`define EXT_F_ENABLE 1 -`endif +// define EXT_F_ENABLE // Configuration Values ======================================================= @@ -109,6 +101,11 @@ `define FPURQ_SIZE 8 `endif +// Size of issue queue +`ifndef ISSUEQ_SIZE +`define ISSUEQ_SIZE (8 + `NUM_WARPS) +`endif + // Dcache Configurable Knobs ================================================== // Size of cache in bytes @@ -148,12 +145,12 @@ // Dram Fill Rsp Queue Size `ifndef DDFPQ_SIZE -`define DDFPQ_SIZE 16 +`define DDFPQ_SIZE 8 `endif // Snoop Req Queue Size `ifndef DSNRQ_SIZE -`define DSNRQ_SIZE 16 +`define DSNRQ_SIZE 8 `endif // Core Writeback Queue Size @@ -173,7 +170,7 @@ // Prefetcher `ifndef DPRFQ_SIZE -`define DPRFQ_SIZE 16 +`define DPRFQ_SIZE 8 `endif `ifndef DPRFQ_STRIDE @@ -219,7 +216,7 @@ // Dram Fill Rsp Queue Size `ifndef IDFPQ_SIZE -`define IDFPQ_SIZE 16 +`define IDFPQ_SIZE 8 `endif // Core Writeback Queue Size @@ -229,7 +226,7 @@ // Dram Writeback Queue Size `ifndef IDWBQ_SIZE -`define IDWBQ_SIZE 16 +`define IDWBQ_SIZE 8 `endif // Dram Fill Req Queue Size @@ -239,7 +236,7 @@ // Prefetcher `ifndef IPRFQ_SIZE -`define IPRFQ_SIZE 16 +`define IPRFQ_SIZE 8 `endif `ifndef IPRFQ_STRIDE @@ -312,7 +309,7 @@ // Core Request Queue Size `ifndef L2CREQ_SIZE -`define L2CREQ_SIZE 16 +`define L2CREQ_SIZE 8 `endif // Miss Reserv Queue Knob @@ -322,12 +319,12 @@ // Dram Fill Rsp Queue Size `ifndef L2DFPQ_SIZE -`define L2DFPQ_SIZE 16 +`define L2DFPQ_SIZE 8 `endif // Snoop Req Queue Size `ifndef L2SNRQ_SIZE -`define L2SNRQ_SIZE 16 +`define L2SNRQ_SIZE 8 `endif // Core Writeback Queue Size @@ -337,7 +334,7 @@ // Dram Writeback Queue Size `ifndef L2DWBQ_SIZE -`define L2DWBQ_SIZE 16 +`define L2DWBQ_SIZE 8 `endif // Dram Fill Req Queue Size @@ -347,7 +344,7 @@ // Prefetcher `ifndef L2PRFQ_SIZE -`define L2PRFQ_SIZE 16 +`define L2PRFQ_SIZE 8 `endif `ifndef L2PRFQ_STRIDE @@ -383,7 +380,7 @@ // Core Request Queue Size `ifndef L3CREQ_SIZE -`define L3CREQ_SIZE 16 +`define L3CREQ_SIZE 8 `endif // Miss Reserv Queue Knob @@ -393,12 +390,12 @@ // Dram Fill Rsp Queue Size `ifndef L3DFPQ_SIZE -`define L3DFPQ_SIZE 16 +`define L3DFPQ_SIZE 8 `endif // Snoop Req Queue Size `ifndef L3SNRQ_SIZE -`define L3SNRQ_SIZE 16 +`define L3SNRQ_SIZE 8 `endif // Core Writeback Queue Size @@ -408,7 +405,7 @@ // Dram Writeback Queue Size `ifndef L3DWBQ_SIZE -`define L3DWBQ_SIZE 16 +`define L3DWBQ_SIZE 8 `endif // Dram Fill Req Queue Size @@ -418,7 +415,7 @@ // Prefetcher `ifndef L3PRFQ_SIZE -`define L3PRFQ_SIZE 16 +`define L3PRFQ_SIZE 8 `endif `ifndef L3PRFQ_STRIDE diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index 9695526b..8e4fff92 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -166,15 +166,15 @@ module VX_core #( VX_cache_core_req_if #( .NUM_REQUESTS(`INUM_REQUESTS), .WORD_SIZE(`IWORD_SIZE), - .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH), - .CORE_TAG_ID_BITS(`DCORE_TAG_ID_BITS) + .CORE_TAG_WIDTH(`ICORE_TAG_WIDTH), + .CORE_TAG_ID_BITS(`ICORE_TAG_ID_BITS) ) core_icache_req_if(); VX_cache_core_rsp_if #( .NUM_REQUESTS(`INUM_REQUESTS), .WORD_SIZE(`IWORD_SIZE), - .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH), - .CORE_TAG_ID_BITS(`DCORE_TAG_ID_BITS) + .CORE_TAG_WIDTH(`ICORE_TAG_WIDTH), + .CORE_TAG_ID_BITS(`ICORE_TAG_ID_BITS) ) core_icache_rsp_if(); VX_pipeline #( diff --git a/hw/rtl/VX_csr_arb.v b/hw/rtl/VX_csr_arb.v index 48dc6124..3feb03cc 100644 --- a/hw/rtl/VX_csr_arb.v +++ b/hw/rtl/VX_csr_arb.v @@ -26,7 +26,8 @@ module VX_csr_arb ( `UNUSED_VAR (reset) // requests - assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}}; + assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : csr_io_req_if.valid; + assign csr_req_if.issue_tag = (~select_io_req) ? csr_core_req_if.issue_tag : 0; assign csr_req_if.warp_num = (~select_io_req) ? csr_core_req_if.warp_num : 0; assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0; assign csr_req_if.csr_op = (~select_io_req) ? csr_core_req_if.csr_op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); @@ -40,15 +41,12 @@ module VX_csr_arb ( assign csr_io_req_if.ready = csr_req_if.ready && select_io_req; // responses - assign csr_io_rsp_if.valid = csr_rsp_if.valid[0] & select_io_rsp; + assign csr_io_rsp_if.valid = csr_rsp_if.valid & select_io_rsp; assign csr_io_rsp_if.data = csr_rsp_if.data[0]; - assign csr_commit_if.valid = csr_rsp_if.valid & {`NUM_THREADS{~select_io_rsp}}; - assign csr_commit_if.warp_num = csr_rsp_if.warp_num; - assign csr_commit_if.curr_PC = csr_rsp_if.curr_PC; - assign csr_commit_if.data = csr_rsp_if.data; - assign csr_commit_if.rd = csr_rsp_if.rd; - assign csr_commit_if.wb = csr_rsp_if.wb; + assign csr_commit_if.valid = csr_rsp_if.valid & ~select_io_rsp; + assign csr_commit_if.issue_tag= csr_rsp_if.issue_tag; + assign csr_commit_if.data = csr_rsp_if.data; assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : csr_commit_if.ready; diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 96a7973f..5e94c951 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -20,7 +20,7 @@ module VX_csr_unit #( VX_csr_req_if csr_pipe_req_if(); VX_commit_if csr_pipe_commit_if(); - wire select_io_req = (| csr_io_req_if.valid); + wire select_io_req = csr_io_req_if.valid; wire select_io_rsp; VX_csr_arb csr_arb ( @@ -44,7 +44,7 @@ module VX_csr_unit #( wire [31:0] csr_updated_data_s2; wire [31:0] csr_read_data_unqual; - wire is_csr_s2 = (| csr_pipe_commit_if.valid); + wire is_csr_s2 = csr_pipe_commit_if.valid; VX_csr_data #( .CORE_ID(CORE_ID) @@ -62,8 +62,10 @@ module VX_csr_unit #( .warp_num (csr_pipe_req_if.warp_num) ); + wire [`NW_BITS-1:0] warp_num_s2; + wire csr_hazard = (csr_addr_s2 == csr_pipe_req_if.csr_addr) - && (csr_pipe_commit_if.warp_num == csr_pipe_req_if.warp_num) + && (warp_num_s2 == csr_pipe_req_if.warp_num) && is_csr_s2; wire [31:0] csr_read_data = csr_hazard ? csr_updated_data_s2 : csr_read_data_unqual; @@ -79,24 +81,24 @@ module VX_csr_unit #( endcase end - wire stall = ~csr_pipe_commit_if.ready && (| csr_pipe_commit_if.valid); + wire stall = ~csr_pipe_commit_if.ready && csr_pipe_commit_if.valid; VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + `CSR_ADDR_SIZE + 1 + 32 + 32) + .N(1 + `ISTAG_BITS + `NW_BITS + `CSR_ADDR_SIZE + 1 + 32 + 32) ) csr_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({csr_pipe_req_if.valid, csr_pipe_req_if.warp_num, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data, csr_updated_data}), - .out ({csr_pipe_commit_if.valid, csr_pipe_commit_if.warp_num, csr_pipe_commit_if.curr_PC, csr_pipe_commit_if.rd, csr_pipe_commit_if.wb, csr_addr_s2, select_io_rsp, csr_read_data_s2, csr_updated_data_s2}) + .in ({csr_pipe_req_if.valid, csr_pipe_req_if.issue_tag, csr_pipe_req_if.warp_num, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data, csr_updated_data}), + .out ({csr_pipe_commit_if.valid, csr_pipe_commit_if.issue_tag, warp_num_s2, csr_addr_s2, select_io_rsp, csr_read_data_s2, csr_updated_data_s2}) ); genvar i; for (i = 0; i < `NUM_THREADS; i++) begin assign csr_pipe_commit_if.data[i] = (csr_addr_s2 == `CSR_LTID) ? i : - (csr_addr_s2 == `CSR_GTID) ? (csr_read_data_s2 * `NUM_THREADS + i) : - csr_read_data_s2; + (csr_addr_s2 == `CSR_GTID) ? (csr_read_data_s2 * `NUM_THREADS + i) : + csr_read_data_s2; end assign csr_pipe_req_if.ready = ~stall; diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 96b39503..61e50417 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -15,7 +15,7 @@ module VX_decode #( VX_wstall_if wstall_if, VX_join_if join_if ); - wire in_valid = (| ifetch_rsp_if.valid); + wire in_valid = ifetch_rsp_if.valid; wire [31:0] instr = ifetch_rsp_if.instr; reg [`ALU_BITS-1:0] alu_op; @@ -167,9 +167,8 @@ module VX_decode #( end // MUL - +`ifdef EXT_M_ENABLE wire is_mul = is_rtype && (func7 == 7'h1); - always @(*) begin mul_op = `MUL_MUL; case (func3) @@ -184,9 +183,15 @@ module VX_decode #( default:; endcase end +`else + wire is_mul = 0; + always @(*) begin + mul_op = `MUL_MUL; + end +`endif // FPU - +`ifdef EXT_F_ENABLE wire is_fl = (opcode == `INST_FL) && ((func3 == 2)); wire is_fs = (opcode == `INST_FS) && ((func3 == 2)); wire is_fci = (opcode == `INST_FCI); @@ -226,6 +231,15 @@ module VX_decode #( endcase end end +`else + wire is_fs = 0; + wire is_fci = 0; + wire is_fr4 = 0; + wire is_fpu = 0; + always @(*) begin + fpu_op = `FPU_OTHER; + end +`endif // GPU @@ -243,10 +257,11 @@ module VX_decode #( VX_decode_if decode_tmp_if(); - assign decode_tmp_if.valid = ifetch_rsp_if.valid; - assign decode_tmp_if.warp_num = ifetch_rsp_if.warp_num; - assign decode_tmp_if.curr_PC = ifetch_rsp_if.curr_PC; - assign decode_tmp_if.next_PC = ifetch_rsp_if.curr_PC + 32'h4; + assign decode_tmp_if.valid = ifetch_rsp_if.valid; + assign decode_tmp_if.warp_num = ifetch_rsp_if.warp_num; + assign decode_tmp_if.thread_mask= ifetch_rsp_if.thread_mask; + assign decode_tmp_if.curr_PC = ifetch_rsp_if.curr_PC; + assign decode_tmp_if.next_PC = ifetch_rsp_if.curr_PC + 32'h4; assign decode_tmp_if.ex_type = is_lsu ? `EX_LSU : is_csr ? `EX_CSR : @@ -299,29 +314,29 @@ module VX_decode #( assign wstall_if.wstall = in_valid && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR))); assign wstall_if.warp_num = ifetch_rsp_if.warp_num; - wire stall = ~decode_if.ready && (| decode_if.valid); + wire stall = ~decode_if.ready && decode_if.valid; VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + `FRM_BITS) + .N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + 1 + `FRM_BITS) ) decode_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.frm}), - .out ({decode_if.valid, decode_if.warp_num, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.frm}) + .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.rd_is_fp, decode_tmp_if.frm}), + .out ({decode_if.valid, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.rd_is_fp, decode_if.frm}) ); assign ifetch_rsp_if.ready = ~stall; `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if ((| decode_tmp_if.valid) && ~stall) begin + if (decode_tmp_if.valid && ~stall) begin $write("%t: Core%0d-Decode: warp=%0d, PC=%0h, ex=", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC); print_ex_type(decode_tmp_if.ex_type); $write(", op="); print_instr_op(decode_tmp_if.ex_type, decode_tmp_if.instr_op); - $write(", wb=%b, rd=%0d, rs1=%0d, rs2=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b\n", decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2); + $write(", tmask=%b, wb=%b, rd=%0d, rd_is_fp=%b, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b, use_rs3=%b\n", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rd_is_fp, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.use_rs3); // trap unsupported instructions assert(~(~stall && (decode_tmp_if.ex_type == `EX_ALU) && `ALU_OP(decode_tmp_if.instr_op) == `ALU_OTHER)); diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 841df9a3..04562647 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -27,12 +27,14 @@ /* verilator lint_off PINCONNECTEMPTY */ \ /* verilator lint_off WIDTH */ \ /* verilator lint_off UNOPTFLAT */ \ + /* verilator lint_off UNDRIVEN */ \ /* verilator lint_off DECLFILENAME */ `define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \ /* verilator lint_on PINCONNECTEMPTY */ \ /* verilator lint_on WIDTH */ \ /* verilator lint_on UNOPTFLAT */ \ + /* verilator lint_on UNDRIVEN */ \ /* verilator lint_on DECLFILENAME */ `define UNUSED_VAR(x) /* verilator lint_off UNUSED */ \ @@ -50,6 +52,9 @@ if (!(cond)) $error(msg); \ endgenerate +`define ENABLE_TRACING /* verilator tracing_on */ +`define DISABLE_TRACING /* verilator tracing_off */ + `define CLOG2(x) $clog2(x) `define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0)) `define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1) @@ -80,8 +85,9 @@ `define CSR_WIDTH 12 -`define LATENCY_IDIV 21 +`define ISTAG_BITS `LOG2UP(`ISSUEQ_SIZE) +`define LATENCY_IDIV 21 `define LATENCY_IMUL 2 `define LATENCY_FMULADD 2 @@ -259,19 +265,31 @@ /////////////////////////////////////////////////////////////////////////////// +`ifdef EXT_M_ENABLE + `define ISA_EXT_M (1 << 12) +`else + `define ISA_EXT_M 0 +`endif + +`ifdef EXT_F_ENABLE + `define ISA_EXT_F (1 << 5) +`else + `define ISA_EXT_F 0 +`endif + `define ISA_CODE (0 << 0) // A - Atomic Instructions extension \ | (0 << 1) // B - Tentatively reserved for Bit operations extension \ | (0 << 2) // C - Compressed extension \ | (0 << 3) // D - Double precsision floating-point extension \ | (0 << 4) // E - RV32E base ISA \ - | (`EXT_F_ENABLE << 5) // F - Single precsision floating-point extension \ + |`ISA_EXT_F // F - Single precsision floating-point extension \ | (0 << 6) // G - Additional standard extensions present \ | (0 << 7) // H - Hypervisor mode implemented \ | (1 << 8) // I - RV32I/64I/128I base ISA \ | (0 << 9) // J - Reserved \ | (0 << 10) // K - Reserved \ | (0 << 11) // L - Tentatively reserved for Bit operations extension \ - | (`EXT_M_ENABLE << 12) // M - Integer Multiply/Divide extension \ + |`ISA_EXT_M // M - Integer Multiply/Divide extension \ | (0 << 13) // N - User level interrupts supported \ | (0 << 14) // O - Reserved \ | (0 << 15) // P - Tentatively reserved for Packed-SIMD extension \ @@ -288,7 +306,7 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, warp_num +`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, warp_num `define DEBUG_CORE_REQ_MDATA_WIDTH (32 + 1 + `NR_BITS + `NW_BITS) `else `define DEBUG_CORE_REQ_MDATA_WIDTH 0 @@ -300,7 +318,7 @@ `define DCACHE_ID (((`L3_ENABLE && `L2_ENABLE) ? 2 : `L2_ENABLE ? 1 : 0) + (CORE_ID * 3) + 0) // TAG sharing enable -`define DCORE_TAG_ID_BITS `LOG2UP(`DCREQ_SIZE) +`define DCORE_TAG_ID_BITS `ISTAG_BITS // Core request tag bits `define DCORE_TAG_WIDTH (`DEBUG_CORE_REQ_MDATA_WIDTH + `DCORE_TAG_ID_BITS) @@ -335,7 +353,7 @@ `define ICORE_BYTEEN_WIDTH `DWORD_SIZE // TAG sharing enable -`define ICORE_TAG_ID_BITS `LOG2UP(`ICREQ_SIZE) +`define ICORE_TAG_ID_BITS `NW_BITS // Core request tag bits `define ICORE_TAG_WIDTH (`DEBUG_CORE_REQ_MDATA_WIDTH + `ICORE_TAG_ID_BITS) @@ -438,8 +456,6 @@ /////////////////////////////////////////////////////////////////////////////// - - task print_ex_type; input [`EX_BITS-1:0] ex; begin @@ -488,20 +504,7 @@ task print_instr_op; `ALU_DRET: $write("DRET"); default: $write("?"); endcase - end - `EX_MUL: begin - case (`MUL_BITS'(op)) - `MUL_MUL: $write("MUL"); - `MUL_MULH: $write("MULH"); - `MUL_MULHSU:$write("MULHSU"); - `MUL_MULHU: $write("MULHU"); - `MUL_DIV: $write("DIV"); - `MUL_DIVU: $write("DIVU"); - `MUL_REM: $write("REM"); - `MUL_REMU: $write("REMU"); - default: $write("?"); - endcase - end + end `EX_LSU: begin case (`LSU_BITS'(op)) `LSU_LB: $write("LB"); @@ -525,6 +528,45 @@ task print_instr_op; default: $write("?"); endcase end + `EX_MUL: begin + case (`MUL_BITS'(op)) + `MUL_MUL: $write("MUL"); + `MUL_MULH: $write("MULH"); + `MUL_MULHSU:$write("MULHSU"); + `MUL_MULHU: $write("MULHU"); + `MUL_DIV: $write("DIV"); + `MUL_DIVU: $write("DIVU"); + `MUL_REM: $write("REM"); + `MUL_REMU: $write("REMU"); + default: $write("?"); + endcase + end + `EX_FPU: begin + case (`FPU_BITS'(op)) + `FPU_ADD: $write("ADD"); + `FPU_SUB: $write("SUB"); + `FPU_MUL: $write("MUL"); + `FPU_DIV: $write("DIV"); + `FPU_SQRT: $write("SQRT"); + `FPU_MADD: $write("MADD"); + `FPU_NMSUB: $write("NMSUB"); + `FPU_NMADD: $write("NMADD"); + `FPU_SGNJ: $write("SGNJ"); + `FPU_SGNJN: $write("SGNJN"); + `FPU_SGNJX: $write("SGNJX"); + `FPU_MIN: $write("MIN"); + `FPU_MAX: $write("MAX"); + `FPU_CVTWS: $write("CVTWS"); + `FPU_CVTWUS:$write("CVTWUS"); + `FPU_CVTSW: $write("CVTSW"); + `FPU_CVTSWU:$write("CVTSWU"); + `FPU_MVXW: $write("MVXW"); + `FPU_MVWX: $write("MVWX"); + `FPU_CLASS: $write("CLASS"); + `FPU_CMP: $write("CMP"); + default: $write("?"); + endcase + end `EX_GPU: begin case (`GPU_BITS'(op)) `GPU_TMC: $write("TMC"); diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 6f9716b3..71bdba37 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -35,7 +35,7 @@ module VX_execute #( VX_commit_if lsu_commit_if, VX_commit_if csr_commit_if, VX_commit_if mul_commit_if, - VX_commit_fp_if fpu_commit_if, + VX_commit_if fpu_commit_if, VX_commit_if gpu_commit_if, output wire ebreak @@ -79,15 +79,21 @@ module VX_execute #( .csr_commit_if (csr_commit_if) ); +`ifdef EXT_M_ENABLE VX_mul_unit #( .CORE_ID(CORE_ID) ) mul_unit ( .clk (clk), .reset (reset), - .mul_req_if (mul_req_if), - .mul_commit_if (mul_commit_if) + .alu_req_if (mul_req_if), + .alu_commit_if (mul_commit_if) ); +`else + assign mul_req_if.ready = 0; + assign mul_commit_if.valid = 0; +`endif +`ifdef EXT_F_ENABLE VX_fpu_unit #( .CORE_ID(CORE_ID) ) fpu_unit ( @@ -98,6 +104,11 @@ module VX_execute #( .fpu_to_csr_if (fpu_to_csr_if), .fpu_commit_if (fpu_commit_if) ); +`else + assign fpu_req_if.ready = 0; + assign fpu_commit_if.valid = 0; + assign fpu_to_csr_if.valid = 0; +`endif VX_gpu_unit #( .CORE_ID(CORE_ID) @@ -107,7 +118,7 @@ module VX_execute #( .gpu_commit_if (gpu_commit_if) ); - assign ebreak = (| alu_req_if.valid) && (alu_req_if.alu_op == `ALU_EBREAK || alu_req_if.alu_op == `ALU_ECALL); + assign ebreak = alu_req_if.valid && (alu_req_if.alu_op == `ALU_EBREAK || alu_req_if.alu_op == `ALU_ECALL); `SCOPE_ASSIGN(scope_decode_valid, decode_if.valid); `SCOPE_ASSIGN(scope_decode_warp_num, decode_if.warp_num); diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index 4b22850b..f0622c4f 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -14,7 +14,7 @@ module VX_fpu_unit #( VX_fpu_from_csr_if fpu_from_csr_if, // outputs - VX_commit_fp_if fpu_commit_if, + VX_commit_if fpu_commit_if, VX_fpu_to_csr_if fpu_to_csr_if ); localparam FOP_BITS = fpnew_pkg::OP_BITS; @@ -98,6 +98,8 @@ module VX_fpu_unit #( assign fpu_operands = {fpu_req_if.rs3_data, fpu_req_if.rs2_data, fpu_req_if.rs1_data}; +`DISABLE_TRACING + fpnew_top #( .Features (FPU_FEATURES), .Implementation (FPU_IMPLEMENTATION), @@ -125,47 +127,28 @@ module VX_fpu_unit #( `UNUSED_PIN (busy_o) ); - wire req_push = fpu_req_if.valid && fpu_req_if.ready; - wire req_pop = fpu_out_valid && fpu_out_ready; - wire req_full; +`ENABLE_TRACING - wire [`NUM_THREADS-1:0] rsp_valid; - wire [`NW_BITS-1:0] rsp_warp_num; - wire [31:0] rsp_curr_PC; - wire rsp_wb; - wire [`NR_BITS-1:0] rsp_rd; - wire rsp_rd_is_fp; + reg [`NW_BITS-1:0] rsp_warp_num_buf [`ISSUEQ_SIZE]; - VX_index_queue #( - .DATAW (`NUM_THREADS + `NW_BITS + 32 + 1 + `NR_BITS + 1), - .SIZE (`FPURQ_SIZE) - ) fpu_req_queue ( - .clk (clk), - .reset (reset), - .write_data ({fpu_req_if.valid, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.wb, fpu_req_if.rd, fpu_req_if.rd_is_fp}), - .write_addr (fpu_in_tag), - .push (req_push), - .full (req_full), - .pop (req_pop), - .read_addr (fpu_out_tag), - .read_data ({rsp_valid, rsp_warp_num, rsp_curr_PC, rsp_wb, rsp_rd, rsp_rd_is_fp}), - `UNUSED_PIN (empty) - ); + assign fpu_in_valid = fpu_req_if.valid; + assign fpu_in_tag = fpu_req_if.issue_tag; - assign fpu_in_valid = (| fpu_req_if.valid) && ~req_full; - assign fpu_req_if.ready = fpu_in_ready && ~req_full; + always @(posedge clk) begin + if (fpu_req_if.valid && fpu_req_if.ready) begin + rsp_warp_num_buf[fpu_in_tag] <= fpu_req_if.warp_num; + end + end - assign fpu_commit_if.valid = rsp_valid & {`NUM_THREADS{fpu_out_valid}}; - assign fpu_commit_if.warp_num = rsp_warp_num; - assign fpu_commit_if.curr_PC = rsp_curr_PC; - assign fpu_commit_if.data = fpu_result; - assign fpu_commit_if.wb = rsp_wb; - assign fpu_commit_if.rd = rsp_rd; - assign fpu_commit_if.rd_is_fp = rsp_rd_is_fp; - assign fpu_out_ready = fpu_commit_if.ready; + assign fpu_req_if.ready = fpu_in_ready; - assign fpu_to_csr_if.valid = fpu_out_valid; - assign fpu_to_csr_if.warp_num = rsp_warp_num; + assign fpu_commit_if.valid = fpu_out_valid; + assign fpu_commit_if.issue_tag = fpu_out_tag; + assign fpu_commit_if.data = fpu_result; + assign fpu_out_ready = fpu_commit_if.ready; + + assign fpu_to_csr_if.valid = fpu_out_valid && fpu_req_if.ready; + assign fpu_to_csr_if.warp_num = rsp_warp_num_buf[fpu_out_tag]; assign fpu_to_csr_if.fflags_NV = fpu_status.NV; assign fpu_to_csr_if.fflags_DZ = fpu_status.DZ; assign fpu_to_csr_if.fflags_OF = fpu_status.OF; diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v index 1c7407f1..46a45064 100644 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ b/hw/rtl/VX_gpr_fp_ctrl.v @@ -50,7 +50,7 @@ module VX_gpr_fp_ctrl ( if (decode_if.rs1_is_fp) begin tmp_rs1_data <= rs1_fp_data; end else begin - tmp_rs1_data <= decode_if.rs1_is_PC ? {`NUM_THREADS{decode_if.curr_PC}} : rs1_int_data; + tmp_rs1_data <= rs1_int_data; end end end @@ -63,7 +63,7 @@ module VX_gpr_fp_ctrl ( if (decode_if.rs2_is_fp) begin tmp_rs2_data <= rs2_fp_data; end else begin - tmp_rs2_data <= decode_if.rs2_is_imm ? {`NUM_THREADS{decode_if.imm}} : rs2_int_data; + tmp_rs2_data <= rs2_int_data; end end end diff --git a/hw/rtl/VX_gpr_ram.v b/hw/rtl/VX_gpr_ram.v index 8782ae7c..a9f3e50c 100644 --- a/hw/rtl/VX_gpr_ram.v +++ b/hw/rtl/VX_gpr_ram.v @@ -35,8 +35,6 @@ module VX_gpr_ram ( ram[waddr][i][3] <= wdata[i][31:24]; end end - assert(~(|we) || (waddr != 0)); // ensure r0 is never written! - assert(0 == ram[0]); end assign rs1_data = ram[rs1]; diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 37da6ed9..58885255 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -16,11 +16,10 @@ module VX_gpr_stage #( input wire schedule_delay, output wire gpr_delay ); + `UNUSED_VAR (reset) wire [`NUM_THREADS-1:0][31:0] rs1_int_data [`NUM_WARPS-1:0]; wire [`NUM_THREADS-1:0][31:0] rs2_int_data [`NUM_WARPS-1:0]; - wire [`NUM_THREADS-1:0][31:0] rs1_fp_data [`NUM_WARPS-1:0]; - wire [`NUM_THREADS-1:0][31:0] rs2_fp_data [`NUM_WARPS-1:0]; wire [`NUM_THREADS-1:0] we [`NUM_WARPS-1:0]; wire [`NR_BITS-1:0] raddr1; @@ -29,12 +28,10 @@ module VX_gpr_stage #( genvar i; for (i = 0; i < `NUM_WARPS; i++) begin - assign we[i] = writeback_if.valid & {`NUM_THREADS{(i == writeback_if.warp_num)}}; - - // Int GPRs + assign we[i] = writeback_if.thread_mask & {`NUM_THREADS{~writeback_if.rd_is_fp && (i == writeback_if.warp_num)}}; VX_gpr_ram gpr_int_ram ( .clk (clk), - .we (we[i] & {`NUM_THREADS{~writeback_if.rd_is_fp}}), + .we (we[i]), .waddr (writeback_if.rd), .wdata (writeback_if.data), .rs1 (raddr1), @@ -42,11 +39,18 @@ module VX_gpr_stage #( .rs1_data (rs1_int_data[i]), .rs2_data (rs2_int_data[i]) ); + end - // FP GPRs +`ifdef EXT_F_ENABLE + + wire [`NUM_THREADS-1:0][31:0] rs1_fp_data [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs2_fp_data [`NUM_WARPS-1:0]; + + for (i = 0; i < `NUM_WARPS; i++) begin + assign we[i] = writeback_if.thread_mask & {`NUM_THREADS{writeback_if.rd_is_fp && (i == writeback_if.warp_num)}}; VX_gpr_ram gpr_fp_ram ( .clk (clk), - .we (we[i] & {`NUM_THREADS{writeback_if.rd_is_fp}}), + .we (we[i]), .waddr (writeback_if.rd), .wdata (writeback_if.data), .rs1 (raddr1), @@ -54,27 +58,36 @@ module VX_gpr_stage #( .rs1_data (rs1_fp_data[i]), .rs2_data (rs2_fp_data[i]) ); - - // controller for multi-cycle read - VX_gpr_fp_ctrl VX_gpr_fp_ctrl ( - .clk (clk), - .reset (reset), - - //inputs - .decode_if (decode_if), - .rs1_int_data (rs1_int_data[i]), - .rs2_int_data (rs2_int_data[i]), - .rs1_fp_data (rs1_fp_data[i]), - .rs2_fp_data (rs2_fp_data[i]), - - // outputs - .raddr1 (raddr1), - .raddr2 (raddr2), - .gpr_data_if (gpr_data_if), - .schedule_delay (schedule_delay), - .gpr_delay (gpr_delay) - ); end + + VX_gpr_fp_ctrl VX_gpr_fp_ctrl ( + .clk (clk), + .reset (reset), + + //inputs + .decode_if (decode_if), + .rs1_int_data (rs1_int_data[decode_if.warp_num]), + .rs2_int_data (rs2_int_data[decode_if.warp_num]), + .rs1_fp_data (rs1_fp_data[decode_if.warp_num]), + .rs2_fp_data (rs2_fp_data[decode_if.warp_num]), + + // outputs + .raddr1 (raddr1), + .raddr2 (raddr2), + .gpr_data_if (gpr_data_if), + .schedule_delay (schedule_delay), + .gpr_delay (gpr_delay) + ); + +`else + assign raddr1 = decode_if.rs1; + assign raddr2 = decode_if.rs2; + assign gpr_data_if.rs1_data = rs1_int_data[decode_if.warp_num]; + assign gpr_data_if.rs2_data = rs2_int_data[decode_if.warp_num]; + assign gpr_data_if.rs3_data = 0; + assign gpr_delay = 0; + `UNUSED_VAR (schedule_delay) +`endif assign writeback_if.ready = 1'b1; diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 9730fdd2..208193d6 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -10,52 +10,53 @@ module VX_gpu_unit #( VX_warp_ctl_if warp_ctl_if, VX_commit_if gpu_commit_if ); - wire [`NUM_THREADS-1:0] curr_valids = gpu_req_if.valid; + wire is_wspawn = (gpu_req_if.gpu_op == `GPU_WSPAWN); wire is_tmc = (gpu_req_if.gpu_op == `GPU_TMC); wire is_split = (gpu_req_if.gpu_op == `GPU_SPLIT); wire is_bar = (gpu_req_if.gpu_op == `GPU_BAR); - wire [`NUM_THREADS-1:0] tmc_new_mask; + wire gpu_req_fire = gpu_req_if.valid && gpu_commit_if.ready; + + assign warp_ctl_if.warp_num = gpu_req_if.warp_num; + + // tmc genvar i; - for (i = 0; i < `NUM_THREADS; i++) begin : tmc_new_mask_init + + wire [`NUM_THREADS-1:0] tmc_new_mask; + for (i = 0; i < `NUM_THREADS; i++) begin assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]); - end + end + assign warp_ctl_if.change_mask = is_tmc && gpu_req_fire; + assign warp_ctl_if.thread_mask = tmc_new_mask; - wire valid_inst = (| curr_valids); - - assign warp_ctl_if.warp_num = gpu_req_if.warp_num; + // barrier - assign warp_ctl_if.change_mask = is_tmc && valid_inst; - assign warp_ctl_if.thread_mask = is_tmc ? tmc_new_mask : 0; - - assign warp_ctl_if.whalt = warp_ctl_if.change_mask && (0 == warp_ctl_if.thread_mask); - - wire wspawn = is_wspawn && valid_inst; - wire [31:0] wspawn_pc = gpu_req_if.rs2_data; - wire [`NUM_WARPS-1:0] wspawn_new_active; - - for (i = 0; i < `NUM_WARPS; i++) begin : wspawn_new_active_init - assign wspawn_new_active[i] = (i < gpu_req_if.rs1_data[0]); - end - - assign warp_ctl_if.is_barrier = is_bar && valid_inst; + assign warp_ctl_if.is_barrier = is_bar && gpu_req_fire; assign warp_ctl_if.barrier_id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; + assign warp_ctl_if.barrier_num_warps = (`NW_BITS+1)'(gpu_req_if.rs2_data - 1); - assign warp_ctl_if.num_warps = (`NW_BITS+1)'(gpu_req_if.rs2_data - 1); + // wspawn - assign warp_ctl_if.wspawn = wspawn; - assign warp_ctl_if.wspawn_pc = wspawn_pc; - assign warp_ctl_if.wspawn_new_active = wspawn_new_active; + wire [31:0] wspawn_pc = gpu_req_if.rs2_data; + wire [`NUM_WARPS-1:0] wspawn_wmask; + for (i = 0; i < `NUM_WARPS; i++) begin + assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]); + end + assign warp_ctl_if.wspawn = is_wspawn && gpu_req_fire; + assign warp_ctl_if.wspawn_pc = wspawn_pc; + assign warp_ctl_if.wspawn_wmask = wspawn_wmask; + + // split wire[`NUM_THREADS-1:0] split_new_use_mask; wire[`NUM_THREADS-1:0] split_new_later_mask; for (i = 0; i < `NUM_THREADS; i++) begin : masks_init wire curr_bool = (gpu_req_if.rs1_data[i] == 32'b1); - assign split_new_use_mask[i] = curr_valids[i] & (curr_bool); - assign split_new_later_mask[i] = curr_valids[i] & (!curr_bool); + assign split_new_use_mask[i] = gpu_req_if.thread_mask[i] & (curr_bool); + assign split_new_later_mask[i] = gpu_req_if.thread_mask[i] & (!curr_bool); end wire [`NT_BITS:0] num_valids; @@ -63,24 +64,20 @@ module VX_gpu_unit #( VX_countones #( .N(`NUM_THREADS) ) valids_counter ( - .valids(curr_valids), + .valids(gpu_req_if.thread_mask), .count (num_valids) ); - assign warp_ctl_if.is_split = is_split && (num_valids > 1); + assign warp_ctl_if.is_split = is_split && (num_valids > 1) && gpu_req_fire; assign warp_ctl_if.do_split = (split_new_use_mask != 0) && (split_new_use_mask != {`NUM_THREADS{1'b1}}); assign warp_ctl_if.split_new_mask = split_new_use_mask; assign warp_ctl_if.split_later_mask = split_new_later_mask; assign warp_ctl_if.split_save_pc = gpu_req_if.next_PC; + // commit + assign gpu_commit_if.valid = gpu_req_if.valid; + assign gpu_commit_if.issue_tag = gpu_req_if.issue_tag; + assign gpu_commit_if.data = 0; assign gpu_req_if.ready = gpu_commit_if.ready; - // commit - assign gpu_commit_if.valid = gpu_req_if.valid; - assign gpu_commit_if.warp_num = gpu_req_if.warp_num; - assign gpu_commit_if.curr_PC = gpu_req_if.curr_PC; - assign gpu_commit_if.wb = 0; - assign gpu_commit_if.rd = 0; - assign gpu_commit_if.data = 0; - endmodule \ No newline at end of file diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index e2be9f37..525aa220 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -18,61 +18,46 @@ module VX_icache_stage #( // reponse VX_ifetch_rsp_if ifetch_rsp_if ); + `UNUSED_VAR (reset) - reg [`NUM_THREADS-1:0] valid_threads [`NUM_WARPS-1:0]; + reg [31:0] rsp_curr_PC_buf [`NUM_WARPS-1:0]; + reg [`NUM_THREADS-1:0] rsp_thread_mask_buf [`NUM_WARPS-1:0]; - wire valid_inst = (| ifetch_req_if.valid); - - wire [`LOG2UP(`ICREQ_SIZE)-1:0] mrq_write_addr, mrq_read_addr; - wire mrq_full; - - wire mrq_push = icache_req_if.valid && icache_req_if.ready; - wire mrq_pop = icache_rsp_if.valid && icache_rsp_if.ready; - - assign mrq_read_addr = icache_rsp_if.tag[0][`LOG2UP(`ICREQ_SIZE)-1:0]; - - VX_index_queue #( - .DATAW (32 + `NW_BITS), - .SIZE (`ICREQ_SIZE) - ) mem_req_queue ( - .clk (clk), - .reset (reset), - .write_data ({ifetch_req_if.curr_PC, ifetch_req_if.warp_num}), - .write_addr (mrq_write_addr), - .push (mrq_push), - .full (mrq_full), - .pop (mrq_pop), - .read_addr (mrq_read_addr), - .read_data ({ifetch_rsp_if.curr_PC, ifetch_rsp_if.warp_num}), - `UNUSED_PIN (empty) - ); + wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; + + wire [`NW_BITS-1:0] req_tag = ifetch_req_if.warp_num; + wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[0][`NW_BITS-1:0]; always @(posedge clk) begin - if (mrq_push) begin - valid_threads[ifetch_req_if.warp_num] <= ifetch_req_if.valid; - end - end + if (icache_req_fire) begin + rsp_curr_PC_buf[req_tag] <= ifetch_req_if.curr_PC; + rsp_thread_mask_buf[req_tag] <= ifetch_req_if.thread_mask; + end + end // Icache Request - assign icache_req_if.valid = valid_inst && !mrq_full; + assign icache_req_if.valid = ifetch_req_if.valid; assign icache_req_if.rw = 0; assign icache_req_if.byteen = 4'b1111; assign icache_req_if.addr = ifetch_req_if.curr_PC[31:2]; assign icache_req_if.data = 0; - // Can't accept new request - assign ifetch_req_if.ready = !mrq_full && icache_req_if.ready; + // Can accept new request? + assign ifetch_req_if.ready = icache_req_if.ready; `ifdef DBG_CORE_REQ_INFO - assign icache_req_if.tag = {ifetch_req_if.curr_PC, 1'b0, 5'b0, ifetch_req_if.warp_num, mrq_write_addr}; + assign icache_req_if.tag = {ifetch_req_if.curr_PC, 1'b0, 5'b0, ifetch_req_if.warp_num, req_tag}; `else - assign icache_req_if.tag = mrq_write_addr; + assign icache_req_if.tag = req_tag; `endif - assign ifetch_rsp_if.valid = icache_rsp_if.valid ? valid_threads[ifetch_rsp_if.warp_num] : 0; - assign ifetch_rsp_if.instr = icache_rsp_if.data[0]; + assign ifetch_rsp_if.valid = icache_rsp_if.valid; + assign ifetch_rsp_if.warp_num = rsp_tag; + assign ifetch_rsp_if.thread_mask = rsp_thread_mask_buf[rsp_tag]; + assign ifetch_rsp_if.curr_PC = rsp_curr_PC_buf[rsp_tag]; + assign ifetch_rsp_if.instr = icache_rsp_if.data[0]; - // Can't accept new response + // Can accept new response? assign icache_rsp_if.ready = ifetch_rsp_if.ready; `SCOPE_ASSIGN(scope_icache_req_valid, icache_req_if.valid); @@ -89,10 +74,10 @@ module VX_icache_stage #( `ifdef DBG_PRINT_CORE_ICACHE always @(posedge clk) begin if (icache_req_if.valid && icache_req_if.ready) begin - $display("%t: I$%0d req: tag=%0h, PC=%0h, warp=%0d", $time, CORE_ID, mrq_write_addr, ifetch_req_if.curr_PC, ifetch_req_if.warp_num); + $display("%t: I$%0d req: warp=%0d, PC=%0h", $time, CORE_ID, ifetch_req_if.warp_num, ifetch_req_if.curr_PC); end if (icache_rsp_if.valid && icache_rsp_if.ready) begin - $display("%t: I$%0d rsp: tag=%0h, PC=%0h, warp=%0d, instr=%0h", $time, CORE_ID, mrq_read_addr, ifetch_rsp_if.curr_PC, ifetch_rsp_if.warp_num, ifetch_rsp_if.instr); + $display("%t: I$%0d rsp: warp=%0d, PC=%0h, instr=%0h", $time, CORE_ID, ifetch_rsp_if.warp_num, ifetch_req_if.curr_PC, ifetch_rsp_if.instr); end end `endif diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 3392f719..7d10193b 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -8,6 +8,7 @@ module VX_issue #( VX_decode_if decode_if, VX_wb_if writeback_if, + VX_commit_is_if commit_is_if, VX_alu_req_if alu_req_if, VX_lsu_req_if lsu_req_if, @@ -19,6 +20,7 @@ module VX_issue #( VX_gpr_data_if gpr_data_if(); wire schedule_delay; wire gpr_delay; + wire [`ISTAG_BITS-1:0] issue_tag, issue_tmp_tag; wire alu_busy = ~alu_req_if.ready; wire lsu_busy = ~lsu_req_if.ready; @@ -33,14 +35,16 @@ module VX_issue #( .clk (clk), .reset (reset), .decode_if (decode_if), - .writeback_if (writeback_if), + .writeback_if (writeback_if), + .commit_is_if (commit_is_if), .gpr_busy (gpr_delay), .alu_busy (alu_busy), .lsu_busy (lsu_busy), .csr_busy (csr_busy), .mul_busy (mul_busy), .fpu_busy (fpu_busy), - .gpu_busy (gpu_busy), + .gpu_busy (gpu_busy), + .issue_tag (issue_tag), .schedule_delay (schedule_delay), `UNUSED_PIN (is_empty) ); @@ -57,123 +61,54 @@ module VX_issue #( .gpr_delay (gpr_delay) ); - VX_alu_req_if alu_req_tmp_if(); - VX_lsu_req_if lsu_req_tmp_if(); - VX_csr_req_if csr_req_tmp_if(); - VX_mul_req_if mul_req_tmp_if(); - VX_fpu_req_if fpu_req_tmp_if(); - VX_gpu_req_if gpu_req_tmp_if(); + VX_decode_if decode_tmp_if(); + VX_gpr_data_if gpr_data_tmp_if(); + + wire stall = ~alu_req_if.ready || schedule_delay; + wire flush = alu_req_if.ready && schedule_delay; + + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) + ) decode_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (flush), + .in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.frm, gpr_data_if.rs1_data, gpr_data_if.rs2_data, gpr_data_if.rs3_data}), + .out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.frm, gpr_data_tmp_if.rs1_data, gpr_data_tmp_if.rs2_data, gpr_data_tmp_if.rs3_data}) + ); VX_issue_demux issue_demux ( - .decode_if (decode_if), - .gpr_data_if (gpr_data_if), - .alu_req_if (alu_req_tmp_if), - .lsu_req_if (lsu_req_tmp_if), - .csr_req_if (csr_req_tmp_if), - .mul_req_if (mul_req_tmp_if), - .fpu_req_if (fpu_req_tmp_if), - .gpu_req_if (gpu_req_tmp_if) + .decode_if (decode_tmp_if), + .gpr_data_if (gpr_data_tmp_if), + .issue_tag (issue_tmp_tag), + .alu_req_if (alu_req_if), + .lsu_req_if (lsu_req_if), + .csr_req_if (csr_req_if), + .mul_req_if (mul_req_if), + .fpu_req_if (fpu_req_if), + .gpu_req_if (gpu_req_if) ); - wire stall_alu = ~alu_req_if.ready || schedule_delay; - wire stall_lsu = ~lsu_req_if.ready || schedule_delay; - wire stall_csr = ~csr_req_if.ready || schedule_delay; - wire stall_mul = ~mul_req_if.ready || schedule_delay; - wire stall_fpu = ~fpu_req_if.ready || schedule_delay; - wire stall_gpu = ~gpu_req_if.ready || schedule_delay; - - wire flush_alu = alu_req_if.ready && schedule_delay; - wire flush_lsu = lsu_req_if.ready && schedule_delay; - wire flush_csr = csr_req_if.ready && schedule_delay; - wire flush_mul = mul_req_if.ready && schedule_delay; - wire flush_fpu = fpu_req_if.ready && schedule_delay; - wire flush_gpu = gpu_req_if.ready && schedule_delay; - - VX_generic_register #( - .N(`NUM_THREADS +`NW_BITS + 32 + `ALU_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 32) - ) alu_reg ( - .clk (clk), - .reset (reset), - .stall (stall_alu), - .flush (flush_alu), - .in ({alu_req_tmp_if.valid, alu_req_tmp_if.warp_num, alu_req_tmp_if.curr_PC, alu_req_tmp_if.alu_op, alu_req_tmp_if.wb, alu_req_tmp_if.rd, alu_req_tmp_if.rs1_data, alu_req_tmp_if.rs2_data, alu_req_tmp_if.offset, alu_req_tmp_if.next_PC}), - .out ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.alu_op, alu_req_if.wb, alu_req_if.rd, alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.offset, alu_req_if.next_PC}) - ); - - VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + 1 + `BYTEEN_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32) - ) lsu_reg ( - .clk (clk), - .reset (reset), - .stall (stall_lsu), - .flush (flush_lsu), - .in ({lsu_req_tmp_if.valid, lsu_req_tmp_if.warp_num, lsu_req_tmp_if.curr_PC, lsu_req_tmp_if.rw, lsu_req_tmp_if.byteen, lsu_req_tmp_if.wb, lsu_req_tmp_if.rd, lsu_req_tmp_if.base_addr, lsu_req_tmp_if.offset, lsu_req_tmp_if.store_data}), - .out ({lsu_req_if.valid, lsu_req_if.warp_num, lsu_req_if.curr_PC, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.wb, lsu_req_if.rd, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data}) - ); - - VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `CSR_BITS + 1 + `NR_BITS + `CSR_ADDR_SIZE + 32 + 1) - ) csr_reg ( - .clk (clk), - .reset (reset), - .stall (stall_csr), - .flush (flush_csr), - .in ({csr_req_tmp_if.valid, csr_req_tmp_if.warp_num, csr_req_tmp_if.curr_PC, csr_req_tmp_if.csr_op, csr_req_tmp_if.wb, csr_req_tmp_if.rd, csr_req_tmp_if.csr_addr, csr_req_tmp_if.csr_mask, csr_req_tmp_if.is_io}), - .out ({csr_req_if.valid, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.csr_op, csr_req_if.wb, csr_req_if.rd, csr_req_if.csr_addr, csr_req_if.csr_mask, csr_req_if.is_io}) - ); - - VX_generic_register #( - .N(`NUM_THREADS +`NW_BITS + 32 + `MUL_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) - ) mul_reg ( - .clk (clk), - .reset (reset), - .stall (stall_mul), - .flush (flush_mul), - .in ({mul_req_tmp_if.valid, mul_req_tmp_if.warp_num, mul_req_tmp_if.curr_PC, mul_req_tmp_if.mul_op, mul_req_tmp_if.wb, mul_req_tmp_if.rd, mul_req_tmp_if.rs1_data, mul_req_tmp_if.rs2_data}), - .out ({mul_req_if.valid, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.mul_op, mul_req_if.wb, mul_req_if.rd, mul_req_if.rs1_data, mul_req_if.rs2_data}) - ); - - VX_generic_register #( - .N(`NUM_THREADS +`NW_BITS + 32 + `FPU_BITS + 1 + `NR_BITS + 1 + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + `FRM_BITS) - ) fpu_reg ( - .clk (clk), - .reset (reset), - .stall (stall_fpu), - .flush (flush_fpu), - .in ({fpu_req_tmp_if.valid, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rd_is_fp, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data, fpu_req_tmp_if.frm}), - .out ({fpu_req_if.valid, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.fpu_op, fpu_req_if.wb, fpu_req_if.rd, fpu_req_if.rd_is_fp, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data, fpu_req_if.frm}) - ); - - VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `GPU_BITS + (`NUM_THREADS * 32) + 32 + 32) - ) gpu_reg ( - .clk (clk), - .reset (reset), - .stall (stall_gpu), - .flush (flush_gpu), - .in ({gpu_req_tmp_if.valid, gpu_req_tmp_if.warp_num, gpu_req_tmp_if.curr_PC, gpu_req_tmp_if.gpu_op, gpu_req_tmp_if.rs1_data, gpu_req_tmp_if.rs2_data, gpu_req_tmp_if.next_PC}), - .out ({gpu_req_if.valid, gpu_req_if.warp_num, gpu_req_if.curr_PC, gpu_req_if.gpu_op, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.next_PC}) - ); - `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if ((| alu_req_tmp_if.valid) && ~stall_alu) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=ALU, op=%0d, wb=%d, rd=%0d, rs1=%0h, rs2=%0h, offset=%0h, next_PC=%0h", $time, CORE_ID, alu_req_tmp_if.warp_num, alu_req_tmp_if.curr_PC, alu_req_tmp_if.alu_op, alu_req_tmp_if.wb, alu_req_tmp_if.rd, alu_req_tmp_if.rs1_data, alu_req_tmp_if.rs2_data, alu_req_tmp_if.offset, alu_req_tmp_if.next_PC); + if (alu_req_if.valid && ~stall) begin + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, wb=%d, rd=%0d, rs1_data=%0h, rs2_data=%0h, offset=%0h, next_PC=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.offset, alu_req_if.next_PC); end - if ((| mul_req_tmp_if.valid) && ~stall_mul) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, op=%0d, wb=%d, rd=%0d, rs1=%0h, rs2=%0h", $time, CORE_ID, mul_req_tmp_if.warp_num, mul_req_tmp_if.curr_PC, mul_req_tmp_if.mul_op, mul_req_tmp_if.wb, mul_req_tmp_if.rd, mul_req_tmp_if.rs1_data, mul_req_tmp_if.rs2_data); + if (lsu_req_if.valid && ~stall) begin + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, byteen=%b, baddr=%0h, offset=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, lsu_req_if.rw, decode_tmp_if.rd, decode_tmp_if.wb, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset); end - if ((| fpu_req_tmp_if.valid) && ~stall_fpu) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, op=%0d, wb=%d, rd=%0d, rs1=%0h, rs2=%0h", $time, CORE_ID, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data); + if (csr_req_if.valid && ~stall) begin + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, wb=%d, rd=%0d, addr=%0h, mask=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, csr_req_if.csr_addr, csr_req_if.csr_mask); end - if ((| lsu_req_tmp_if.valid) && ~stall_lsu) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=LSU, rw=%b, wb=%0d, rd=%0d, byteen=%b, baddr=%0h, offset=%0h", $time, CORE_ID, lsu_req_tmp_if.warp_num, lsu_req_tmp_if.curr_PC, lsu_req_tmp_if.rw, lsu_req_tmp_if.rd, lsu_req_tmp_if.wb, lsu_req_tmp_if.byteen, lsu_req_tmp_if.base_addr, lsu_req_tmp_if.offset); + if (mul_req_if.valid && ~stall) begin + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, wb=%d, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, mul_req_if.rs1_data, mul_req_if.rs2_data); end - if ((| csr_req_tmp_if.valid) && ~stall_csr) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=CSR, op=%0d, wb=%d, rd=%0d, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_tmp_if.warp_num, csr_req_tmp_if.curr_PC, csr_req_tmp_if.csr_op, csr_req_tmp_if.wb, csr_req_tmp_if.rd, csr_req_tmp_if.csr_addr, csr_req_tmp_if.csr_mask); + if (fpu_req_if.valid && ~stall) begin + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, wb=%d, rd=%0d, frm=%0h, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, fpu_req_if.frm, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); end - if ((| gpu_req_tmp_if.valid) && ~stall_gpu) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=GPU, op=%0d, rs1=%0h, rs2=%0h", $time, CORE_ID, gpu_req_tmp_if.warp_num, gpu_req_tmp_if.curr_PC, gpu_req_tmp_if.gpu_op, gpu_req_tmp_if.rs1_data, gpu_req_tmp_if.rs2_data); + if (gpu_req_if.valid && ~stall) begin + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); end end `endif diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 06ea3a53..7198f97a 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -19,18 +19,19 @@ module VX_lsu_unit #( VX_commit_if lsu_commit_if ); - wire [`NUM_THREADS-1:0] use_valid; + wire use_valid; + wire [`NUM_THREADS-1:0] use_thread_mask; wire use_req_rw; wire [`NUM_THREADS-1:0][29:0] use_req_addr; wire [`NUM_THREADS-1:0][1:0] use_req_offset; wire [`NUM_THREADS-1:0][3:0] use_req_byteen; wire [`NUM_THREADS-1:0][31:0] use_req_data; - wire [`BYTEEN_BITS-1:0] mem_byteen; + wire [`BYTEEN_BITS-1:0] use_req_fullbyteen; wire [`NR_BITS-1:0] use_rd; wire [`NW_BITS-1:0] use_warp_num; + wire [`ISTAG_BITS-1:0] use_issue_tag; wire use_wb; wire [31:0] use_pc; - wire mrq_full; genvar i; @@ -60,126 +61,138 @@ module VX_lsu_unit #( assign mem_req_data[i] = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0}; end - // Can accept new request - wire stall = ~dcache_req_if.ready || mrq_full; - assign lsu_req_if.ready = ~stall; + wire store_stalled; + wire stall_in = store_stalled || ~dcache_req_if.ready; + + // Can accept new request? + assign lsu_req_if.ready = ~stall_in; `IGNORE_WARNINGS_BEGIN wire [`NUM_THREADS-1:0][31:0] use_address; `IGNORE_WARNINGS_END VX_generic_register #( - .N(`NUM_THREADS + (`NUM_THREADS * 32) + `BYTEEN_BITS + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + `NW_BITS + 1 + 32) - ) mem_req_reg ( + .N(1 + `NW_BITS + `NUM_THREADS + `ISTAG_BITS + (`NUM_THREADS * 32) + `BYTEEN_BITS + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + 1 + 32) + ) lsu_req_reg ( .clk (clk), .reset (reset), - .stall (stall), + .stall (stall_in), .flush (0), - .in ({lsu_req_if.valid, full_address, lsu_req_if.byteen, lsu_req_if.rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.warp_num, lsu_req_if.wb, lsu_req_if.curr_PC}), - .out ({use_valid , use_address, mem_byteen , use_req_rw, use_req_addr, use_req_offset, use_req_byteen, use_req_data, use_rd , use_warp_num , use_wb , use_pc}) + .in ({lsu_req_if.valid, lsu_req_if.warp_num, lsu_req_if.thread_mask, lsu_req_if.issue_tag, full_address, lsu_req_if.byteen, lsu_req_if.rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.curr_PC}), + .out ({use_valid, use_warp_num, use_thread_mask, use_issue_tag, use_address, use_req_fullbyteen, use_req_rw, use_req_addr, use_req_offset, use_req_byteen, use_req_data, use_rd, use_wb, use_pc}) ); - reg [`NUM_THREADS-1:0] mem_rsp_mask[`DCREQ_SIZE-1:0]; + reg [`NUM_THREADS-1:0] mem_rsp_mask_buf [`ISSUEQ_SIZE-1:0]; + reg [`NUM_THREADS-1:0][1:0] mem_rsp_offset_buf [`ISSUEQ_SIZE-1:0]; + reg [`BYTEEN_BITS-1:0] mem_rsp_fullbyteen_buf [`ISSUEQ_SIZE-1:0]; + reg [`NUM_THREADS-1:0][31:0] mem_rsp_data_all_buf [`ISSUEQ_SIZE-1:0]; + reg [`NW_BITS-1:0] mem_rsp_warp_num_buf [`ISSUEQ_SIZE-1:0]; + reg [31:0] mem_rsp_curr_PC_buf [`ISSUEQ_SIZE-1:0]; + reg [`NR_BITS-1:0] mem_rsp_rd_buf [`ISSUEQ_SIZE-1:0]; - wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_write_addr; - wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset; - wire [`BYTEEN_BITS-1:0] core_rsp_mem_read; - - wire mrq_push = (| dcache_req_if.valid) && dcache_req_if.ready - && (0 == use_req_rw); // only push read requests + reg [`NUM_THREADS-1:0][31:0] mem_rsp_data; - wire mrq_pop_part = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; - - wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_read_addr = dcache_rsp_if.tag[0][`LOG2UP(`DCREQ_SIZE)-1:0]; + wire [`ISTAG_BITS-1:0] rsp_issue_tag = dcache_rsp_if.tag[0][`ISTAG_BITS-1:0]; - wire [`NUM_THREADS-1:0] mem_rsp_mask_upd = mem_rsp_mask[mrq_read_addr] & ~dcache_rsp_if.valid; + wire [`NUM_THREADS-1:0] mem_rsp_mask = mem_rsp_mask_buf [rsp_issue_tag]; + wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset = mem_rsp_offset_buf [rsp_issue_tag]; + wire [`BYTEEN_BITS-1:0] mem_rsp_fullbyteen = mem_rsp_fullbyteen_buf [rsp_issue_tag]; + wire [`NUM_THREADS-1:0][31:0] mem_rsp_data_all = mem_rsp_data_all_buf [rsp_issue_tag]; + wire [`NW_BITS-1:0] mem_rsp_warp_num = mem_rsp_warp_num_buf [rsp_issue_tag]; + wire [31:0] mem_rsp_curr_PC = mem_rsp_curr_PC_buf [rsp_issue_tag]; + wire [`NR_BITS-1:0] mem_rsp_rd = mem_rsp_rd_buf [rsp_issue_tag]; - wire mrq_pop = mrq_pop_part && (0 == mem_rsp_mask_upd); + wire [`NUM_THREADS-1:0] mem_rsp_mask_n = mem_rsp_mask & ~dcache_rsp_if.valid; - VX_index_queue #( - .DATAW (32 + 1 + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS), - .SIZE (`DCREQ_SIZE) - ) mem_req_queue ( - .clk (clk), - .reset (reset), - .write_data ({use_pc, use_wb, use_req_offset, mem_byteen, use_rd, use_warp_num}), - .write_addr (mrq_write_addr), - .push (mrq_push), - .full (mrq_full), - .pop (mrq_pop), - .read_addr (mrq_read_addr), - .read_data ({lsu_commit_if.curr_PC, lsu_commit_if.wb, mem_rsp_offset, core_rsp_mem_read, lsu_commit_if.rd, lsu_commit_if.warp_num}), - `UNUSED_PIN (empty) - ); + wire dcache_req_fire = (| dcache_req_if.valid) && dcache_req_if.ready; + wire dcache_rsp_fire = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; always @(posedge clk) begin - if (mrq_push) begin - mem_rsp_mask[mrq_write_addr] <= use_valid; + if (dcache_req_fire && (0 == use_req_rw)) begin + mem_rsp_mask_buf[use_issue_tag] <= use_thread_mask; + mem_rsp_offset_buf[use_issue_tag] <= use_req_offset; + mem_rsp_fullbyteen_buf[use_issue_tag] <= use_req_fullbyteen; + mem_rsp_data_all_buf[use_issue_tag] <= 0; + mem_rsp_warp_num_buf[use_issue_tag] <= use_warp_num; + mem_rsp_curr_PC_buf[use_issue_tag] <= use_pc; + mem_rsp_rd_buf[use_issue_tag] <= use_rd; end - if (mrq_pop_part) begin - mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd; + if (dcache_rsp_fire) begin + mem_rsp_mask_buf[rsp_issue_tag] <= mem_rsp_mask_n; + mem_rsp_data_all_buf[rsp_issue_tag] <= mem_rsp_data_all | mem_rsp_data; end end // Core Request - assign dcache_req_if.valid = use_valid & {`NUM_THREADS{~mrq_full}}; + assign dcache_req_if.valid = {`NUM_THREADS{use_valid && ~store_stalled}} & use_thread_mask; assign dcache_req_if.rw = {`NUM_THREADS{use_req_rw}}; assign dcache_req_if.byteen = use_req_byteen; assign dcache_req_if.addr = use_req_addr; assign dcache_req_if.data = use_req_data; `ifdef DBG_CORE_REQ_INFO - assign dcache_req_if.tag = {use_pc, use_wb, use_rd, use_warp_num, mrq_write_addr}; + assign dcache_req_if.tag = {use_pc, use_wb, use_rd, use_warp_num, use_issue_tag}; `else - assign dcache_req_if.tag = mrq_write_addr; + assign dcache_req_if.tag = use_issue_tag; `endif - // Core Response - reg [`NUM_THREADS-1:0][31:0] core_rsp_data; - + // Core Response for (i = 0; i < `NUM_THREADS; i++) begin wire [15:0] rsp_data_shifted = 16'(dcache_rsp_if.data[i] >> {mem_rsp_offset[i], 3'b0}); always @(*) begin - case (core_rsp_mem_read) - `BYTEEN_SB: core_rsp_data[i] = {{24{rsp_data_shifted[7]}}, rsp_data_shifted[7:0]}; - `BYTEEN_UB: core_rsp_data[i] = 32'(rsp_data_shifted[7:0]); - `BYTEEN_SH: core_rsp_data[i] = {{16{rsp_data_shifted[15]}}, rsp_data_shifted[15:0]}; - `BYTEEN_UH: core_rsp_data[i] = 32'(rsp_data_shifted[15:0]); - default: core_rsp_data[i] = dcache_rsp_if.data[i]; + case (mem_rsp_fullbyteen) + `BYTEEN_SB: mem_rsp_data[i] = {{24{rsp_data_shifted[7]}}, rsp_data_shifted[7:0]}; + `BYTEEN_UB: mem_rsp_data[i] = 32'(rsp_data_shifted[7:0]); + `BYTEEN_SH: mem_rsp_data[i] = {{16{rsp_data_shifted[15]}}, rsp_data_shifted[15:0]}; + `BYTEEN_UH: mem_rsp_data[i] = 32'(rsp_data_shifted[15:0]); + default: mem_rsp_data[i] = dcache_rsp_if.data[i]; endcase end end - assign lsu_commit_if.valid = dcache_rsp_if.valid; - assign lsu_commit_if.data = core_rsp_data; + wire is_store_rsp = dcache_req_fire && use_req_rw; + wire is_load_rsp = (| dcache_rsp_if.valid) && (0 == mem_rsp_mask_n); + + assign store_stalled = use_req_rw && (~lsu_commit_if.ready + || is_load_rsp); // arbitration prioritizes LOAD + + assign lsu_commit_if.valid = is_load_rsp || is_store_rsp; + assign lsu_commit_if.issue_tag = is_load_rsp ? rsp_issue_tag : use_issue_tag; + assign lsu_commit_if.data = mem_rsp_data | mem_rsp_data_all; // Can accept new cache response assign dcache_rsp_if.ready = lsu_commit_if.ready; - `SCOPE_ASSIGN(scope_dcache_req_valid, dcache_req_if.valid); - `SCOPE_ASSIGN(scope_dcache_req_warp_num, use_warp_num); - `SCOPE_ASSIGN(scope_dcache_req_curr_PC, use_pc); + // scope registration + `SCOPE_ASSIGN(scope_dcache_req_valid, dcache_req_if.valid); `SCOPE_ASSIGN(scope_dcache_req_addr, use_address); - `SCOPE_ASSIGN(scope_dcache_req_rw, core_req_rw); + `SCOPE_ASSIGN(scope_dcache_req_rw, dcache_req_if.rw ); `SCOPE_ASSIGN(scope_dcache_req_byteen,dcache_req_if.byteen); `SCOPE_ASSIGN(scope_dcache_req_data, dcache_req_if.data); `SCOPE_ASSIGN(scope_dcache_req_tag, dcache_req_if.tag); - `SCOPE_ASSIGN(scope_dcache_req_ready, dcache_req_if.ready); + `SCOPE_ASSIGN(scope_dcache_req_ready, dcache_req_if.ready); + `SCOPE_ASSIGN(scope_dcache_req_warp_num, use_warp_num); + `SCOPE_ASSIGN(scope_dcache_req_curr_PC, use_pc); `SCOPE_ASSIGN(scope_dcache_rsp_valid, dcache_rsp_if.valid); `SCOPE_ASSIGN(scope_dcache_rsp_data, dcache_rsp_if.data); `SCOPE_ASSIGN(scope_dcache_rsp_tag, dcache_rsp_if.tag); `SCOPE_ASSIGN(scope_dcache_rsp_ready, dcache_rsp_if.ready); + + `UNUSED_VAR (mem_rsp_warp_num) + `UNUSED_VAR (mem_rsp_curr_PC) + `UNUSED_VAR (mem_rsp_rd) + `UNUSED_VAR (use_wb) `ifdef DBG_PRINT_CORE_DCACHE always @(posedge clk) begin if ((| dcache_req_if.valid) && dcache_req_if.ready) begin - $display("%t: D$%0d req: valid=%b, warp=%0d, PC=%0h, addr=%0h, tag=%0h, rw=%0b, rd=%0d, byteen=%0h, data=%0h", - $time, CORE_ID, use_valid, use_warp_num, use_pc, use_address, mrq_write_addr, use_req_rw, use_rd, use_req_byteen, use_req_data); + $display("%t: D$%0d req: valid=%b, warp=%0d, PC=%0h, addr=%0h, tag=%0h, rd=%0d, rw=%0b, byteen=%0h, data=%0h", + $time, CORE_ID, dcache_req_if.valid, use_warp_num, use_pc, use_address, dcache_req_if.tag, use_rd, dcache_req_if.rw, dcache_req_if.byteen, dcache_req_if.data); end if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin $display("%t: D$%0d rsp: valid=%b, warp=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h", - $time, CORE_ID, lsu_commit_if.valid, lsu_commit_if.warp_num, lsu_commit_if.curr_PC, mrq_read_addr, lsu_commit_if.rd, lsu_commit_if.data); + $time, CORE_ID, dcache_rsp_if.valid, mem_rsp_warp_num, mem_rsp_curr_PC, dcache_rsp_if.tag, mem_rsp_rd, dcache_rsp_if.data); end end `endif diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 8399857a..f67a9685 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -245,8 +245,8 @@ module VX_mem_unit # ( .SNOOP_FORWARDING (0), .DRAM_ENABLE (1), .WRITE_ENABLE (0), - .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), - .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), + .CORE_TAG_WIDTH (`ICORE_TAG_WIDTH), + .CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS), .DRAM_TAG_WIDTH (`IDRAM_TAG_WIDTH) ) icache ( `SCOPE_SIGNALS_CACHE_UNBIND diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 9ec58aa4..34744bd0 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -7,31 +7,58 @@ module VX_mul_unit #( input wire reset, // Inputs - VX_mul_req_if mul_req_if, + VX_mul_req_if alu_req_if, // Outputs - VX_commit_if mul_commit_if + VX_commit_if alu_commit_if ); - reg [`NUM_THREADS-1:0][31:0] alu_result; - wire [`NUM_THREADS-1:0][63:0] mul_result; - wire [`NUM_THREADS-1:0][31:0] div_result; - wire [`NUM_THREADS-1:0][31:0] rem_result; + + wire [`MUL_BITS-1:0] alu_op = alu_req_if.mul_op; + wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data; + wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data; - wire [`MUL_BITS-1:0] alu_op = mul_req_if.mul_op; - wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data; - wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data; + wire [`NUM_THREADS-1:0][31:0] mul_result, div_result; - genvar i; + wire stall_mul, stall_div; + + wire is_mul_op = (alu_op == `MUL_MUL); + wire is_div_op = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU); + + reg [`NUM_THREADS-1:0] is_div_op_in; + wire [`NUM_THREADS-1:0] is_div_op_out; + wire is_mul_op_out; + + genvar i; for (i = 0; i < `NUM_THREADS; i++) begin wire [32:0] mul_in1 = {(alu_op != `MUL_MULHU) & alu_in1[i][31], alu_in1[i]}; wire [32:0] mul_in2 = {(alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU) & alu_in2[i][31], alu_in2[i]}; - - wire [32:0] div_in1 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in1[i][31], alu_in1[i]}; - wire [32:0] div_in2 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in2[i][31], alu_in2[i]}; - VX_mult #( + reg [32:0] div_in1, div_in2; + + // handle divide by zero + always @(*) begin + is_div_op_in[i] = is_div_op; + div_in1 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in1[i][31], alu_in1[i]}; + div_in2 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in2[i][31], alu_in2[i]}; + + if (0 == alu_in2[i]) begin + if (is_div_op) begin + div_in1 = {1'b0, 32'hFFFFFFFF}; // quotient = (0xFFFFFFFF / 1) + div_in2 = 1; + end else begin + is_div_op_in[i] = 1; // remainder = (in1 / 1) + div_in2 = 1; + end + end + end + + wire [63:0] mul_result_tmp; + wire [31:0] div_result_tmp; + wire [31:0] rem_result_tmp; + + VX_multiplier #( .WIDTHA(33), .WIDTHB(33), .WIDTHP(64), @@ -40,9 +67,10 @@ module VX_mul_unit #( ) multiplier ( .clk(clk), .reset(reset), + .clk_en(~stall_mul), .dataa(mul_in1), .datab(mul_in2), - .result(mul_result[i]) + .result(mul_result_tmp) ); VX_divide #( @@ -53,75 +81,58 @@ module VX_mul_unit #( .NSIGNED(1), .DSIGNED(1), .PIPELINE(`LATENCY_IDIV) - ) sdiv ( + ) divide ( .clk(clk), .reset(reset), + .clk_en(~stall_div), .numer(div_in1), .denom(div_in2), - .quotient(div_result[i]), - .remainder(rem_result[i]) + .quotient(div_result_tmp), + .remainder(rem_result_tmp) ); - - always @(*) begin - case (alu_op) - `MUL_MUL: alu_result[i] = mul_result[i][31:0]; - `MUL_MULH, - `MUL_MULHSU, - `MUL_MULHU: alu_result[i] = mul_result[i][63:32]; - `MUL_DIV, - `MUL_DIVU: alu_result[i] = (alu_in2[i] == 0) ? 32'hffffffff : div_result[i]; - `MUL_REM, - `MUL_REMU: alu_result[i] = (alu_in2[i] == 0) ? alu_in1[i] : rem_result[i]; - default: alu_result[i] = alu_in1[i] + alu_in2[i]; // ADD, LUI, AUIPC, FENCE - endcase - end + + assign mul_result[i] = is_mul_op_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; + assign div_result[i] = is_div_op_out[i] ? div_result_tmp : rem_result_tmp; end - wire stall; + wire mul_valid_out; + wire div_valid_out; - reg result_avail; - reg [4:0] pending_ctr; - wire [4:0] instr_delay = `IS_DIV_OP(alu_op) ? `LATENCY_IDIV : `LATENCY_IMUL; + wire [`ISTAG_BITS-1:0] mul_issue_tag; + wire [`ISTAG_BITS-1:0] div_issue_tag; - always @(posedge clk) begin - if (reset) begin - result_avail <= 0; - pending_ctr <= 0; - end else begin - if (result_avail && !stall) begin - result_avail <= 0; - pending_ctr <= 0; - end - if ((| mul_req_if.valid) && (pending_ctr == 0)) begin - pending_ctr <= instr_delay - 1; - if (instr_delay == 1) - result_avail <= 1; - end else if (pending_ctr != 0) begin - pending_ctr <= pending_ctr - 1; - if (pending_ctr == 1) - result_avail <= 1; - end - end - end + VX_shift_register #( + .DATAW(1 + `ISTAG_BITS + 1), + .DEPTH(`LATENCY_IMUL) + ) mul_delay ( + .clk(clk), + .reset(reset), + .enable(~stall_mul), + .in({alu_req_if.valid && ~`IS_DIV_OP(alu_op), alu_req_if.issue_tag, is_mul_op}), + .out({mul_valid_out, mul_issue_tag, is_mul_op_out}) + ); - wire pipeline_stall = ~result_avail && (| mul_req_if.valid); + VX_shift_register #( + .DATAW(1 + `ISTAG_BITS + `NUM_THREADS), + .DEPTH(`LATENCY_IDIV) + ) div_delay ( + .clk(clk), + .reset(reset), + .enable(~stall_div), + .in({alu_req_if.valid && `IS_DIV_OP(alu_op), alu_req_if.issue_tag, is_div_op_in}), + .out({div_valid_out, div_issue_tag, is_div_op_out}) + ); + + wire stall_out = (~alu_commit_if.ready && alu_commit_if.valid); + assign stall_mul = stall_out; + assign stall_div = stall_out + || (mul_valid_out && div_valid_out); // arbitration prioritizes MUL + + // can accept new request? + assign alu_req_if.ready = ~(stall_mul || stall_div); + + assign alu_commit_if.valid = mul_valid_out || div_valid_out; + assign alu_commit_if.issue_tag = mul_valid_out ? mul_issue_tag : div_issue_tag; + assign alu_commit_if.data = mul_valid_out ? mul_result : div_result; - assign stall = (~mul_commit_if.ready && (| mul_commit_if.valid)) - || pipeline_stall; - - wire flush = mul_commit_if.ready && pipeline_stall; - - VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) - ) mul_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (flush), - .in ({mul_req_if.valid, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.rd, mul_req_if.wb, alu_result}), - .out ({mul_commit_if.valid, mul_commit_if.warp_num, mul_commit_if.curr_PC, mul_commit_if.rd, mul_commit_if.wb, mul_commit_if.data}) - ); - - assign mul_req_if.ready = ~stall; - endmodule \ No newline at end of file diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index 23c21ab9..09a73ddf 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -113,13 +113,14 @@ module VX_pipeline #( VX_fpu_req_if fpu_req_if(); VX_gpu_req_if gpu_req_if(); VX_wb_if writeback_if(); + VX_commit_is_if commit_is_if(); VX_wstall_if wstall_if(); VX_join_if join_if(); VX_commit_if alu_commit_if(); VX_commit_if lsu_commit_if(); VX_commit_if csr_commit_if(); VX_commit_if mul_commit_if(); - VX_commit_fp_if fpu_commit_if(); + VX_commit_if fpu_commit_if(); VX_commit_if gpu_commit_if(); VX_fetch #( @@ -156,6 +157,7 @@ module VX_pipeline #( .decode_if (decode_if), .writeback_if (writeback_if), + .commit_is_if (commit_is_if), .alu_req_if (alu_req_if), .lsu_req_if (lsu_req_if), @@ -212,6 +214,7 @@ module VX_pipeline #( .fpu_commit_if (fpu_commit_if), .gpu_commit_if (gpu_commit_if), + .commit_is_if (commit_is_if), .writeback_if (writeback_if), .perf_cntrs_if (perf_cntrs_if) ); diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v index f20b8b87..c462d157 100644 --- a/hw/rtl/VX_scheduler.v +++ b/hw/rtl/VX_scheduler.v @@ -8,6 +8,7 @@ module VX_scheduler #( VX_decode_if decode_if, VX_wb_if writeback_if, + VX_commit_is_if commit_is_if, input wire gpr_busy, input wire alu_busy, input wire lsu_busy, @@ -15,29 +16,44 @@ module VX_scheduler #( input wire mul_busy, input wire fpu_busy, input wire gpu_busy, + output wire [`ISTAG_BITS-1:0] issue_tag, output wire schedule_delay, output wire is_empty ); localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); - reg [`NUM_THREADS-1:0] rename_table [`NUM_WARPS-1:0][(`NUM_REGS*2)-1:0]; - reg busy_table [`NUM_WARPS-1:0][(`NUM_REGS*2)-1:0]; - reg [CTVW-1:0] count_valid; + `ifdef EXT_F_ENABLE + localparam NREGS = (`NUM_REGS * 2); + reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0]; + wire [`NR_BITS:0] read_rs1 = {decode_if.rs1_is_fp, decode_if.rs1}; + wire [`NR_BITS:0] read_rs2 = {decode_if.rs2_is_fp, decode_if.rs2}; + wire [`NR_BITS:0] read_rs3 = {1'b1, decode_if.rs3}; + wire [`NR_BITS:0] read_rd = {decode_if.rd_is_fp, decode_if.rd}; + wire [`NR_BITS:0] write_rd = {writeback_if.rd_is_fp, writeback_if.rd}; + wire rs3_inuse = inuse_table[decode_if.warp_num][read_rs3]; + `else + localparam NREGS = `NUM_REGS; + reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0]; + wire [`NR_BITS-1:0] read_rs1 = decode_if.rs1; + wire [`NR_BITS-1:0] read_rs2 = decode_if.rs2; + wire [`NR_BITS-1:0] read_rd = decode_if.rd; + wire [`NR_BITS-1:0] write_rd = writeback_if.rd; + wire rs3_inuse = 0; + `endif - reg [`NR_BITS:0] read_rd = {decode_if.rd_is_fp, decode_if.rd}; - reg [`NR_BITS:0] write_rd = {writeback_if.rd_is_fp, writeback_if.rd}; + reg [`NUM_THREADS-1:0] inuse_registers [`NUM_WARPS-1:0][NREGS-1:0]; + reg [CTVW-1:0] count_valid; - wire rs1_rename = busy_table[decode_if.warp_num][{decode_if.rs1_is_fp, decode_if.rs1}]; - wire rs2_rename = busy_table[decode_if.warp_num][{decode_if.rs1_is_fp, decode_if.rs2}]; - wire rs3_rename = busy_table[decode_if.warp_num][{1'b1, decode_if.rs3}]; - wire rd_rename = busy_table[decode_if.warp_num][read_rd]; + wire rs1_inuse = inuse_table[decode_if.warp_num][read_rs1]; + wire rs2_inuse = inuse_table[decode_if.warp_num][read_rs2]; + wire rd_inuse = inuse_table[decode_if.warp_num][read_rd]; - wire rs1_rename_qual = rs1_rename && decode_if.use_rs1; - wire rs2_rename_qual = rs2_rename && decode_if.use_rs2; - wire rs3_rename_qual = rs3_rename && decode_if.use_rs3; - wire rd_rename_qual = rd_rename && decode_if.wb; + wire rs1_inuse_qual = rs1_inuse && decode_if.use_rs1; + wire rs2_inuse_qual = rs2_inuse && decode_if.use_rs2; + wire rs3_inuse_qual = rs3_inuse && decode_if.use_rs3; + wire rd_inuse_qual = rd_inuse && decode_if.wb; - wire rename_valid = (rs1_rename_qual || rs2_rename_qual || rs3_rename_qual || rd_rename_qual); + wire rename_valid = (rs1_inuse_qual || rs2_inuse_qual || rs3_inuse_qual || rd_inuse_qual); wire ex_stalled = ((gpr_busy) || (alu_busy && (decode_if.ex_type == `EX_ALU)) @@ -47,41 +63,61 @@ module VX_scheduler #( || (fpu_busy && (decode_if.ex_type == `EX_FPU)) || (gpu_busy && (decode_if.ex_type == `EX_GPU))); - wire stall = (ex_stalled || rename_valid) && (| decode_if.valid); + wire iq_full; - wire acquire_rd = (| decode_if.valid) && (decode_if.wb != 0) && ~stall; + wire stall = (ex_stalled || rename_valid || iq_full) && decode_if.valid; + + wire acquire_rd = decode_if.valid && (decode_if.wb != 0) && ~stall; - wire release_rd = (| writeback_if.valid); + wire release_rd = writeback_if.valid; - wire [`NUM_THREADS-1:0] valid_wb_new_mask = rename_table[writeback_if.warp_num][write_rd] & ~writeback_if.valid; + wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[writeback_if.warp_num][write_rd] & ~writeback_if.thread_mask; - reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == valid_wb_new_mask))) ? (count_valid + 1) : - (~acquire_rd && (release_rd && (0 == valid_wb_new_mask))) ? (count_valid - 1) : + reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == inuse_registers_n))) ? (count_valid + 1) : + (~acquire_rd && (release_rd && (0 == inuse_registers_n))) ? (count_valid - 1) : count_valid; always @(posedge clk) begin if (reset) begin integer i, w; for (w = 0; w < `NUM_WARPS; w++) begin - for (i = 0; i < 32; i++) begin - rename_table[w][i] <= 0; - busy_table[w][i] <= 0; + for (i = 0; i < NREGS; i++) begin + inuse_registers[w][i] <= 0; + inuse_table[w][i] <= 0; end end count_valid <= 0; end else begin if (acquire_rd) begin - rename_table[decode_if.warp_num][read_rd] <= decode_if.valid; - busy_table[decode_if.warp_num][read_rd] <= 1; + inuse_registers[decode_if.warp_num][read_rd] <= decode_if.thread_mask; + inuse_table[decode_if.warp_num][read_rd] <= 1; end if (release_rd) begin - assert(rename_table[writeback_if.warp_num][write_rd] != 0); - rename_table[writeback_if.warp_num][write_rd] <= valid_wb_new_mask; - busy_table[writeback_if.warp_num][write_rd] <= (| valid_wb_new_mask); + assert(inuse_table[writeback_if.warp_num][write_rd] != 0); + inuse_registers[writeback_if.warp_num][write_rd] <= inuse_registers_n; + inuse_table[writeback_if.warp_num][write_rd] <= (| inuse_registers_n); end count_valid <= count_valid_next; end end + wire ib_acquire = decode_if.valid && ~stall; + + VX_cam_buffer #( + .DATAW ($bits(is_data_t)), + .SIZE (`ISSUEQ_SIZE), + .RPORTS (`NUM_EXS) + ) issue_buffer ( + .clk (clk), + .reset (reset), + .write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rd_is_fp, decode_if.wb}), + .write_addr (issue_tag), + .acquire_slot (ib_acquire), + .release_slot ({commit_is_if.alu_valid, commit_is_if.lsu_valid, commit_is_if.csr_valid, commit_is_if.mul_valid, commit_is_if.fpu_valid, commit_is_if.gpu_valid}), + .read_addr ({commit_is_if.alu_tag, commit_is_if.lsu_tag, commit_is_if.csr_tag, commit_is_if.mul_tag, commit_is_if.fpu_tag, commit_is_if.gpu_tag}), + .read_data ({commit_is_if.alu_data, commit_is_if.lsu_data, commit_is_if.csr_data, commit_is_if.mul_data, commit_is_if.fpu_data, commit_is_if.gpu_data}), + .full (iq_full) + ); + assign decode_if.ready = ~stall; assign schedule_delay = stall; @@ -91,7 +127,7 @@ module VX_scheduler #( `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (stall) begin - $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, rename=%b%b%b, alu=%b, lsu=%b, csr=%b, mul=%b, fpu=%b, gpu=%b", $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, rd_rename_qual, rs1_rename_qual, rs2_rename_qual, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy); + $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, iq_full=%b, inuse=%b%b%b%b, alu=%b, lsu=%b, csr=%b, mul=%b, fpu=%b, gpu=%b", $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, iq_full, rd_inuse_qual, rs1_inuse_qual, rs2_inuse_qual, rs3_inuse_qual, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy); end end `endif diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 571244d8..00e44439 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -97,9 +97,9 @@ module VX_warp_sched #( end else begin if (warp_ctl_if.wspawn) begin - warp_active <= warp_ctl_if.wspawn_new_active; + warp_active <= warp_ctl_if.wspawn_wmask; + use_wspawn <= warp_ctl_if.wspawn_wmask & (~`NUM_WARPS'(1)); use_wspawn_pc <= warp_ctl_if.wspawn_pc; - use_wspawn <= warp_ctl_if.wspawn_new_active & (~`NUM_WARPS'(1)); end if (warp_ctl_if.is_barrier) begin @@ -112,6 +112,10 @@ module VX_warp_sched #( end else if (warp_ctl_if.change_mask) begin thread_masks[warp_ctl_if.warp_num] <= warp_ctl_if.thread_mask; warp_stalled[warp_ctl_if.warp_num] <= 0; + if (0 == warp_ctl_if.thread_mask) begin + warp_active[warp_ctl_if.warp_num] <= 0; + visible_active[warp_ctl_if.warp_num] <= 0; + end end else if (join_if.is_join && !didnt_split) begin if (!join_fall) begin warp_pcs[join_if.warp_num] <= join_pc; @@ -126,12 +130,7 @@ module VX_warp_sched #( end else begin didnt_split <= 1; end - end - - if (warp_ctl_if.whalt) begin - warp_active[warp_ctl_if.warp_num] <= 0; - visible_active[warp_ctl_if.warp_num] <= 0; - end + end if (update_use_wspawn) begin use_wspawn[warp_to_schedule] <= 0; @@ -167,7 +166,7 @@ module VX_warp_sched #( if (scheduled_warp && !stall) begin warp_lock[warp_num] <= 1; end - if ((| ifetch_rsp_if.valid) && ifetch_rsp_if.ready) begin + if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin warp_lock[ifetch_rsp_if.warp_num] <= 0; end @@ -192,7 +191,7 @@ module VX_warp_sched #( assign b_mask = barrier_stall_mask[warp_ctl_if.barrier_id][`NUM_WARPS-1:0]; - assign reached_barrier_limit = (b_count == warp_ctl_if.num_warps); + assign reached_barrier_limit = (b_count == warp_ctl_if.barrier_num_warps); assign wstall_this_cycle = wstall_if.wstall && (wstall_if.warp_num == warp_to_schedule); // Maybe bug @@ -263,17 +262,17 @@ module VX_warp_sched #( `UNUSED_PIN (grant_onehot) ); - assign stall = ~ifetch_req_if.ready && (| ifetch_req_if.valid); + assign stall = ~ifetch_req_if.ready && ifetch_req_if.valid; VX_generic_register #( - .N(`NUM_THREADS + 32 + `NW_BITS) + .N(1 + `NUM_THREADS + 32 + `NW_BITS) ) fetch_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({thread_mask, warp_pc, warp_num}), - .out ({ifetch_req_if.valid, ifetch_req_if.curr_PC, ifetch_req_if.warp_num}) + .in ({(| thread_mask), thread_mask, warp_pc, warp_num}), + .out ({ifetch_req_if.valid, ifetch_req_if.thread_mask, ifetch_req_if.curr_PC, ifetch_req_if.warp_num}) ); assign busy = (warp_active != 0); diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 3506e33a..33bc230f 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -10,79 +10,87 @@ module VX_writeback #( VX_commit_if alu_commit_if, VX_commit_if lsu_commit_if, VX_commit_if mul_commit_if, - VX_commit_fp_if fpu_commit_if, + VX_commit_if fpu_commit_if, VX_commit_if csr_commit_if, + VX_commit_is_if commit_is_if, // outputs VX_wb_if writeback_if ); - wire alu_valid = (| alu_commit_if.valid) && alu_commit_if.wb; - wire lsu_valid = (| lsu_commit_if.valid) && lsu_commit_if.wb; - wire csr_valid = (| csr_commit_if.valid) && csr_commit_if.wb; - wire mul_valid = (| mul_commit_if.valid) && mul_commit_if.wb; - wire fpu_valid = (| fpu_commit_if.valid) && fpu_commit_if.wb; + wire alu_valid = alu_commit_if.valid && commit_is_if.alu_data.wb; + wire lsu_valid = lsu_commit_if.valid && commit_is_if.lsu_data.wb; + wire csr_valid = csr_commit_if.valid && commit_is_if.csr_data.wb; + wire mul_valid = mul_commit_if.valid && commit_is_if.mul_data.wb; + wire fpu_valid = fpu_commit_if.valid && commit_is_if.fpu_data.wb; VX_wb_if writeback_tmp_if(); - assign writeback_tmp_if.valid = lsu_valid ? lsu_commit_if.valid : - fpu_valid ? fpu_commit_if.valid : - mul_valid ? mul_commit_if.valid : - alu_valid ? alu_commit_if.valid : - csr_valid ? csr_commit_if.valid : + assign writeback_tmp_if.valid = alu_valid ? alu_commit_if.valid : + lsu_valid ? lsu_commit_if.valid : + csr_valid ? csr_commit_if.valid : + mul_valid ? mul_commit_if.valid : + fpu_valid ? fpu_commit_if.valid : 0; - assign writeback_tmp_if.warp_num = lsu_valid ? lsu_commit_if.warp_num : - fpu_valid ? fpu_commit_if.warp_num : - mul_valid ? mul_commit_if.warp_num : - alu_valid ? alu_commit_if.warp_num : - csr_valid ? csr_commit_if.warp_num : + assign writeback_tmp_if.warp_num = alu_valid ? commit_is_if.alu_data.warp_num : + lsu_valid ? commit_is_if.lsu_data.warp_num : + csr_valid ? commit_is_if.csr_data.warp_num : + mul_valid ? commit_is_if.mul_data.warp_num : + fpu_valid ? commit_is_if.fpu_data.warp_num : + 0; + + assign writeback_tmp_if.thread_mask = alu_valid ? commit_is_if.alu_data.thread_mask : + lsu_valid ? commit_is_if.lsu_data.thread_mask : + csr_valid ? commit_is_if.csr_data.thread_mask : + mul_valid ? commit_is_if.mul_data.thread_mask : + fpu_valid ? commit_is_if.fpu_data.thread_mask : 0; - assign writeback_tmp_if.rd = lsu_valid ? lsu_commit_if.rd : - fpu_valid ? fpu_commit_if.rd : - mul_valid ? mul_commit_if.rd : - alu_valid ? alu_commit_if.rd : - csr_valid ? csr_commit_if.rd : + assign writeback_tmp_if.rd = alu_valid ? commit_is_if.alu_data.rd : + lsu_valid ? commit_is_if.lsu_data.rd : + csr_valid ? commit_is_if.csr_data.rd : + mul_valid ? commit_is_if.mul_data.rd : + fpu_valid ? commit_is_if.fpu_data.rd : 0; - assign writeback_tmp_if.rd_is_fp = lsu_valid ? 0 : - fpu_valid ? fpu_commit_if.rd_is_fp : - mul_valid ? 0 : - alu_valid ? 0 : + assign writeback_tmp_if.rd_is_fp = alu_valid ? 0 : + lsu_valid ? 0 : csr_valid ? 0 : + mul_valid ? 0 : + fpu_valid ? commit_is_if.fpu_data.rd_is_fp : 0; - assign writeback_tmp_if.data = lsu_valid ? lsu_commit_if.data : - fpu_valid ? fpu_commit_if.data : - mul_valid ? mul_commit_if.data : - alu_valid ? alu_commit_if.data : - csr_valid ? csr_commit_if.data : + assign writeback_tmp_if.data = alu_valid ? alu_commit_if.data : + lsu_valid ? lsu_commit_if.data : + csr_valid ? csr_commit_if.data : + mul_valid ? mul_commit_if.data : + fpu_valid ? fpu_commit_if.data : 0; - wire stall = ~writeback_if.ready && (| writeback_if.valid); + wire stall = ~writeback_if.ready && writeback_if.valid; VX_generic_register #( - .N(`NUM_THREADS + `NW_BITS + `NR_BITS + (`NUM_THREADS * 32) + 1) + .N(1 + `NW_BITS + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1) ) wb_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.rd, writeback_tmp_if.rd_is_fp, writeback_tmp_if.data}), - .out ({writeback_if.valid, writeback_if.warp_num, writeback_if.rd, writeback_if.rd_is_fp, writeback_if.data}) + .in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.thread_mask, writeback_tmp_if.rd, writeback_tmp_if.rd_is_fp, writeback_tmp_if.data}), + .out ({writeback_if.valid, writeback_if.warp_num, writeback_if.thread_mask, writeback_if.rd, writeback_if.rd_is_fp, writeback_if.data}) ); - assign lsu_commit_if.ready = !stall; - assign fpu_commit_if.ready = !stall && !lsu_valid; - assign mul_commit_if.ready = !stall && !lsu_valid && !fpu_valid; - assign alu_commit_if.ready = !stall && !lsu_valid && !fpu_valid && !mul_valid; - assign csr_commit_if.ready = !stall && !lsu_valid && !fpu_valid && !mul_valid && !alu_valid; + assign alu_commit_if.ready = !stall; + assign lsu_commit_if.ready = !stall && !alu_valid; + assign csr_commit_if.ready = !stall && !alu_valid && !lsu_valid; + assign mul_commit_if.ready = !stall && !alu_valid && !lsu_valid && !csr_valid; + assign fpu_commit_if.ready = !stall && !alu_valid && !lsu_valid && !csr_valid && !mul_valid; // special workaround to control RISC-V benchmarks termination on Verilator reg [31:0] last_data_wb /* verilator public */; always @(posedge clk) begin - if ((| writeback_tmp_if.valid) && ~stall && (writeback_tmp_if.rd == 28)) begin + if (writeback_tmp_if.valid && ~stall && (writeback_tmp_if.rd == 28)) begin last_data_wb <= writeback_tmp_if.data[0]; end end diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index c99b6fb4..f0692c5b 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -513,8 +513,8 @@ module VX_bank #( .reset (reset), .stall (stall_bank_pipe), .flush (0), - .in ({mrvq_recover_ready_state_st1e, is_mrvq_st1e_st2, mrvq_init_ready_state_st1e , snp_to_mrvq_st1e, is_snp_st1e, snp_invalidate_st1e, fill_saw_dirty_st1e, is_fill_st1[STAGE_1_CYCLES-1] , qual_valid_st1e_2, addr_st1e, wsel_st1[STAGE_1_CYCLES-1], writeword_st1[STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, dirtyb_st1e, inst_meta_st1[STAGE_1_CYCLES-1]}), - .out ({mrvq_recover_ready_state_st2 , is_mrvq_st2 , mrvq_init_ready_state_unqual_st2, snp_to_mrvq_st2 , is_snp_st2 , snp_invalidate_st2, fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , wsel_st2, writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , dirtyb_st2, inst_meta_st2 }) + .in ({mrvq_recover_ready_state_st1e, is_mrvq_st1e_st2, mrvq_init_ready_state_st1e, snp_to_mrvq_st1e, is_snp_st1e, snp_invalidate_st1e, fill_saw_dirty_st1e, is_fill_st1[STAGE_1_CYCLES-1], qual_valid_st1e_2, addr_st1e, wsel_st1[STAGE_1_CYCLES-1], writeword_st1[STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, dirtyb_st1e, inst_meta_st1[STAGE_1_CYCLES-1]}), + .out ({mrvq_recover_ready_state_st2 , is_mrvq_st2 , mrvq_init_ready_state_unqual_st2, snp_to_mrvq_st2 , is_snp_st2 , snp_invalidate_st2, fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2, wsel_st2, writeword_st2, readword_st2, readdata_st2, readtag_st2, miss_st2, dirty_st2, dirtyb_st2, inst_meta_st2}) ); `ifdef DBG_CORE_REQ_INFO @@ -587,7 +587,7 @@ module VX_bank #( // Broadcast .is_fill_st1 (is_fill_st1[STAGE_1_CYCLES-1]), .fill_addr_st1 (addr_st1e), - .pending_hazard (mrvq_pending_hazard_st1e), + .pending_hazard_st1 (mrvq_pending_hazard_st1e), // Dequeue .miss_resrv_pop (mrvq_pop), diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 2f1f3813..86db8357 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -133,7 +133,7 @@ module VX_cache #( wire debug_core_req_wb; wire[`NR_BITS-1:0] debug_core_req_rd; wire[`NW_BITS-1:0] debug_core_req_warp_num; - wire[`LOG2UP(CREQ_SIZE)-1:0] debug_core_req_idx; + wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_core_req_idx; /* verilator lint_on UNUSED */ if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index b1f7a7e5..f7bed651 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -41,7 +41,7 @@ module VX_cache_miss_resrv #( input wire is_fill_st1, input wire[`LINE_ADDR_WIDTH-1:0] fill_addr_st1, - output wire pending_hazard, + output wire pending_hazard_st1, // Miss dequeue input wire miss_resrv_pop, @@ -84,7 +84,7 @@ module VX_cache_miss_resrv #( assign make_ready[i] = is_fill_st1 && valid_address_match[i]; end - assign pending_hazard = |(valid_address_match); + assign pending_hazard_st1 = |(valid_address_match); wire dequeue_possible = valid_table[schedule_ptr] && ready_table[schedule_ptr]; wire [`LOG2UP(MRVQ_SIZE)-1:0] dequeue_index = schedule_ptr; diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index 0856ff0d..11d3266e 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -41,8 +41,8 @@ module VX_snp_forwarder #( reg [`REQS_BITS:0] pending_cntrs [SNRQ_SIZE-1:0]; - wire [`LOG2UP(SNRQ_SIZE)-1:0] sfq_write_addr, sfq_read_addr, dbg_sfq_write_addr; - wire sfq_push, sfq_pop, sfq_full; + wire [`LOG2UP(SNRQ_SIZE)-1:0] sfq_write_addr, sfq_read_addr; + wire sfq_acquire, sfq_release, sfq_full; wire fwdin_valid; wire [`LOG2UP(SNRQ_SIZE)-1:0] fwdin_tag; @@ -56,27 +56,26 @@ module VX_snp_forwarder #( assign sfq_read_addr = fwdin_tag; - assign sfq_push = snp_req_valid && !sfq_full && fwdout_ready; - assign sfq_pop = snp_rsp_valid; + assign sfq_acquire = snp_req_valid && !sfq_full && fwdout_ready; + assign sfq_release = snp_rsp_valid; - VX_index_queue #( - .DATAW (`LOG2UP(SNRQ_SIZE) + 1 +`DRAM_ADDR_WIDTH+SNP_REQ_TAG_WIDTH), + VX_cam_buffer #( + .DATAW (`DRAM_ADDR_WIDTH + 1 + SNP_REQ_TAG_WIDTH), .SIZE (SNRQ_SIZE) - ) snp_fwd_queue ( - .clk (clk), - .reset (reset), - .write_data ({snp_req_addr, snp_req_invalidate, snp_req_tag}), - .write_addr (sfq_write_addr), - .push (sfq_push), - .pop (sfq_pop), - .full (sfq_full), - .read_addr (sfq_read_addr), - .read_data ({snp_rsp_addr, snp_rsp_invalidate, snp_rsp_tag}), - `UNUSED_PIN (empty) + ) snp_fwd_buffer ( + .clk (clk), + .reset (reset), + .write_data ({snp_req_addr, snp_req_invalidate, snp_req_tag}), + .write_addr (sfq_write_addr), + .acquire_slot (sfq_acquire), + .release_slot (sfq_release), + .read_addr (sfq_read_addr), + .read_data ({snp_rsp_addr, snp_rsp_invalidate, snp_rsp_tag}), + .full (sfq_full) ); always @(posedge clk) begin - if (sfq_push) begin + if (sfq_acquire) begin pending_cntrs[sfq_write_addr] <= NUM_REQUESTS; end if (fwdin_fire) begin diff --git a/hw/rtl/cache/VX_tag_data_access.v b/hw/rtl/cache/VX_tag_data_access.v index 4009ac4e..d67837be 100644 --- a/hw/rtl/cache/VX_tag_data_access.v +++ b/hw/rtl/cache/VX_tag_data_access.v @@ -2,25 +2,26 @@ module VX_tag_data_access #( // Size of cache in bytes - parameter CACHE_SIZE = 0, + parameter CACHE_SIZE = 0, // Size of line inside a bank in bytes - parameter BANK_LINE_SIZE = 0, + parameter BANK_LINE_SIZE = 0, // Number of banks {1, 2, 4, 8,...} - parameter NUM_BANKS = 0, + parameter NUM_BANKS = 0, // Size of a word in bytes - parameter WORD_SIZE = 0, + parameter WORD_SIZE = 0, // Number of cycles to complete stage 1 (read from memory) - parameter STAGE_1_CYCLES = 0, + parameter STAGE_1_CYCLES = 0, // Enable cache writeable - parameter WRITE_ENABLE = 0, + parameter WRITE_ENABLE = 0, // Enable dram update - parameter DRAM_ENABLE = 0 + parameter DRAM_ENABLE = 0 ) ( input wire clk, input wire reset, + input wire stall, input wire is_snp_st1e, input wire snp_invalidate_st1e, @@ -78,17 +79,17 @@ module VX_tag_data_access #( wire tags_match; wire real_writefill = valid_req_st1e && writefill_st1e - && ((!use_read_valid_st1e) || (use_read_valid_st1e && !tags_match)); + && ((~use_read_valid_st1e) || (use_read_valid_st1e && ~tags_match)); wire[`TAG_SELECT_BITS-1:0] writetag_st1e = writeaddr_st1e[`TAG_LINE_ADDR_RNG]; wire[`LINE_SELECT_BITS-1:0] writeladdr_st1e = writeaddr_st1e[`LINE_SELECT_BITS-1:0]; - VX_tag_data_structure #( + VX_tag_data_store #( .CACHE_SIZE (CACHE_SIZE), .BANK_LINE_SIZE (BANK_LINE_SIZE), .NUM_BANKS (NUM_BANKS), .WORD_SIZE (WORD_SIZE) - ) tag_data_structure ( + ) tag_data_store ( .clk (clk), .reset (reset), .stall_bank_pipe(stall_bank_pipe), @@ -141,10 +142,12 @@ module VX_tag_data_access #( assign use_read_dirtyb_st1e= read_dirtyb_st1c[STAGE_1_CYCLES-1]; assign use_read_data_st1e = read_data_st1c[STAGE_1_CYCLES-1]; - if (`WORD_SELECT_WIDTH != 0) begin - assign readword_st1e = use_read_data_st1e[wordsel_st1e * `WORD_WIDTH +: `WORD_WIDTH]; - end else begin - assign readword_st1e = use_read_data_st1e; + for (i = 0; i < WORD_SIZE; i++) begin + if (`WORD_SELECT_WIDTH != 0) begin + assign readword_st1e[i * 8 +: 8] = use_read_data_st1e[wordsel_st1e * `WORD_WIDTH +: `WORD_WIDTH][i * 8 +: 8] & {8{mem_byteen_st1e[i]}}; + end else begin + assign readword_st1e[i * 8 +: 8] = use_read_data_st1e[i * 8 +: 8] & {8{mem_byteen_st1e[i]}}; + end end wire [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0] we; @@ -153,9 +156,9 @@ module VX_tag_data_access #( wire should_write = mem_rw_st1e && valid_req_st1e && use_read_valid_st1e - && !miss_st1e - && !is_snp_st1e - && !real_writefill; + && ~miss_st1e + && ~is_snp_st1e + && ~real_writefill; for (i = 0; i < `BANK_LINE_WORDS; i++) begin wire normal_write = ((`WORD_SELECT_WIDTH == 0) || (wordsel_st1e == `UP(`WORD_SELECT_WIDTH)'(i))) @@ -168,22 +171,22 @@ module VX_tag_data_access #( assign data_write[i * `WORD_WIDTH +: `WORD_WIDTH] = real_writefill ? writedata_st1e[i * `WORD_WIDTH +: `WORD_WIDTH] : writeword_st1e; end - assign use_write_enable = (writefill_st1e && !real_writefill) ? 0 : we; + assign use_write_enable = (writefill_st1e && ~real_writefill) ? 0 : we; assign use_write_data = data_write; // use "case equality" to handle uninitialized tag when block entry is not valid assign tags_match = (writetag_st1e === use_read_tag_st1e); - wire snoop_hit_no_pending = valid_req_st1e && is_snp_st1e && use_read_valid_st1e && tags_match && (use_read_dirty_st1e || snp_invalidate_st1e) && !force_request_miss_st1e; - wire req_invalid = valid_req_st1e && !is_snp_st1e && !use_read_valid_st1e && !writefill_st1e; - wire req_miss = valid_req_st1e && !is_snp_st1e && use_read_valid_st1e && !writefill_st1e && !tags_match; + wire snoop_hit_no_pending = valid_req_st1e && is_snp_st1e && use_read_valid_st1e && tags_match && (use_read_dirty_st1e || snp_invalidate_st1e) && ~force_request_miss_st1e; + wire req_invalid = valid_req_st1e && ~is_snp_st1e && ~use_read_valid_st1e && ~writefill_st1e; + wire req_miss = valid_req_st1e && ~is_snp_st1e && use_read_valid_st1e && ~writefill_st1e && ~tags_match; wire real_miss = req_invalid || req_miss; - wire force_core_miss = (force_request_miss_st1e && !is_snp_st1e && !writefill_st1e && valid_req_st1e && !real_miss); + wire force_core_miss = (force_request_miss_st1e && ~is_snp_st1e && ~writefill_st1e && valid_req_st1e && ~real_miss); assign snp_to_mrvq_st1e = valid_req_st1e && is_snp_st1e && force_request_miss_st1e; // The second term is basically saying always make an entry ready if there's already antoher entry waiting, even if you yourself see a miss assign mrvq_init_ready_state_st1e = snp_to_mrvq_st1e - || (force_request_miss_st1e && !is_snp_st1e && !writefill_st1e && valid_req_st1e); + || (force_request_miss_st1e && ~is_snp_st1e && ~writefill_st1e && valid_req_st1e); assign miss_st1e = real_miss || snoop_hit_no_pending || force_core_miss; assign dirty_st1e = valid_req_st1e && use_read_valid_st1e && use_read_dirty_st1e; @@ -194,7 +197,4 @@ module VX_tag_data_access #( assign fill_saw_dirty_st1e = real_writefill && dirty_st1e; assign invalidate_line = snoop_hit_no_pending; -endmodule - - - +endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_tag_data_structure.v b/hw/rtl/cache/VX_tag_data_store.v similarity index 99% rename from hw/rtl/cache/VX_tag_data_structure.v rename to hw/rtl/cache/VX_tag_data_store.v index 1d32f37d..72a62a7b 100644 --- a/hw/rtl/cache/VX_tag_data_structure.v +++ b/hw/rtl/cache/VX_tag_data_store.v @@ -1,6 +1,6 @@ `include "VX_cache_config.vh" -module VX_tag_data_structure #( +module VX_tag_data_store #( // Size of cache in bytes parameter CACHE_SIZE = 0, // Size of line inside a bank in bytes diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v index 98dca661..901a2754 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.v +++ b/hw/rtl/interfaces/VX_alu_req_if.v @@ -5,18 +5,17 @@ interface VX_alu_req_if (); - wire [`NUM_THREADS-1:0] valid; + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NUM_THREADS-1:0] thread_mask; wire [`NW_BITS-1:0] warp_num; wire [31:0] curr_PC; - - wire [`ALU_BITS-1:0] alu_op; - wire wb; - wire [`NR_BITS-1:0] rd; + wire [`ALU_BITS-1:0] alu_op; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; - + wire [31:0] offset; wire [31:0] next_PC; diff --git a/hw/rtl/interfaces/VX_commit_if.v b/hw/rtl/interfaces/VX_commit_if.v index 9bf12884..c0c60df8 100644 --- a/hw/rtl/interfaces/VX_commit_if.v +++ b/hw/rtl/interfaces/VX_commit_if.v @@ -5,12 +5,9 @@ interface VX_commit_if (); - wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; - wire [`NUM_THREADS-1:0][31:0] data; - wire [`NR_BITS-1:0] rd; - wire wb; + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NUM_THREADS-1:0][31:0] data; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_commit_is_if.v b/hw/rtl/interfaces/VX_commit_is_if.v new file mode 100644 index 00000000..da6aea23 --- /dev/null +++ b/hw/rtl/interfaces/VX_commit_is_if.v @@ -0,0 +1,43 @@ +`ifndef VX_COMMIT_IS_IF +`define VX_COMMIT_IS_IF + +`include "VX_define.vh" + +typedef struct packed { + logic [`NW_BITS-1:0] warp_num; + logic [`NUM_THREADS-1:0] thread_mask; + logic [31:0] curr_PC; + logic [`NR_BITS-1:0] rd; + logic rd_is_fp; + logic wb; +} is_data_t; + + +interface VX_commit_is_if (); + + wire alu_valid; + wire lsu_valid; + wire csr_valid; + wire mul_valid; + wire fpu_valid; + wire gpu_valid; + + wire [`ISTAG_BITS-1:0] alu_tag; + wire [`ISTAG_BITS-1:0] lsu_tag; + wire [`ISTAG_BITS-1:0] csr_tag; + wire [`ISTAG_BITS-1:0] mul_tag; + wire [`ISTAG_BITS-1:0] fpu_tag; + wire [`ISTAG_BITS-1:0] gpu_tag; + +`IGNORE_WARNINGS_BEGIN + is_data_t alu_data; + is_data_t lsu_data; + is_data_t csr_data; + is_data_t mul_data; + is_data_t fpu_data; + is_data_t gpu_data; +`IGNORE_WARNINGS_END + +endinterface + +`endif diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index e585ad5c..92c5b82a 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -5,7 +5,8 @@ interface VX_csr_req_if (); - wire [`NUM_THREADS-1:0] valid; + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; wire [`NW_BITS-1:0] warp_num; wire [31:0] curr_PC; diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index ee7b4bf1..3b25581f 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -5,8 +5,9 @@ interface VX_decode_if (); - wire [`NUM_THREADS-1:0] valid; + wire valid; wire [`NW_BITS-1:0] warp_num; + wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; wire [31:0] next_PC; diff --git a/hw/rtl/interfaces/VX_fpu_from_csr_if.v b/hw/rtl/interfaces/VX_fpu_from_csr_if.v index 508d1a94..096b576c 100644 --- a/hw/rtl/interfaces/VX_fpu_from_csr_if.v +++ b/hw/rtl/interfaces/VX_fpu_from_csr_if.v @@ -3,6 +3,10 @@ `include "VX_define.vh" +`ifndef EXTF_F_ENABLE + `IGNORE_WARNINGS_BEGIN +`endif + interface VX_fpu_from_csr_if (); wire [`NW_BITS-1:0] warp_num; diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v index 4d29fa8b..dd03f271 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.v +++ b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -3,19 +3,19 @@ `include "VX_define.vh" +`ifndef EXTF_F_ENABLE + `IGNORE_WARNINGS_BEGIN +`endif + interface VX_fpu_req_if (); - wire [`NUM_THREADS-1:0] valid; + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; wire [`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; wire [`FPU_BITS-1:0] fpu_op; wire [`FRM_BITS-1:0] frm; - wire wb; - wire [`NR_BITS-1:0] rd; - wire rd_is_fp; - wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs3_data; diff --git a/hw/rtl/interfaces/VX_fpu_to_csr_if.v b/hw/rtl/interfaces/VX_fpu_to_csr_if.v index b4471632..71e1e7b9 100644 --- a/hw/rtl/interfaces/VX_fpu_to_csr_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_csr_if.v @@ -3,6 +3,10 @@ `include "VX_define.vh" +`ifndef EXTF_F_ENABLE + `IGNORE_WARNINGS_BEGIN +`endif + interface VX_fpu_to_csr_if (); wire valid; diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 468fc073..38f36ab0 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -5,17 +5,18 @@ interface VX_gpu_req_if(); - wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NUM_THREADS-1:0] thread_mask; + wire [`NW_BITS-1:0] warp_num; - wire [`GPU_BITS-1:0] gpu_op; + wire [`GPU_BITS-1:0] gpu_op; wire [`NUM_THREADS-1:0][31:0] rs1_data; - wire [31:0] rs2_data; - wire [31:0] next_PC; + wire [31:0] rs2_data; + wire [31:0] next_PC; - wire ready; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.v b/hw/rtl/interfaces/VX_ifetch_req_if.v index 82b31b6c..c92888eb 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.v +++ b/hw/rtl/interfaces/VX_ifetch_req_if.v @@ -5,10 +5,11 @@ interface VX_ifetch_req_if (); - wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; - wire ready; + wire valid; + wire [`NUM_THREADS-1:0] thread_mask; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.v b/hw/rtl/interfaces/VX_ifetch_rsp_if.v index 0bc482c2..b5efc4fc 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.v +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.v @@ -5,11 +5,12 @@ interface VX_ifetch_rsp_if (); - wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; - wire [31:0] instr; - wire ready; + wire valid; + wire [`NUM_THREADS-1:0] thread_mask; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; + wire [31:0] instr; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index a4e0aeed..b4b80598 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -5,20 +5,22 @@ interface VX_lsu_req_if (); - wire [`NUM_THREADS-1:0] valid; + wire valid; + wire [`NUM_THREADS-1:0] thread_mask; + wire [`ISTAG_BITS-1:0] issue_tag; wire [`NW_BITS-1:0] warp_num; wire [31:0] curr_PC; wire rw; wire [`BYTEEN_BITS-1:0] byteen; - - wire wb; - wire [`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] store_data; wire [`NUM_THREADS-1:0][31:0] base_addr; wire [31:0] offset; - + + wire [`NR_BITS-1:0] rd; + wire wb; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_mul_req_if.v b/hw/rtl/interfaces/VX_mul_req_if.v index 708dba86..428edd94 100644 --- a/hw/rtl/interfaces/VX_mul_req_if.v +++ b/hw/rtl/interfaces/VX_mul_req_if.v @@ -3,21 +3,21 @@ `include "VX_define.vh" +`ifndef EXT_M_ENABLE + `IGNORE_WARNINGS_BEGIN +`endif + interface VX_mul_req_if (); - wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; - wire [`MUL_BITS-1:0] mul_op; - - wire wb; - wire [`NR_BITS-1:0] rd; + wire [`MUL_BITS-1:0] mul_op; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; - wire ready; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_warp_ctl_if.v b/hw/rtl/interfaces/VX_warp_ctl_if.v index 7eef29f0..50eca6b0 100644 --- a/hw/rtl/interfaces/VX_warp_ctl_if.v +++ b/hw/rtl/interfaces/VX_warp_ctl_if.v @@ -12,17 +12,14 @@ interface VX_warp_ctl_if (); wire wspawn; wire [31:0] wspawn_pc; - wire [`NUM_WARPS-1:0] wspawn_new_active; - - wire whalt; + wire [`NUM_WARPS-1:0] wspawn_wmask; wire is_barrier; wire [`NB_BITS-1:0] barrier_id; - wire [`NW_BITS:0] num_warps; + wire [`NW_BITS:0] barrier_num_warps; wire is_split; wire do_split; - wire [`NUM_THREADS-1:0] split_new_mask; wire [`NUM_THREADS-1:0] split_later_mask; wire [31:0] split_save_pc; diff --git a/hw/rtl/interfaces/VX_wb_if.v b/hw/rtl/interfaces/VX_wb_if.v index 968fec13..8baac2cd 100644 --- a/hw/rtl/interfaces/VX_wb_if.v +++ b/hw/rtl/interfaces/VX_wb_if.v @@ -5,7 +5,8 @@ interface VX_wb_if (); - wire [`NUM_THREADS-1:0] valid; + wire valid; + wire [`NUM_THREADS-1:0] thread_mask; wire [`NW_BITS-1:0] warp_num; wire [`NR_BITS-1:0] rd; wire rd_is_fp; diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_cam_buffer.v new file mode 100644 index 00000000..284ffa7d --- /dev/null +++ b/hw/rtl/libs/VX_cam_buffer.v @@ -0,0 +1,74 @@ +`include "VX_define.vh" + +module VX_cam_buffer #( + parameter DATAW = 1, + parameter SIZE = 1, + parameter RPORTS = 1, + parameter ADDRW = `LOG2UP(SIZE) +) ( + input wire clk, + input wire reset, + input wire [DATAW-1:0] write_data, + output wire [ADDRW-1:0] write_addr, + input wire acquire_slot, + input wire [RPORTS-1:0][ADDRW-1:0] read_addr, + output reg [RPORTS-1:0][DATAW-1:0] read_data, + input wire [RPORTS-1:0] release_slot, + output wire full +); + reg [DATAW-1:0] entries [SIZE-1:0]; + reg [SIZE-1:0] free_slots, free_slots_n; + reg [ADDRW-1:0] write_addr_r; + reg full_r; + + wire free_valid; + wire [ADDRW-1:0] free_index; + + VX_priority_encoder #( + .N(SIZE) + ) free_slots_encoder ( + .data_in (free_slots_n), + .data_out (free_index), + .valid_out (free_valid) + ); + + integer i; + + always @(*) begin + free_slots_n = free_slots; + if (acquire_slot) begin + free_slots_n[write_addr_r] = 0; + end + for (i = 0; i < RPORTS; i++) begin + if (release_slot[i]) begin + free_slots_n[read_addr[i]] = 1; + end + assign read_data[i] = entries[read_addr[i]]; + end + end + + always @(posedge clk) begin + if (reset) begin + free_slots <= {SIZE{1'b1}}; + full_r <= 1'b0; + write_addr_r <= ADDRW'(1'b0); + end else begin + if (acquire_slot) begin + assert(1 == free_slots[write_addr]); + entries[write_addr] <= write_data; + end + for (i = 0; i < RPORTS; i++) begin + if (release_slot[i]) begin + assert(0 == free_slots[read_addr[i]]); + end + end + free_slots <= free_slots_n; + write_addr_r <= free_index; + full_r <= ~free_valid; + end + end + + assign write_addr = write_addr_r; + assign full = full_r; + +endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_divide.v b/hw/rtl/libs/VX_divide.v index d815dc39..83418f33 100644 --- a/hw/rtl/libs/VX_divide.v +++ b/hw/rtl/libs/VX_divide.v @@ -12,6 +12,7 @@ module VX_divide #( input wire clk, input wire reset, + input wire clk_en, input wire [WIDTHN-1:0] numer, input wire [WIDTHD-1:0] denom, @@ -31,7 +32,7 @@ module VX_divide #( .quotient (quotient_unqual), .remain (remainder_unqual), .aclr (1'b0), - .clken (1'b1) + .clken (clk_en) ); defparam @@ -43,8 +44,8 @@ module VX_divide #( quartus_div.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE", quartus_div.lpm_pipeline = PIPELINE; - assign quotient = quotient_unqual[WIDTHQ-1:0]; - assign remainder = remainder_unqual[WIDTHR-1:0]; + assign quotient = quotient_unqual [WIDTHQ-1:0]; + assign remainder = remainder_unqual [WIDTHR-1:0]; `else @@ -82,8 +83,8 @@ module VX_divide #( end if (PIPELINE == 0) begin - assign quotient = quotient_unqual[WIDTHQ-1:0]; - assign remainder = remainder_unqual[WIDTHR-1:0]; + assign quotient = quotient_unqual [WIDTHQ-1:0]; + assign remainder = remainder_unqual [WIDTHR-1:0]; end else begin reg [WIDTHN-1:0] quotient_pipe [0:PIPELINE-1]; reg [WIDTHD-1:0] remainder_pipe [0:PIPELINE-1]; @@ -95,14 +96,14 @@ module VX_divide #( quotient_pipe[i] <= 0; remainder_pipe[i] <= 0; end - else begin + else if (clk_en) begin if (i == 0) begin - quotient_pipe[0] <= quotient_unqual; - remainder_pipe[0] <= remainder_unqual; + quotient_pipe[i] <= quotient_unqual; + remainder_pipe[i] <= remainder_unqual; end else begin quotient_pipe[i] <= quotient_pipe[i-1]; - remainder_pipe[i] <= remainder_pipe[i-1]; - end + remainder_pipe[i] <= remainder_pipe[i-1]; + end end end end diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index a666948b..fce614fe 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -1,9 +1,9 @@ `include "VX_define.vh" module VX_generic_queue #( - parameter DATAW = 1, - parameter SIZE = 16, - parameter BUFFERED_OUTPUT = 1 + parameter DATAW = 1, + parameter SIZE = 16, + parameter BUFFERED = 1 ) ( input wire clk, input wire reset, @@ -58,7 +58,7 @@ module VX_generic_queue #( reg [DATAW-1:0] data [SIZE-1:0]; `endif - if (0 == BUFFERED_OUTPUT) begin + if (0 == BUFFERED) begin reg [`LOG2UP(SIZE):0] rd_ptr_r; reg [`LOG2UP(SIZE):0] wr_ptr_r; diff --git a/hw/rtl/libs/VX_mult.v b/hw/rtl/libs/VX_mult.v deleted file mode 100644 index 1dd77aea..00000000 --- a/hw/rtl/libs/VX_mult.v +++ /dev/null @@ -1,75 +0,0 @@ -`include "VX_define.vh" - -module VX_mult #( - parameter WIDTHA = 1, - parameter WIDTHB = 1, - parameter WIDTHP = 1, - parameter SIGNED = 0, - parameter PIPELINE = 0 -) ( - input wire clk, - input wire reset, - - input wire [WIDTHA-1:0] dataa, - input wire [WIDTHB-1:0] datab, - output wire [WIDTHP-1:0] result -); - -`ifdef QUARTUS - - lpm_mult quartus_mult ( - .clock (clk), - .dataa (dataa), - .datab (datab), - .result (result), - .sclr (reset), - .aclr (1'b0), - .clken (1'b1), - .sum (1'b0) - ); - - defparam quartus_mult.lpm_type = "LPM_MULT", - quartus_mult.lpm_widtha = WIDTHA, - quartus_mult.lpm_widthb = WIDTHB, - quartus_mult.lpm_widthp = WIDTHP, - quartus_mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED", - quartus_mult.lpm_pipeline = PIPELINE, - quartus_mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9"; -`else - - wire [WIDTHP-1:0] result_unqual; - - if (SIGNED) begin - assign result_unqual = $signed(dataa) * $signed(datab); - end else begin - assign result_unqual = dataa * datab; - end - - if (PIPELINE == 0) begin - assign result = result_unqual; - end else begin - - reg [WIDTHP-1:0] result_pipe [0:PIPELINE-1]; - - genvar i; - for (i = 0; i < PIPELINE; i++) begin - always @(posedge clk) begin - if (reset) begin - result_pipe[i] <= 0; - end - else begin - if (i == 0) begin - result_pipe[0] <= result_unqual; - end else begin - result_pipe[i] <= result_pipe[i-1]; - end - end - end - end - - assign result = result_pipe[PIPELINE-1]; - end - -`endif - -endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_priority_encoder.v b/hw/rtl/libs/VX_priority_encoder.v index 24c91724..a62273a7 100644 --- a/hw/rtl/libs/VX_priority_encoder.v +++ b/hw/rtl/libs/VX_priority_encoder.v @@ -8,6 +8,7 @@ module VX_priority_encoder #( output reg valid_out ); integer i; + always @(*) begin data_out = 0; valid_out = 0; diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index c24bea0a..327d5aa4 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -15,7 +15,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_FLAGS += $(DBG_PRINT_FLAGS) -DBG_FLAGS += -DDBG_CORE_REQ_INFO +#DBG_FLAGS += -DDBG_CORE_REQ_INFO INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/fp_cores -I../rtl/simulate @@ -35,7 +35,6 @@ VF += -cc Vortex.v -top-module Vortex VF += verilator.vlt DBG += -DVCD_OUTPUT $(DBG_FLAGS) -DBG += -DDBG_CORE_REQ_INFO THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 3f9f240e..4ea6866a 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -11,7 +11,7 @@ double sc_time_stamp() { Simulator::Simulator() { // force random values for unitialized signals - Verilated::randReset(1); + Verilated::randReset(2); // Turn off assertion before reset Verilated::assertOn(false); @@ -24,7 +24,8 @@ Simulator::Simulator() { #ifdef VCD_OUTPUT Verilated::traceEverOn(true); - trace_ = new VerilatedVcdC; + trace_ = new VerilatedVcdC(); + trace_->set_time_unit("1ns"); vortex_->trace(trace_, 99); trace_->open("trace.vcd"); #endif diff --git a/runtime/src/vx_intrinsics.S b/runtime/src/vx_intrinsics.S index b0dbd628..c6b0008d 100644 --- a/runtime/src/vx_intrinsics.S +++ b/runtime/src/vx_intrinsics.S @@ -83,11 +83,11 @@ vx_num_cores: .type vx_num_cycles, @function .global vx_num_cycles vx_num_cycles: - csrr a0, CSR_CYCLL + csrr a0, CSR_CYCLE_L ret .type vx_num_instrs, @function .global vx_num_instrs vx_num_instrs: - csrr a0, CSR_INSTL + csrr a0, CSR_INSTR_L ret \ No newline at end of file diff --git a/runtime/src/vx_start.S b/runtime/src/vx_start.S index 029fbb90..62eb8025 100644 --- a/runtime/src/vx_start.S +++ b/runtime/src/vx_start.S @@ -49,7 +49,7 @@ vx_set_sp: slli a1, a1, 10 # multiply by 1024 csrr a2, CSR_LTID # get local thread id slli a2, a2, 2 # multiply by 4 - lui sp, STACK_BASE_ADDR # load base sp + lui sp, (SHARED_MEM_BASE_ADDR>>12) # load base sp sub sp, sp, a1 # sub thread block add sp, sp, a2 # reduce addr collision for perf diff --git a/runtime/tests/simple/Makefile b/runtime/tests/simple/Makefile index db04672b..1c203ec7 100644 --- a/runtime/tests/simple/Makefile +++ b/runtime/tests/simple/Makefile @@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy CFLAGS += -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections -CFLAGS += -I$(VORTEX_RT_PATH)/include +CFLAGS += -I$(VORTEX_RT_PATH)/include -I../../../hw LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a diff --git a/runtime/tests/simple/main.c b/runtime/tests/simple/main.c index 7c5c40ab..1d177987 100644 --- a/runtime/tests/simple/main.c +++ b/runtime/tests/simple/main.c @@ -3,6 +3,7 @@ #include #include #include +#include typedef struct @@ -100,7 +101,7 @@ int main() test_wsapwn(); vx_print_str("Shared Memory test\n"); - unsigned * ptr = (unsigned *) 0xFFFF0000; + unsigned * ptr = (unsigned *) SHARED_MEM_BASE_ADDR; unsigned value = 0; for (int i = 0; i < 5; i++) { @@ -112,7 +113,6 @@ int main() vx_print_str("-------------------\n"); value++; ptr++; - } vx_print_str("vx_spawn_warps mat_add_kernel\n"); diff --git a/runtime/tests/simple/tests.c b/runtime/tests/simple/tests.c index 3ad7860f..1efd0a3a 100644 --- a/runtime/tests/simple/tests.c +++ b/runtime/tests/simple/tests.c @@ -125,16 +125,4 @@ void intrinsics_tests() // Test wspawn vx_print_str("test_spawn\n"); test_wsapwn(); -} - - - - - - - - - - - - +} \ No newline at end of file diff --git a/runtime/tests/simple/vx_simple.dump b/runtime/tests/simple/vx_simple.dump index 5175c11c..6ab5c371 100644 --- a/runtime/tests/simple/vx_simple.dump +++ b/runtime/tests/simple/vx_simple.dump @@ -81,14 +81,14 @@ Disassembly of section .text: 80000114: 3a8000ef jal ra,800004bc 80000118: 80001537 lui a0,0x80001 8000011c: ddc50513 addi a0,a0,-548 # 80000ddc <__global_pointer$+0xfffff5d4> -80000120: ffff0437 lui s0,0xffff0 +80000120: 6ffff437 lui s0,0x6ffff 80000124: 628000ef jal ra,8000074c 80000128: 00000493 li s1,0 8000012c: 80001b37 lui s6,0x80001 80000130: 80001ab7 lui s5,0x80001 80000134: 80001a37 lui s4,0x80001 80000138: 800019b7 lui s3,0x80001 -8000013c: 01440913 addi s2,s0,20 # ffff0014 <__global_pointer$+0x7ffee80c> +8000013c: 01440913 addi s2,s0,20 # 6ffff014 <_start-0x10000fec> 80000140: 00942023 sw s1,0(s0) 80000144: 00040593 mv a1,s0 80000148: df0b0513 addi a0,s6,-528 # 80000df0 <__global_pointer$+0xfffff5e8> diff --git a/runtime/tests/simple/vx_simple.elf b/runtime/tests/simple/vx_simple.elf index c4ce71a2..a7a3480a 100755 Binary files a/runtime/tests/simple/vx_simple.elf and b/runtime/tests/simple/vx_simple.elf differ diff --git a/runtime/tests/simple/vx_simple.hex b/runtime/tests/simple/vx_simple.hex index ba10b414..35888614 100644 --- a/runtime/tests/simple/vx_simple.hex +++ b/runtime/tests/simple/vx_simple.hex @@ -17,7 +17,7 @@ :1000E80037150080130585DBEF00C0651305400058 :1000F800EF00405EEF00402E13051000EF00805D1A :10010800371500801305C5DCEF00C063EF00803AA7 -:10011800371500801305C5DD3704FFFFEF00806247 +:10011800371500801305C5DD37F4FF6FEF008062E7 :1001280093040000371B0080B71A0080371A00803C :10013800B719008013094401232094009305040093 :1001480013050BDFEF00006E9385040013858ADF2B