diff --git a/driver/opae/Makefile b/driver/opae/Makefile index 09ab5d79..17572e34 100644 --- a/driver/opae/Makefile +++ b/driver/opae/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -I../include -I$(OPAE_HOME)/include -I../../hw LDFLAGS += -L$(OPAE_HOME)/lib -SCOPE=1 +#SCOPE=1 # stack execution protection LDFLAGS +=-z noexecstack @@ -32,8 +32,6 @@ ASE_LIBS += -luuid -lopae-c-ase VLSIM_LIBS += -lopae-c-vlsim -LIB_DIR=../lib - ASE_DIR = ase VLSIM_DIR = vlsim @@ -67,10 +65,10 @@ fpga: $(SRCS) asesim: $(SRCS) $(ASE_DIR) $(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $(PROJECT_ASE) -vlsim: $(SRCS) opae-vlsim - $(CXX) $(CXXFLAGS) -L./vlsim -DUSE_VLSIM $(SRCS) $(LDFLAGS) $(VLSIM_LIBS) -o $(PROJECT_VLSIM) +vlsim: $(SRCS) vlsim-hw + $(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -L./vlsim $(VLSIM_LIBS) -o $(PROJECT_VLSIM) -opae-vlsim: +vlsim-hw: $(SET_SCOPE) $(MAKE) -C vlsim vortex.o: vortex.cpp diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index e3b52ec7..251850cc 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -15,8 +15,8 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_CORE_REQ_INFO DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE -DBG_FLAGS += $(DBG_PRINT_FLAGS) -DBG_FLAGS += -DDBG_CORE_REQ_INFO +#DBG_FLAGS += $(DBG_PRINT_FLAGS) +#DBG_FLAGS += -DDBG_CORE_REQ_INFO #CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 @@ -72,9 +72,13 @@ ifdef SCOPE SCOPE_VH = $(RTL_DIR)/scope-defs.vh endif +# use our OPAE shim VL_FLAGS += -DNOPAE CFLAGS += -DNOPAE +# use DPI FPU +VL_FLAGS += -DFPU_FAST + RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip PROJECT = libopae-c-vlsim.so diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index 703aca7c..bc57aa39 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -31,7 +31,7 @@ fpga_result res = _expr; \ if (res == FPGA_OK) \ break; \ - printf("OPAE Error: '%s' returned %d, %s!\n", \ + 
printf("[VXDRV] Error: '%s' returned %d, %s!\n", \ #_expr, (int)res, fpgaErrStr(res)); \ return -1; \ } while (false) @@ -118,7 +118,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { *value = STARTUP_ADDR; break; default: - fprintf(stderr, "invalid caps id: %d\n", caps_id); + fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); return -1; } @@ -156,7 +156,7 @@ extern int vx_dev_open(vx_device_h* hdevice) { fpgaDestroyProperties(&filter); if (num_matches < 1) { - fprintf(stderr, "Accelerator %s not found!\n", AFU_ACCEL_UUID); + fprintf(stderr, "[VXDRV] Error: accelerator %s not found!\n", AFU_ACCEL_UUID); return -1; } @@ -197,9 +197,10 @@ extern int vx_dev_open(vx_device_h* hdevice) { fpgaClose(accel_handle); return ret; } - - fprintf(stdout, "DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n", + #ifndef NDEBUG + fprintf(stdout, "[VXDRV] DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n", device->implementation_id, device->num_cores, device->num_warps, device->num_threads); + #endif } #ifdef SCOPE @@ -236,18 +237,18 @@ extern int vx_dev_close(vx_device_h hdevice) { int ret = vx_get_perf(hdevice, core_id, &instrs, &cycles); assert(ret == 0); float IPC = (float)(double(instrs) / double(cycles)); - fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); + fprintf(stdout, "[VXDRV] PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); total_instrs += instrs; total_cycles = std::max(total_cycles, cycles); } float IPC = (float)(double(total_instrs) / double(total_cycles)); - fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); + fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); } else { uint64_t instrs, cycles; int ret = vx_get_perf(hdevice, 0, &instrs, &cycles); float IPC = (float)(double(instrs) / double(cycles)); assert(ret 
== 0); - fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); + fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); } #endif @@ -373,7 +374,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &data)); if (0 == data || 0 == timeout) { if (data != 0) { - fprintf(stdout, "ready-wait timed out: status=%ld\n", data); + fprintf(stdout, "[VXDRV] ready-wait timed out: status=%ld\n", data); } break; } diff --git a/driver/tests/dogfood/dogfood.cpp b/driver/tests/dogfood/dogfood.cpp index 3cbd5fea..5d2e6016 100644 --- a/driver/tests/dogfood/dogfood.cpp +++ b/driver/tests/dogfood/dogfood.cpp @@ -90,16 +90,20 @@ vx_buffer_h dst_buf = nullptr; static void show_usage() { std::cout << "Vortex Driver Test." << std::endl; - std::cout << "Usage: [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl; + std::cout << "Usage: [-t:testid] [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "n:s:e:k:ch?")) != -1) { + while ((c = getopt(argc, argv, "n:t:s:e:k:ch?")) != -1) { switch (c) { case 'n': count = atoi(optarg); break; + case 't': + testid_s = atoi(optarg); + testid_e = atoi(optarg); + break; case 's': testid_s = atoi(optarg); break; diff --git a/hw/opae/README b/hw/opae/README index 5765123b..84e08e88 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -60,9 +60,9 @@ qsub-sim make ase # tests -./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -t1 -n1 +./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n16 ./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16 -./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n1 -s4 -e4 +./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n16 ./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd # modify "vsim_run.tcl" to 
dump VCD trace @@ -97,7 +97,7 @@ kill -9 # fixing device resource busy issue when deleting /build_ase_1c/ lsof +D build_ase_1c -# quick off cache synthesis +# kick off synthesis make -C pipeline clean && make -C pipeline > pipeline/build.log 2>&1 & make -C cache clean && make -C cache > cache/build.log 2>&1 & make -C core clean && make -C core > core/build.log 2>&1 & diff --git a/hw/opae/sources_1c.txt b/hw/opae/sources_1c.txt index 5c63e9cd..897468c2 100644 --- a/hw/opae/sources_1c.txt +++ b/hw/opae/sources_1c.txt @@ -3,7 +3,7 @@ +define+SYNTHESIS +define+QUARTUS +define+FPU_FAST -+define+SCOPE +#+define+SCOPE #+define+DBG_PRINT_CORE_ICACHE #+define+DBG_PRINT_CORE_DCACHE diff --git a/hw/opae/vortex_afu.qsf b/hw/opae/vortex_afu.qsf index 1356ecb4..f515b639 100644 --- a/hw/opae/vortex_afu.qsf +++ b/hw/opae/vortex_afu.qsf @@ -6,4 +6,21 @@ set_global_assignment -name VERILOG_MACRO QUARTUS set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG set_global_assignment -name MESSAGE_DISABLE 16818 -set_global_assignment -name VERILOG_MACRO FPU_FAST \ No newline at end of file +set_global_assignment -name VERILOG_MACRO FPU_FAST + +set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 +set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 +set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)" +set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON +set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON +set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON +set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON +set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON +set_global_assignment -name POWER_USE_TA_VALUE 65 +set_global_assignment -name SEED 1 +set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON +set_global_assignment -name FITTER_EFFORT "STANDARD FIT" +set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" +set_global_assignment -name 
OPTIMIZATION_TECHNIQUE SPEED +set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM +set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" \ No newline at end of file diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 49783361..e23c4caf 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -175,8 +175,9 @@ logic [31:0] cmd_csr_wdata; // MMIO controller //////////////////////////////////////////////////////////// `IGNORE_WARNINGS_BEGIN -t_ccip_c0_ReqMmioHdr mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr); +t_ccip_c0_ReqMmioHdr mmio_hdr; `IGNORE_WARNINGS_END +assign mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr); `STATIC_ASSERT(($bits(t_ccip_c0_ReqMmioHdr)-$bits(mmio_hdr.address)) == 12, ("Oops!")) @@ -204,9 +205,20 @@ wire [2:0] cmd_type = (cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_hd reg scope_start; `endif +// disable assertions until reset +`ifndef VERILATOR +initial begin + $assertoff; +end +`endif + always_ff @(posedge clk) begin if (reset) begin + `ifndef VERILATOR + $asserton; // enable assertions + `endif + mmio_tx.hdr <= 0; mmio_tx.data <= 0; mmio_tx.mmioRdValid <= 0; @@ -324,7 +336,8 @@ begin end `endif default: begin - `ifdef DBG_PRINT_OPAE + mmio_tx.data <= 64'h0; + `ifdef DBG_PRINT_OPAE $display("%t: Unknown MMIO Rd: addr=%0h", $time, mmio_hdr.address); `endif end diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 26470373..9f4ff5f7 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -59,8 +59,6 @@ `define EXT_F_ENABLE `endif -//`define FPU_FAST - // Device identification `define VENDOR_ID 0 `define ARCHITECTURE_ID 0 diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 17c3d7dd..77a73b9f 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -347,7 +347,7 @@ module VX_decode #( assign decode_if.rd = rd; assign decode_if.rs1 = rs1_qual; assign decode_if.rs2 = rs2; - assign decode_if.rs3 = rs3; + assign decode_if.rs3 = 0; `endif assign 
decode_if.use_rs3 = use_rs3; diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v deleted file mode 100644 index c40df875..00000000 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ /dev/null @@ -1,70 +0,0 @@ -`include "VX_define.vh" - -// control module to support multi-cycle read for fp register - -module VX_gpr_fp_ctrl ( - input wire clk, - input wire reset, - - input wire [`NUM_THREADS-1:0][31:0] rs1_data, - input wire [`NUM_THREADS-1:0][31:0] rs2_data, - VX_gpr_req_if gpr_req_if, - - // outputs - output wire [`NW_BITS+`NR_BITS-1:0] raddr1, - VX_gpr_rsp_if gpr_rsp_if -); - - reg [`NUM_THREADS-1:0][31:0] rsp_rs1_data, rsp_rs2_data, rsp_rs3_data; - reg rsp_valid; - reg [31:0] rsp_pc; - reg [`NW_BITS-1:0] rsp_wid; - reg read_rs1; - - wire rs3_delay = gpr_req_if.valid && gpr_req_if.use_rs3 && read_rs1; - wire read_fire = gpr_req_if.valid && gpr_rsp_if.ready; - - always @(posedge clk) begin - if (reset) begin - rsp_valid <= 0; - rsp_pc <= 0; - rsp_rs1_data <= 0; - rsp_rs2_data <= 0; - rsp_rs3_data <= 0; - rsp_wid <= 0; - read_rs1 <= 1; - end else begin - if (rs3_delay) begin - read_rs1 <= 0; - rsp_wid <= gpr_req_if.wid; - end else if (read_fire) begin - read_rs1 <= 1; - end - - rsp_valid <= gpr_req_if.valid; - rsp_wid <= gpr_req_if.wid; - rsp_pc <= gpr_req_if.PC; - - if (read_rs1) begin - rsp_rs1_data <= (gpr_req_if.rs1 == 0) ? (`NUM_THREADS*32)'(0) : rs1_data; - end - rsp_rs2_data <= (gpr_req_if.rs2 == 0) ? (`NUM_THREADS*32)'(0) : rs2_data; - rsp_rs3_data <= (gpr_req_if.rs1 == 0) ? (`NUM_THREADS*32)'(0) : rs1_data; - - assert(read_rs1 || rsp_wid == gpr_req_if.wid); - end - end - - // outputs - wire [`NR_BITS-1:0] rs1 = read_rs1 ? 
gpr_req_if.rs1 : gpr_req_if.rs3; - assign raddr1 = {gpr_req_if.wid, rs1}; - assign gpr_req_if.ready = ~rs3_delay; - - assign gpr_rsp_if.valid = rsp_valid; - assign gpr_rsp_if.wid = rsp_wid; - assign gpr_rsp_if.PC = rsp_pc; - assign gpr_rsp_if.rs1_data = rsp_rs1_data; - assign gpr_rsp_if.rs2_data = rsp_rs2_data; - assign gpr_rsp_if.rs3_data = rsp_rs3_data; - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_ram.v b/hw/rtl/VX_gpr_ram.v index 352a17e0..f60f1964 100644 --- a/hw/rtl/VX_gpr_ram.v +++ b/hw/rtl/VX_gpr_ram.v @@ -12,21 +12,24 @@ module VX_gpr_ram ( ); `ifndef ASIC - reg [`NUM_THREADS-1:0][3:0][7:0] ram [(`NUM_WARPS * `NUM_REGS)-1:0]; + reg [`NUM_THREADS-1:0][3:0][7:0] mem [(`NUM_WARPS * `NUM_REGS)-1:0]; + reg [`NUM_THREADS-1:0][31:0] q1, q2; always @(posedge clk) begin for (integer i = 0; i < `NUM_THREADS; i++) begin if (we[i]) begin - ram[waddr][i][0] <= wdata[i][07:00]; - ram[waddr][i][1] <= wdata[i][15:08]; - ram[waddr][i][2] <= wdata[i][23:16]; - ram[waddr][i][3] <= wdata[i][31:24]; + mem[waddr][i][0] <= wdata[i][07:00]; + mem[waddr][i][1] <= wdata[i][15:08]; + mem[waddr][i][2] <= wdata[i][23:16]; + mem[waddr][i][3] <= wdata[i][31:24]; end end + q1 <= mem[rs1]; + q2 <= mem[rs2]; end - - assign rs1_data = ram[rs1]; - assign rs2_data = ram[rs2]; + + assign rs1_data = q1; + assign rs2_data = q2; `else @@ -134,4 +137,4 @@ module VX_gpr_ram ( `endif -endmodule +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 23d9db16..9e72c023 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -15,8 +15,15 @@ module VX_gpr_stage #( ); `UNUSED_VAR (reset) + reg rsp_valid; + reg [`NW_BITS-1:0] rsp_wid; + reg [31:0] rsp_pc; + reg rs1_is_zero, rs2_is_zero; + wire [`NUM_THREADS-1:0][31:0] rs1_data, rs2_data; - wire [`NW_BITS+`NR_BITS-1:0] raddr1; + wire [`NW_BITS+`NR_BITS-1:0] raddr1, raddr2; + + assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2}; VX_gpr_ram gpr_ram ( .clk (clk), @@ -24,60 +31,77 
@@ module VX_gpr_stage #( .waddr ({writeback_if.wid, writeback_if.rd}), .wdata (writeback_if.data), .rs1 (raddr1), - .rs2 ({gpr_req_if.wid, gpr_req_if.rs2}), + .rs2 (raddr2), .rs1_data (rs1_data), .rs2_data (rs2_data) - ); + ); -`ifdef EXT_F_ENABLE - VX_gpr_fp_ctrl VX_gpr_fp_ctrl ( - .clk (clk), - .reset (reset), - .rs1_data (rs1_data), - .rs2_data (rs2_data), - .raddr1 (raddr1), - .gpr_req_if (gpr_req_if), - .gpr_rsp_if (gpr_rsp_if) - ); -`else - reg [`NUM_THREADS-1:0][31:0] rsp_rs1_data, rsp_rs2_data; - reg rsp_valid; - reg [`NW_BITS-1:0] rsp_wid; - reg [31:0] rsp_pc; - always @(posedge clk) begin if (reset) begin - rsp_valid <= 0; - rsp_wid <= 0; - rsp_pc <= 0; - rsp_rs1_data <= 0; - rsp_rs2_data <= 0; + rsp_valid <= 0; + rsp_wid <= 0; + rsp_pc <= 0; + rs1_is_zero <= 0; + rs2_is_zero <= 0; end else begin - rsp_valid <= gpr_req_if.valid; - rsp_wid <= gpr_req_if.wid; - rsp_pc <= gpr_req_if.PC; - rsp_rs1_data <= (gpr_req_if.rs1 == 0) ? (`NUM_THREADS*32)'(0) : rs1_data; - rsp_rs2_data <= (gpr_req_if.rs2 == 0) ? (`NUM_THREADS*32)'(0) : rs2_data; + rsp_valid <= gpr_req_if.valid; + rsp_wid <= gpr_req_if.wid; + rsp_pc <= gpr_req_if.PC; + rs1_is_zero <= (0 == gpr_req_if.rs1); + rs2_is_zero <= (0 == gpr_req_if.rs2); end + end + +`ifdef EXT_F_ENABLE + + reg [`NUM_THREADS-1:0][31:0] rs3_data; + reg read_rs3, save_rs3; + + wire rs3_delay = gpr_req_if.valid && gpr_req_if.use_rs3 && !read_rs3; + wire read_fire = gpr_req_if.valid && gpr_rsp_if.ready; + + always @(posedge clk) begin + if (reset) begin + rs3_data <= 0; + read_rs3 <= 0; + end else begin + if (rs3_delay) begin + read_rs3 <= 1; + save_rs3 <= 1; + end else if (read_fire) begin + read_rs3 <= 0; + end + if (save_rs3) begin + rs3_data <= rs1_data; + save_rs3 <= 0; + end + assert(!read_rs3 || rsp_wid == gpr_req_if.wid); + end end + assign raddr1 = {gpr_req_if.wid, (rs3_delay ? 
gpr_req_if.rs3 : gpr_req_if.rs1)}; + assign gpr_req_if.ready = ~rs3_delay; + assign gpr_rsp_if.rs3_data = rs3_data; + +`else + assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1}; - assign gpr_req_if.ready = 1; - - assign gpr_rsp_if.valid = rsp_valid; - assign gpr_rsp_if.wid = rsp_wid; - assign gpr_rsp_if.PC = rsp_pc; - assign gpr_rsp_if.rs1_data = rsp_rs1_data; - assign gpr_rsp_if.rs2_data = rsp_rs2_data; assign gpr_rsp_if.rs3_data = 0; `UNUSED_VAR (gpr_req_if.valid); `UNUSED_VAR (gpr_req_if.rs3); `UNUSED_VAR (gpr_req_if.use_rs3); `UNUSED_VAR (gpr_rsp_if.ready); + `endif + + assign gpr_rsp_if.rs1_data = rs1_is_zero ? (`NUM_THREADS*32)'(0) : rs1_data; + assign gpr_rsp_if.rs2_data = rs2_is_zero ? (`NUM_THREADS*32)'(0) : rs2_data; + assign gpr_rsp_if.valid = rsp_valid; + assign gpr_rsp_if.wid = rsp_wid; + assign gpr_rsp_if.PC = rsp_pc; assign writeback_if.ready = 1'b1; -endmodule +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 8c712eff..b3e6fd9d 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -20,13 +20,13 @@ module VX_ibuffer #( localparam ADDRW = $clog2(SIZE); localparam NWARPSW = $clog2(`NUM_WARPS+1); + reg [SIZEW-1:0] size_r [`NUM_WARPS-1:0]; + wire [`NUM_WARPS-1:0] q_full; wire [`NUM_WARPS-1:0][SIZEW-1:0] q_size; wire [DATAW-1:0] q_data_in; - wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev; - + wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev; reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out; - reg [SIZEW-1:0] size_r [`NUM_WARPS-1:0]; wire enq_fire = ibuf_enq_if.valid && ibuf_enq_if.ready; wire deq_fire = ibuf_deq_if.valid && ibuf_deq_if.ready; @@ -36,7 +36,7 @@ module VX_ibuffer #( wire writing = enq_fire && (i == ibuf_enq_if.wid); wire reading = deq_fire && (i == ibuf_deq_if.wid); - wire is_slot0 = ((0 == size_r[i]) || ((1 == size_r[i]) && reading)); + wire is_slot0 = (0 == size_r[i]) || ((1 == size_r[i]) && reading); wire push = writing && !is_slot0; wire pop = reading && (size_r[i] != 1); @@ -48,36 +48,37 @@ 
module VX_ibuffer #( .clk (clk), .reset (reset), .push (push), - .data_in (q_data_in), .pop (pop), + .data_in (q_data_in), .data_out (q_data_prev[i]), `UNUSED_PIN (empty), `UNUSED_PIN (full), `UNUSED_PIN (size) ); - always @(posedge clk) begin - if (writing && is_slot0) begin - q_data_out[i] <= q_data_in; - end - if (pop) begin - q_data_out[i] <= q_data_prev[i]; - end - end - always @(posedge clk) begin if (reset) begin size_r[i] <= 0; - end else begin - if (writing && !reading) begin - size_r[i] <= size_r[i] + SIZEW'(1); + end else begin + if (writing) begin + if (is_slot0) begin + q_data_out[i] <= q_data_in; + end + if (!reading) begin + size_r[i] <= size_r[i] + SIZEW'(1); + end end - if (reading && !writing) begin - size_r[i] <= size_r[i] - SIZEW'(1); + if (reading) begin + if (size_r[i] != 1) begin + q_data_out[i] <= q_data_prev[i]; + end + if (!writing) begin + size_r[i] <= size_r[i] - SIZEW'(1); + end end end end - + assign q_full[i] = (size_r[i] == SIZE); assign q_size[i] = size_r[i]; end diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index e0bf94af..1b957271 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -29,7 +29,7 @@ module VX_icache_stage #( wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[0][`NW_BITS-1:0]; always @(posedge clk) begin - if (icache_req_fire) begin + if (icache_req_fire) begin rsp_PC_buf[req_tag] <= ifetch_req_if.PC; rsp_tmask_buf[req_tag] <= ifetch_req_if.tmask; end diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index a377c461..d30120dd 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -41,9 +41,9 @@ `define STRINGIFY(x) `"x`" -`define STATIC_ASSERT(cond, msg) \ - generate \ - if (!(cond)) $error msg; \ +`define STATIC_ASSERT(cond, msg) \ + generate \ + if (!(cond)) $error msg; \ endgenerate `define ENABLE_TRACING /* verilator tracing_on */ @@ -51,8 +51,8 @@ /////////////////////////////////////////////////////////////////////////////// -`define USE_FAST_BRAM (* 
syn_ramstyle = "mlab" *) -`define RELAXED_RW_BRAM (* syn_ramstyle = "no_rw_check" *) +`define USE_FAST_BRAM (* ramstyle="mlab" *) +`define NO_RW_RAM_CHECK (* ramstyle="no_rw_check" *) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 625c0e53..35f99563 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -447,6 +447,8 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = inst_meta_st1; + end else begin + assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = 0; end `endif diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index 74745ceb..9f201223 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -58,6 +58,7 @@ module VX_cache_miss_resrv #( ); reg [`MRVQ_METADATA_WIDTH-1:0] metadata_table[MRVQ_SIZE-1:0]; reg [MRVQ_SIZE-1:0][`LINE_ADDR_WIDTH-1:0] addr_table; + reg [MRVQ_SIZE-1:0] valid_table; reg [MRVQ_SIZE-1:0] ready_table; reg [`LOG2UP(MRVQ_SIZE)-1:0] schedule_ptr; diff --git a/hw/rtl/cache/VX_tag_data_store.v b/hw/rtl/cache/VX_tag_data_store.v index e0f356cc..a03c890b 100644 --- a/hw/rtl/cache/VX_tag_data_store.v +++ b/hw/rtl/cache/VX_tag_data_store.v @@ -30,7 +30,6 @@ module VX_tag_data_store #( input wire fill_sent ); - reg [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0][7:0] data [`BANK_LINE_COUNT-1:0]; reg [`TAG_SELECT_BITS-1:0] tag [`BANK_LINE_COUNT-1:0]; reg [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0] dirtyb[`BANK_LINE_COUNT-1:0]; reg [`BANK_LINE_COUNT-1:0] dirty; @@ -40,8 +39,7 @@ module VX_tag_data_store #( assign read_dirty = dirty [read_addr]; assign read_dirtyb = dirtyb [read_addr]; assign read_tag = tag [read_addr]; - assign read_data = data [read_addr]; - + wire do_write = 
(| write_enable); always @(posedge clk) begin @@ -69,15 +67,26 @@ module VX_tag_data_store #( if (invalidate) begin valid[write_addr] <= 0; end - - for (integer j = 0; j < `BANK_LINE_WORDS; j++) begin - for (integer i = 0; i < WORD_SIZE; i++) begin - if (write_enable[j][i]) begin - data[write_addr][j][i] <= write_data[j * `WORD_WIDTH + i * 8 +: 8]; - end - end - end end end + wire [(`BANK_LINE_WORDS * WORD_SIZE)-1:0] ram_wren; + assign ram_wren = write_enable & {(`BANK_LINE_WORDS * WORD_SIZE){!stall_bank_pipe}}; + + VX_dp_ram #( + .DATAW(`BANK_LINE_WORDS * WORD_SIZE * 8), + .SIZE(`BANK_LINE_COUNT), + .BYTEENW(`BANK_LINE_WORDS * WORD_SIZE), + .BUFFERED(0), + .RWCHECK(1) + ) dp_ram ( + .clk(clk), + .waddr(write_addr), + .raddr(read_addr), + .wren(ram_wren), + .rden(1'b1), + .din(write_data), + .dout(read_data) + ); + endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_dp_ram.v b/hw/rtl/libs/VX_dp_ram.v new file mode 100644 index 00000000..b7d70789 --- /dev/null +++ b/hw/rtl/libs/VX_dp_ram.v @@ -0,0 +1,117 @@ +`include "VX_platform.vh" + +module VX_dp_ram #( + parameter DATAW = 1, + parameter SIZE = 1, + parameter BYTEENW = 1, + parameter BUFFERED = 1, + parameter RWCHECK = 1, + parameter ADDRW = $clog2(SIZE), + parameter SIZEW = $clog2(SIZE+1) +) ( + input wire clk, + input wire [ADDRW-1:0] waddr, + input wire [ADDRW-1:0] raddr, + input wire [BYTEENW-1:0] wren, + input wire rden, + input wire [DATAW-1:0] din, + output wire [DATAW-1:0] dout +); + + if (BUFFERED) begin + + reg [DATAW-1:0] mem [SIZE-1:0]; + reg [DATAW-1:0] dout_r; + + if (BYTEENW > 1) begin + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8]; + end + if (rden) + dout_r <= mem[raddr]; + end + end else begin + always @(posedge clk) begin + if (wren) + mem[waddr] <= din; + if (rden) + dout_r <= mem[raddr]; + end + end + + assign dout = dout_r; + + end else begin + + `UNUSED_VAR(rden) + + if (RWCHECK) begin + + 
reg [DATAW-1:0] mem [SIZE-1:0]; + + if (BYTEENW > 1) begin + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8]; + end + end + end else begin + always @(posedge clk) begin + if (wren) + mem[waddr] <= din; + end + end + + `ifdef SYNTHESIS + reg [DATAW-1:0] din_r; + wire writing; + + if (BYTEENW > 1) begin + assign writing = (| wren); + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + din_r[i * 8 +: 8] <= wren[i] ? din[i * 8 +: 8] : mem[waddr][i * 8 +: 8]; + end + end + end else begin + assign writing = wren; + always @(posedge clk) begin + din_r <= din; + end + end + + reg bypass_r; + always @(posedge clk) begin + bypass_r <= writing && (raddr == waddr); + end + + assign dout = bypass_r ? din_r : mem[raddr]; + `else + assign dout = mem[raddr]; + `endif + + end else begin + + reg [DATAW-1:0] mem [SIZE-1:0]; + + if (BYTEENW > 1) begin + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8]; + end + end + end else begin + always @(posedge clk) begin + if (wren) + mem[waddr] <= din; + end + end + assign dout = mem[raddr]; + end + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index a14f4ec9..3c5c9a78 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -18,16 +18,11 @@ module VX_generic_queue #( output wire [SIZEW-1:0] size ); `STATIC_ASSERT(`ISPOW2(SIZE), ("must be 0 or power of 2!")) + + if (SIZE == 1) begin - always @(*) begin - assert(!pop || !empty); - assert(!push || !full); - end - - if (SIZE == 1) begin // (SIZE == 1) - - reg [SIZEW-1:0] size_r; reg [DATAW-1:0] head_r; + reg size_r; always @(posedge clk) begin if (reset) begin @@ -35,8 +30,10 @@ module VX_generic_queue #( size_r <= 0; end else begin if (push && !pop) begin + assert(!full); size_r <= 1; end else if (pop && 
!push) begin + assert(!empty); size_r <= 0; end if (push) begin @@ -50,63 +47,14 @@ module VX_generic_queue #( assign full = (size_r != 0); assign size = size_r; - end else begin // (SIZE > 1) - - `ifdef QUARTUS - - scfifo scfifo_component ( - .clock (clk), - .data (data_in), - .rdreq (pop), - .wrreq (push), - .empty (empty), - .full (full), - .q (data_out), - .sclr (reset), - .usedw (), - .aclr (), - .almost_empty (), - .almost_full (), - .eccstatus () - ); - - defparam - scfifo_component.lpm_type = "scfifo", - scfifo_component.intended_device_family = "Arria 10", - scfifo_component.lpm_numwords = SIZE, - scfifo_component.lpm_width = DATAW, - scfifo_component.lpm_widthu = $clog2(SIZE), - scfifo_component.lpm_showahead = "ON", - scfifo_component.add_ram_output_register = (BUFFERED ? "ON" : "ON"), - scfifo_component.use_eab = "ON"; - - reg [SIZEW-1:0] size_r; - - always @(posedge clk) begin - if (reset) begin - size_r <= 0; - end else begin - if (push && !pop) begin - size_r <= size_r + SIZEW'(1); - end - if (pop && !push) begin - size_r <= size_r - SIZEW'(1); - end - end - end - - assign size = size_r; - - `else - - `USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0]; + end else begin if (0 == BUFFERED) begin - reg [SIZEW-1:0] size_r; reg [ADDRW:0] rd_ptr_r; reg [ADDRW:0] wr_ptr_r; - + reg [ADDRW-1:0] used_r; + wire [ADDRW-1:0] rd_ptr_a = rd_ptr_r[ADDRW-1:0]; wire [ADDRW-1:0] wr_ptr_a = wr_ptr_r[ADDRW-1:0]; @@ -114,111 +62,127 @@ module VX_generic_queue #( if (reset) begin rd_ptr_r <= 0; wr_ptr_r <= 0; - size_r <= 0; + used_r <= 0; end else begin - if (push) begin + if (push) begin + assert(!full); wr_ptr_r <= wr_ptr_r + (ADDRW+1)'(1); if (!pop) begin - size_r <= size_r + SIZEW'(1); + used_r <= used_r + ADDRW'(1); end end if (pop) begin + assert(!empty); rd_ptr_r <= rd_ptr_r + (ADDRW+1)'(1); if (!push) begin - size_r <= size_r - SIZEW'(1); + used_r <= used_r - ADDRW'(1); end end end end - always @(posedge clk) begin - if (push) begin - data[wr_ptr_a] <= data_in; - end - 
end - - assign data_out = data[rd_ptr_a]; - assign empty = (wr_ptr_r == rd_ptr_r); - assign full = (wr_ptr_a == rd_ptr_a) && (wr_ptr_r[ADDRW] != rd_ptr_r[ADDRW]); - assign size = size_r; + VX_dp_ram #( + .DATAW(DATAW), + .SIZE(SIZE), + .BUFFERED(0), + .RWCHECK(1) + ) dp_ram ( + .clk(clk), + .waddr(wr_ptr_a), + .raddr(rd_ptr_a), + .wren(push), + .rden(pop), + .din(data_in), + .dout(data_out) + ); + + assign empty = (wr_ptr_r == rd_ptr_r); + assign full = (wr_ptr_a == rd_ptr_a) && (wr_ptr_r[ADDRW] != rd_ptr_r[ADDRW]); + assign size = {full, used_r}; end else begin - reg [SIZEW-1:0] size_r; - reg [DATAW-1:0] head_r; - reg [DATAW-1:0] curr_r; + wire [DATAW-1:0] dout; + + reg [DATAW-1:0] din_r; reg [ADDRW-1:0] wr_ptr_r; reg [ADDRW-1:0] rd_ptr_r; - reg [ADDRW-1:0] rd_ptr_next_r; + reg [ADDRW-1:0] rd_ptr_n_r; + reg [ADDRW-1:0] used_r; reg empty_r; reg full_r; reg bypass_r; always @(posedge clk) begin - if (reset) begin - size_r <= 0; - curr_r <= 0; - wr_ptr_r <= 0; - rd_ptr_r <= 0; - rd_ptr_next_r <= 1; - empty_r <= 1; - full_r <= 0; + if (reset) begin + wr_ptr_r <= 0; + rd_ptr_r <= 0; + rd_ptr_n_r <= 1; + empty_r <= 1; + full_r <= 0; + used_r <= 0; end else begin if (push) begin wr_ptr_r <= wr_ptr_r + ADDRW'(1); if (!pop) begin empty_r <= 0; - if (size_r == SIZEW'(SIZE-1)) begin + if (used_r == ADDRW'(SIZE-1)) begin full_r <= 1; end - size_r <= size_r + SIZEW'(1); + used_r <= used_r + ADDRW'(1); end end if (pop) begin - rd_ptr_r <= rd_ptr_next_r; + rd_ptr_r <= rd_ptr_n_r; if (SIZE > 2) begin - rd_ptr_next_r <= rd_ptr_r + ADDRW'(2); + rd_ptr_n_r <= rd_ptr_r + ADDRW'(2); end else begin // (SIZE == 2); - rd_ptr_next_r <= ~rd_ptr_next_r; + rd_ptr_n_r <= ~rd_ptr_n_r; end - if (!push) begin - if (size_r == SIZEW'(1)) begin - assert(rd_ptr_next_r == wr_ptr_r); + if (!push) begin + full_r <= 0; + if (used_r == ADDRW'(1)) begin + assert(rd_ptr_n_r == wr_ptr_r); empty_r <= 1; - end; - full_r <= 0; - size_r <= size_r - SIZEW'(1); + end; + used_r <= used_r - ADDRW'(1); end end - - 
bypass_r <= push && (empty_r || ((size_r == SIZEW'(1)) && pop)); - curr_r <= data_in; end end always @(posedge clk) begin - if (reset) begin - head_r <= 0; - end else begin - if (push) begin - data[wr_ptr_r] <= data_in; - end - head_r <= data[pop ? rd_ptr_next_r : rd_ptr_r]; - end - end + if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin + bypass_r <= 1; + din_r <= data_in; + end else if (pop) + bypass_r <= 0; + end - assign data_out = bypass_r ? curr_r : head_r; + VX_dp_ram #( + .DATAW(DATAW), + .SIZE(SIZE), + .BUFFERED(1), + .RWCHECK(0) + ) dp_ram ( + .clk(clk), + .waddr(wr_ptr_r), + .raddr(rd_ptr_n_r), + .wren(push), + .rden(pop), + .din(data_in), + .dout(dout) + ); + + assign data_out = bypass_r ? din_r : dout; assign empty = empty_r; assign full = full_r; - assign size = size_r; + assign size = {full_r, used_r}; end - - `endif - end endmodule diff --git a/hw/syn/quartus/project.tcl b/hw/syn/quartus/project.tcl index 93102ec5..26d41900 100644 --- a/hw/syn/quartus/project.tcl +++ b/hw/syn/quartus/project.tcl @@ -43,17 +43,19 @@ set_global_assignment -name VERILOG_MACRO FPU_FAST set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)" -set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" -set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON -set_global_assignment -name FITTER_EFFORT "STANDARD FIT" set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON -set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON set_global_assignment -name POWER_USE_TA_VALUE 65 set_global_assignment -name SEED 1 +set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING 
ON +set_global_assignment -name FITTER_EFFORT "STANDARD FIT" +set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" +set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED +set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM +set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" set idx 0 foreach arg $q_args_orig {