diff --git a/benchmarks/new_opencl/bfs/Makefile b/benchmarks/new_opencl/bfs/Makefile index 9bf33d90..0eb063ae 100644 --- a/benchmarks/new_opencl/bfs/Makefile +++ b/benchmarks/new_opencl/bfs/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = bfs @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so b/benchmarks/new_opencl/compiler/lib/libOpenCL.so deleted file mode 100644 index 70df4b33..00000000 Binary files a/benchmarks/new_opencl/compiler/lib/libOpenCL.so and /dev/null differ diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so b/benchmarks/new_opencl/compiler/lib/libOpenCL.so new file mode 120000 index 00000000..a2c38614 --- /dev/null +++ b/benchmarks/new_opencl/compiler/lib/libOpenCL.so @@ -0,0 +1 @@ +libOpenCL.so.2 \ No newline at end of file diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 deleted file mode 100644 index 70df4b33..00000000 Binary files a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 and /dev/null differ diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 new file mode 120000 index 00000000..e03f1782 --- /dev/null +++ b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 @@ -0,0 
+1 @@ +libOpenCL.so.2.5.0 \ No newline at end of file diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 index 70df4b33..10ff7140 100644 Binary files a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 and b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 differ diff --git a/benchmarks/new_opencl/guassian/Makefile b/benchmarks/new_opencl/guassian/Makefile index 0bec2a95..0b73b9aa 100644 --- a/benchmarks/new_opencl/guassian/Makefile +++ b/benchmarks/new_opencl/guassian/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = guassian @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/kmeans/Makefile b/benchmarks/new_opencl/kmeans/Makefile index 267bbf68..85d1b36c 100644 --- a/benchmarks/new_opencl/kmeans/Makefile +++ b/benchmarks/new_opencl/kmeans/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = kmeans @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) 
run-ase: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/nearn/Makefile b/benchmarks/new_opencl/nearn/Makefile index 0dcc3c01..d767a48d 100644 --- a/benchmarks/new_opencl/nearn/Makefile +++ b/benchmarks/new_opencl/nearn/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = nearn @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/saxpy/Makefile b/benchmarks/new_opencl/saxpy/Makefile index 329c64d6..c7135458 100644 --- a/benchmarks/new_opencl/saxpy/Makefile +++ b/benchmarks/new_opencl/saxpy/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = saxpy @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) 
kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/sfilter/Makefile b/benchmarks/new_opencl/sfilter/Makefile index d35fad20..5fd0a084 100644 --- a/benchmarks/new_opencl/sfilter/Makefile +++ b/benchmarks/new_opencl/sfilter/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = sfilter @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/sgemm/Makefile b/benchmarks/new_opencl/sgemm/Makefile index 5b607a80..e928eca9 100644 --- a/benchmarks/new_opencl/sgemm/Makefile +++ b/benchmarks/new_opencl/sgemm/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = sgemm @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) 
kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/vecadd/Makefile b/benchmarks/new_opencl/vecadd/Makefile index 0cd03fae..0bd5cd86 100644 --- a/benchmarks/new_opencl/vecadd/Makefile +++ b/benchmarks/new_opencl/vecadd/Makefile @@ -7,7 +7,7 @@ CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-error CXXFLAGS += -I$(POCLRT_PATH)/include -LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/dummy -lOpenCL -lvortex PROJECT = vecadd @@ -25,7 +25,7 @@ run-fpga: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/benchmarks/new_opencl/vecadd/kernel.pocl b/benchmarks/new_opencl/vecadd/kernel.pocl index a010a9fc..2dab3524 100644 Binary files a/benchmarks/new_opencl/vecadd/kernel.pocl and b/benchmarks/new_opencl/vecadd/kernel.pocl differ diff --git a/driver/hw/Makefile b/driver/hw/Makefile index a5333bb0..7ac422aa 100644 --- a/driver/hw/Makefile +++ b/driver/hw/Makefile @@ -1,35 +1,36 @@ -BUILD_DIR=build_ase +ASE_BUILD_DIR=build_ase +FPGA_BUILD_DIR=build_fpga all: ase fpga ase: setup-ase - make -C $(BUILD_DIR) + make -C $(ASE_BUILD_DIR) fpga: setup-fpga - cd build_fpga && qsub-synth + cd $(FPGA_BUILD_DIR) && qsub-synth -setup-ase: build_ase/Makefile +setup-ase: 
$(ASE_BUILD_DIR)/Makefile -setup-fpga: build_fpga/build/dcp.qpf +setup-fpga: $(FPGA_BUILD_DIR)/build/dcp.qpf -build_ase/Makefile: - afu_sim_setup --s sources.txt build_ase +$(ASE_BUILD_DIR)/Makefile: + afu_sim_setup --s sources.txt $(ASE_BUILD_DIR) -build_fpga/build/dcp.qpf: - afu_synth_setup -s sources.txt build_fpga +$(FPGA_BUILD_DIR)/build/dcp.qpf: + afu_synth_setup -s sources.txt $(FPGA_BUILD_DIR) run-ase: - cd build_ase && make sim + cd $(ASE_BUILD_DIR) && make sim wave: - vsim -view build_ase/work/vsim.wlf -do wave.do + vsim -view $(ASE_BUILD_DIR)/work/vsim.wlf -do wave.do run-fpga: # TODO clean-ase: - rm -rf build_ase + rm -rf $(ASE_BUILD_DIR) clean-fpga: - rm -rf build_fpga \ No newline at end of file + rm -rf $(FPGA_BUILD_DIR) \ No newline at end of file diff --git a/driver/hw/sources.txt b/driver/hw/sources.txt index be07bef5..26201f2d 100644 --- a/driver/hw/sources.txt +++ b/driver/hw/sources.txt @@ -68,6 +68,7 @@ vortex_afu.json ../../rtl/VX_cache/VX_cache_miss_resrv.v ../../rtl/VX_cache/VX_fill_invalidator.v ../../rtl/VX_cache/VX_tag_data_structure.v +../../rtl/VX_cache/VX_prefetcher.v ../../rtl/cache/VX_generic_pe.v ../../rtl/cache/cache_set.v ../../rtl/cache/VX_d_cache.v diff --git a/driver/hw/vortex_afu.json b/driver/hw/vortex_afu.json index c8adc2e0..d42414fd 100644 --- a/driver/hw/vortex_afu.json +++ b/driver/hw/vortex_afu.json @@ -14,7 +14,7 @@ "cmd-type-read": 1, "cmd-type-write": 2, "cmd-type-run": 3, - "cmd-type-snoop": 4, + "cmd-type-clflush": 4, "afu-top-interface": { diff --git a/driver/hw/vortex_afu.sv b/driver/hw/vortex_afu.sv index c24dab2f..f915bc5f 100644 --- a/driver/hw/vortex_afu.sv +++ b/driver/hw/vortex_afu.sv @@ -34,7 +34,9 @@ module vortex_afu #( ); localparam AVS_RD_QUEUE_SIZE = 16; -localparam VX_SNOOPING_DELAY = 300; + +localparam VX_SNOOP_DELAY = 300; +localparam VX_SNOOP_LEVELS = 2; localparam AFU_ID_L = 16'h0002; // AFU ID Lower localparam AFU_ID_H = 16'h0004; // AFU ID Higher @@ -42,7 +44,7 @@ localparam AFU_ID_H = 
16'h0004; // AFU ID Higher localparam CMD_TYPE_READ = `AFU_IMAGE_CMD_TYPE_READ; localparam CMD_TYPE_WRITE = `AFU_IMAGE_CMD_TYPE_WRITE; localparam CMD_TYPE_RUN = `AFU_IMAGE_CMD_TYPE_RUN; -localparam CMD_TYPE_SNOOP = `AFU_IMAGE_CMD_TYPE_SNOOP; +localparam CMD_TYPE_CLFLUSH = `AFU_IMAGE_CMD_TYPE_CLFLUSH; localparam MMIO_CSR_CMD = `AFU_IMAGE_MMIO_CSR_CMD; localparam MMIO_CSR_STATUS = `AFU_IMAGE_MMIO_CSR_STATUS; @@ -52,13 +54,12 @@ localparam MMIO_CSR_DATA_SIZE = `AFU_IMAGE_MMIO_CSR_DATA_SIZE; logic [127:0] afu_id = `AFU_ACCEL_UUID; -typedef enum logic[2:0] { +typedef enum logic[3:0] { STATE_IDLE, STATE_READ, STATE_WRITE, STATE_RUN, - STATE_SNOOP1, - STATE_SNOOP2 + STATE_CLFLUSH } state_t; state_t state; @@ -192,7 +193,7 @@ logic [31:0] cci_write_ctr; logic [31:0] avs_read_ctr; logic [31:0] avs_write_ctr; logic [31:0] vx_snoop_ctr; -logic [31:0] vx_snoop_delay; +logic [9:0] vx_snoop_delay; logic vx_reset; always_ff @(posedge clk) @@ -210,21 +211,21 @@ begin STATE_IDLE: begin case (csr_cmd) CMD_TYPE_READ: begin - $display("%t: CMD READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_READ; end CMD_TYPE_WRITE: begin - $display("%t: CMD WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_WRITE; end CMD_TYPE_RUN: begin - $display("%t: CMD START", $time); + $display("%t: STATE START", $time); vx_reset <= 1; state <= STATE_RUN; end - CMD_TYPE_SNOOP: begin - $display("%t: CMD SNOOP: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); - state <= STATE_SNOOP1; + CMD_TYPE_CLFLUSH: begin + $display("%t: STATE CFLUSH: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); + state <= STATE_CLFLUSH; end endcase end @@ -250,15 +251,8 @@ begin end end - STATE_SNOOP1: begin - if (vx_snoop_delay >= VX_SNOOPING_DELAY) - 
begin - state <= STATE_SNOOP2; - end - end - - STATE_SNOOP2: begin - if (vx_snoop_delay >= VX_SNOOPING_DELAY) + STATE_CLFLUSH: begin + if (vx_snoop_delay >= VX_SNOOP_DELAY) begin state <= STATE_IDLE; end @@ -320,7 +314,7 @@ begin end end - STATE_RUN: begin + STATE_RUN, STATE_CLFLUSH: begin if (vx_dram_req_read && !vx_dram_req_delay) begin @@ -348,15 +342,20 @@ begin end // Vortex DRAM requests stalling -assign vx_dram_req_delay = !((STATE_RUN == state) - && !avs_waitrequest - && !avs_raq_full - && !avs_rdq_full); -// Vortex DRAM fill response +logic vortex_enabled; + always_comb begin - vx_dram_fill_rsp = (STATE_RUN == state) && !avs_rdq_empty && vx_dram_fill_accept; + vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state); + vx_dram_req_delay = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full; +end + +// Vortex DRAM fill response + +always_comb +begin + vx_dram_fill_rsp = vortex_enabled && !avs_rdq_empty && vx_dram_fill_accept; vx_dram_fill_rsp_addr = (avs_raq_dout << 6); {>>{vx_dram_fill_rsp_data}} = avs_rdq_dout; end @@ -524,32 +523,25 @@ begin else begin if (STATE_IDLE == state) begin - vx_snoop_ctr <= 0; + vx_snoop_ctr <= 0; vx_snoop_delay <= 0; end vx_snp_req <= 0; - if ((STATE_SNOOP1 == state - || STATE_SNOOP2 == state) + if ((STATE_CLFLUSH == state) && vx_snoop_ctr < csr_data_size - && vx_snp_req_delay) + && !vx_snp_req_delay) begin - vx_snp_req <= 1; - vx_snoop_ctr <= vx_snoop_ctr + 1; + vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6; + vx_snp_req <= 1; + vx_snoop_ctr <= vx_snoop_ctr + 1; end - if ((vx_snoop_ctr >= csr_data_size) - && (vx_snoop_delay < VX_SNOOPING_DELAY)) + if (vx_snoop_ctr == csr_data_size) begin vx_snoop_delay <= vx_snoop_delay + 1; end - - if (vx_snoop_delay >= VX_SNOOPING_DELAY) - begin - vx_snoop_ctr <= 0; - vx_snoop_delay <= 0; - end end end diff --git a/driver/hw/wave.do b/driver/hw/wave.do index 58548f72..aab97878 100644 --- a/driver/hw/wave.do +++ b/driver/hw/wave.do @@ -27,12 +27,17 @@ add wave 
-noupdate -label avs_raq_full /ase_top/ase_top_generic/platform_shim_cc add wave -noupdate -label avs_rdq_full /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_full add wave -noupdate -label avs_raq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_empty add wave -noupdate -label avs_rdq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_empty +add wave -noupdate -label vortex_enabled /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vortex_enabled +add wave -noupdate -label vx_reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset +add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read add wave -noupdate -label vx_dram_req_write /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_write add wave -noupdate -label vx_dram_req_delay /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_delay -add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read -add wave -noupdate -label vx_reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset +add wave -noupdate -label vx_dram_req_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_addr +add wave -noupdate -label vx_draw_req_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_data add wave -noupdate -label out_dram_fill_rsp /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_rsp add wave -noupdate -label out_dram_fill_accept 
/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_accept +add wave -noupdate -label vx_draw_fill_rsp_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_data +add wave -noupdate -label vx_dram_fill_rsp_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_addr add wave -noupdate -label llc_snp_req /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req add wave -noupdate -label llc_snp_req_delay /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req_delay add wave -noupdate -label out_break /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_ebreak @@ -45,7 +50,7 @@ add wave -noupdate -label warp_stalled {/ase_top/ase_top_generic/platform_shim_c add wave -noupdate -label warp_lock {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/warp_lock} add wave -noupdate -label use_active {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/use_active} TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {66234495 ps} 0} +WaveRestoreCursors {{Cursor 2} {360293 ps} 0} quietly wave cursor active 1 configure wave -namecolwidth 195 configure wave -valuecolwidth 100 @@ -61,4 +66,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ps update -WaveRestoreZoom {66041656 ps} {66406344 ps} +WaveRestoreZoom {346453 ps} {711141 ps} diff --git a/driver/sw/opae/Makefile b/driver/sw/opae/Makefile index 08397a99..e67ea0ac 100644 --- a/driver/sw/opae/Makefile +++ b/driver/sw/opae/Makefile @@ -65,4 
+65,4 @@ clean: ifneq ($(MAKECMDGOALS),clean) -include .depend -endif \ No newline at end of file +endif diff --git a/driver/sw/opae/vortex.cpp b/driver/sw/opae/vortex.cpp index 11d29ee3..62a80ccd 100755 --- a/driver/sw/opae/vortex.cpp +++ b/driver/sw/opae/vortex.cpp @@ -22,8 +22,8 @@ #define CMD_TYPE_READ AFU_IMAGE_CMD_TYPE_READ #define CMD_TYPE_WRITE AFU_IMAGE_CMD_TYPE_WRITE -#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN -#define CMD_TYPE_SNOOP AFU_IMAGE_CMD_TYPE_SNOOP +#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN +#define CMD_TYPE_CLFLUSH AFU_IMAGE_CMD_TYPE_CLFLUSH #define MMIO_CSR_CMD (AFU_IMAGE_MMIO_CSR_CMD * 4) #define MMIO_CSR_STATUS (AFU_IMAGE_MMIO_CSR_STATUS * 4) @@ -313,7 +313,7 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_SNOOP)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH)); // Wait for the write operation to finish if (vx_ready_wait(hdevice, -1) != 0) diff --git a/driver/tests/basic/Makefile b/driver/tests/basic/Makefile index d04e756f..786b0491 100644 --- a/driver/tests/basic/Makefile +++ b/driver/tests/basic/Makefile @@ -1,3 +1,21 @@ +RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops) +VX_RT_PATH ?= $(wildcard ../../../runtime) + +VX_CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc +VX_CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ +VX_DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump +VX_CPY = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy + +VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c +VX_STR = $(VX_RT_PATH)/startup/vx_start.S +VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s +VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c +VX_API = $(VX_RT_PATH)/vx_api/vx_api.c +VX_FIO = $(VX_RT_PATH)/fileio/fileio.s + +VX_CFLAGS = -march=rv32im 
-mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections + +VX_SRCS = kernel.c CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors @@ -11,6 +29,18 @@ SRCS = basic.cpp all: $(PROJECT) +kernel.dump: kernel.elf + $(VX_DMP) -D kernel.elf > kernel.dump + +kernel.hex: kernel.elf + $(VX_CPY) -O ihex kernel.elf kernel.hex + +kernel.bin: kernel.elf + $(VX_CPY) -O binary kernel.elf kernel.bin + +kernel.elf: $(SRCS) + $(VX_CC) $(VX_CFLAGS) $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_SRCS) -I$(VX_RT_PATH) -o kernel.elf + $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../sw/dummy -lvortex -o $@ @@ -18,7 +48,7 @@ run-fpga: $(PROJECT) LD_LIBRARY_PATH=../../sw/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) - ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -t 1 + ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-rtlsim: $(PROJECT) LD_LIBRARY_PATH=../../sw/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/driver/tests/basic/basic b/driver/tests/basic/basic index 1ec3ae80..12020e39 100755 Binary files a/driver/tests/basic/basic and b/driver/tests/basic/basic differ diff --git a/driver/tests/basic/basic.cpp b/driver/tests/basic/basic.cpp index 9e13183b..684d655d 100755 --- a/driver/tests/basic/basic.cpp +++ b/driver/tests/basic/basic.cpp @@ -27,11 +27,11 @@ uint64_t shuffle(int i, uint64_t value) { return (value << i) | (value & ((1 << i)-1));; } -int run_test_0(vx_buffer_h sbuf, - vx_buffer_h dbuf, - uint32_t address, - uint64_t value, - int num_blocks) { +int run_memcopy_test(vx_buffer_h sbuf, + vx_buffer_h dbuf, + uint32_t address, + uint64_t value, + int num_blocks) { int ret; int errors = 0; @@ -73,8 +73,29 @@ int run_test_0(vx_buffer_h sbuf, return 0; } -int run_test_1(vx_device_h device, const char* program) { +int run_kernel_test(vx_device_h device, + vx_buffer_h sbuf, + vx_buffer_h dbuf, + const char* 
program) { int ret; + int errors = 0; + + uint64_t seed = 0x0badf00d40ff40ff; + int num_blocks = 4; + + unsigned src_dev_addr = 0x10000000; + unsigned dest_dev_addr = 0x20000000; + + // write sbuf data + for (int i = 0; i < 8 * num_blocks; ++i) { + ((uint64_t*)vx_host_ptr(sbuf))[i] = shuffle(i, seed); + } + + // write buffer to local memory + std::cout << "write buffer to local memory" << std::endl; + ret = vx_copy_to_dev(sbuf, src_dev_addr, 64 * num_blocks, 0); + if (ret != 0) + return ret; // upload program std::cout << "upload program" << std::endl; @@ -97,6 +118,37 @@ int run_test_1(vx_device_h device, const char* program) { return ret; } + // flush the caches + std::cout << "flush the caches" << std::endl; + ret = vx_flush_caches(device, dest_dev_addr, 64 * num_blocks); + if (ret != 0) { + return ret; + } + + // read buffer from local memory + std::cout << "read buffer from local memory" << std::endl; + ret = vx_copy_from_dev(dbuf, dest_dev_addr, 64 * num_blocks, 0); + if (ret != 0) + return ret; + + // verify result + std::cout << "verify result" << std::endl; + for (int i = 0; i < 8 * num_blocks; ++i) { + auto curr = ((uint64_t*)vx_host_ptr(dbuf))[i]; + auto ref = shuffle(i, seed); + if (curr != ref) { + std::cout << "error @ " << std::hex << (dest_dev_addr + 64 * i) + << ": actual " << curr << ", expected " << ref << std::endl; + ++errors; + } + } + + if (errors != 0) { + std::cout << "Found " << errors << " errors!" << std::endl; + std::cout << "FAILED!" 
<< std::endl; + return 1; + } + return 0; } @@ -147,27 +199,15 @@ int main(int argc, char *argv[]) { // run tests if (0 == test || -1 == test) { - std::cout << "run test suite 0" << std::endl; + std::cout << "run memcopy test" << std::endl; - ret = run_test_0(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1); + ret = run_memcopy_test(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1); if (ret != 0) { cleanup(); return ret; } - ret = run_test_0(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 2); - if (ret != 0) { - cleanup(); - return ret; - } - - ret = run_test_0(sbuf, dbuf, 0x20000000, 0xff00ff00ff00ff00, 4); - if (ret != 0) { - cleanup(); - return ret; - } - - ret = run_test_0(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8); + ret = run_memcopy_test(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8); if (ret != 0) { cleanup(); return ret; @@ -175,17 +215,8 @@ int main(int argc, char *argv[]) { } if (1 == test || -1 == test) { - std::cout << "run test suite 1" << std::endl; - ret = run_test_1(device, "rv32ui-p-lw.bin"); - if (ret != 0) { - cleanup(); - return ret; - } - } - - if (2 == test || -1 == test) { - std::cout << "run test suite 1" << std::endl; - ret = run_test_1(device, "rv32ui-p-sw.bin"); + std::cout << "run kernel test" << std::endl; + ret = run_kernel_test(device, sbuf, dbuf, "kernel.bin"); if (ret != 0) { cleanup(); return ret; diff --git a/driver/tests/basic/rv32ui-p-sw.bin b/driver/tests/basic/kernel.bin old mode 100644 new mode 100755 similarity index 60% rename from driver/tests/basic/rv32ui-p-sw.bin rename to driver/tests/basic/kernel.bin index 4a3db3b8..55e20d29 Binary files a/driver/tests/basic/rv32ui-p-sw.bin and b/driver/tests/basic/kernel.bin differ diff --git a/driver/tests/basic/kernel.c b/driver/tests/basic/kernel.c new file mode 100644 index 00000000..c79d1056 --- /dev/null +++ b/driver/tests/basic/kernel.c @@ -0,0 +1,9 @@ +#include <stdint.h> + +void main() { + int64_t* x = (int64_t*)0x10000000; + int64_t* y = (int64_t*)0x20000000; + for (int i = 0; i < 8 * 4; 
++i) { + y[i] = x[i]; + } +} \ No newline at end of file diff --git a/driver/tests/basic/rv32ui-p-lw.bin b/driver/tests/basic/rv32ui-p-lw.bin deleted file mode 100644 index 51c28eaa..00000000 Binary files a/driver/tests/basic/rv32ui-p-lw.bin and /dev/null differ diff --git a/driver/tests/demo/demo b/driver/tests/demo/demo index 22c3b0a9..7227e103 100755 Binary files a/driver/tests/demo/demo and b/driver/tests/demo/demo differ diff --git a/driver/tests/demo/demo.cpp b/driver/tests/demo/demo.cpp index 3e84bbd0..da2f22b7 100644 --- a/driver/tests/demo/demo.cpp +++ b/driver/tests/demo/demo.cpp @@ -80,7 +80,7 @@ int run_test(vx_device_h device, int errors = 0; auto buf_ptr = (int*)vx_host_ptr(buffer); for (uint32_t i = 0; i < num_points; ++i) { - int ref = i * i; + int ref = i + i; int cur = buf_ptr[i]; if (cur != ref) { ++errors; diff --git a/driver/tests/demo/kernel.bin b/driver/tests/demo/kernel.bin index cbc44de8..c13e42ef 100755 Binary files a/driver/tests/demo/kernel.bin and b/driver/tests/demo/kernel.bin differ diff --git a/driver/tests/demo/kernel.c b/driver/tests/demo/kernel.c index 130d6872..226fb673 100644 --- a/driver/tests/demo/kernel.c +++ b/driver/tests/demo/kernel.c @@ -16,7 +16,7 @@ void kernel_body(void* arg) { unsigned i = ((wNo * _arg->num_threads) + tid) * _arg->stride; for (unsigned j = 0; j < _arg->stride; ++j) { - z[i+j] = x[i+j] * y[i+j]; + z[i+j] = x[i+j] + y[i+j]; } } diff --git a/driver/tests/demo/kernel.elf b/driver/tests/demo/kernel.elf new file mode 100755 index 00000000..82732628 Binary files /dev/null and b/driver/tests/demo/kernel.elf differ diff --git a/rtl/VX_alu.v b/rtl/VX_alu.v index 07b090e6..e4346f75 100644 --- a/rtl/VX_alu.v +++ b/rtl/VX_alu.v @@ -135,7 +135,7 @@ module VX_alu( assign upper_immed = {in_upper_immed, {12{1'b0}}}; - always @(in_alu_op or ALU_in1 or ALU_in2) begin + always @(*) begin case(in_alu_op) `ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2); `SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2); 
@@ -177,7 +177,7 @@ module VX_alu( assign upper_immed = {in_upper_immed, {12{1'b0}}}; - always @(in_alu_op or ALU_in1 or ALU_in2) begin + always @(*) begin case(in_alu_op) `ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2); `SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2); diff --git a/rtl/VX_cache/VX_bank.v b/rtl/VX_cache/VX_bank.v index eb98adce..6b19c99c 100644 --- a/rtl/VX_cache/VX_bank.v +++ b/rtl/VX_cache/VX_bank.v @@ -106,6 +106,15 @@ module VX_bank ); + reg snoop_state = 0; + + always @(posedge clk) begin + if (reset) begin + snoop_state <= 0; + end else begin + snoop_state <= (snoop_state | snp_req) && ((FUNC_ID == `LLFUNC_ID) || (FUNC_ID == `L3FUNC_ID)); + end + end wire snrq_pop; @@ -498,13 +507,17 @@ module VX_bank .out ({is_snp_st2 , fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , pc_st2 , inst_meta_st2 }) ); + + wire should_flush; + wire dwbq_push; + wire cwbq_full; wire dwbq_full; wire ffsq_full; wire invalidate_fill; // Enqueue to miss reserv if it's a valid miss - assign miss_add = valid_st2 && !is_snp_st2 && miss_st2 && !mrvq_full && !((is_snp_st2 && valid_st2 && ffsq_full) ||((valid_st2 && !miss_st2) && cwbq_full) || (((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && dwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full)); + assign miss_add = valid_st2 && !is_snp_st2 && miss_st2 && !mrvq_full && !(should_flush && dwbq_push) && !((is_snp_st2 && valid_st2 && ffsq_full) ||((valid_st2 && !miss_st2) && cwbq_full) || (((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && dwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full)); assign miss_add_pc = pc_st2; assign miss_add_addr = addr_st2; assign miss_add_data = writeword_st2; @@ -535,12 +548,23 @@ module VX_bank .full (cwbq_full) 
); + assign should_flush = snoop_state && valid_st2 && (miss_add_mem_write != `NO_MEM_WRITE) && !is_snp_st2 && !is_fill_st2; // Enqueue to DWB Queue - wire dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && !dwbq_full && !((is_snp_st2 && valid_st2 && ffsq_full) ||((valid_st2 && !miss_st2) && cwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full)); - wire[31:0] dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]} & `BASE_ADDR_MASK; - wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] dwbq_req_data = readdata_st2; + assign dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2 || should_flush) && !dwbq_full && !((is_snp_st2 && valid_st2 && ffsq_full) ||((valid_st2 && !miss_st2) && cwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full)); + wire[31:0] dwbq_req_addr; wire dwbq_empty; - + + wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] dwbq_req_data; + if ((FUNC_ID == `LLFUNC_ID) || (FUNC_ID == `L3FUNC_ID)) begin + assign dwbq_req_data = (should_flush && dwbq_push) ? writeword_st2 : readdata_st2; + assign dwbq_req_addr = (should_flush && dwbq_push) ? 
(addr_st2) : ({readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]} & `BASE_ADDR_MASK); + end else begin + assign dwbq_req_data = readdata_st2; + assign dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]} & `BASE_ADDR_MASK; + end + + + wire possible_fill = valid_st2 && miss_st2 && !dram_fill_req_queue_full && !is_snp_st2; wire[31:0] fill_invalidator_addr = addr_st2 & `BASE_ADDR_MASK; VX_fill_invalidator #( diff --git a/rtl/VX_cache/VX_cache_dram_req_arb.v b/rtl/VX_cache/VX_cache_dram_req_arb.v index 0db1f560..37264833 100644 --- a/rtl/VX_cache/VX_cache_dram_req_arb.v +++ b/rtl/VX_cache/VX_cache_dram_req_arb.v @@ -82,6 +82,9 @@ module VX_cache_dram_req_arb wire pref_pop; wire pref_valid; wire[31:0] pref_addr; + + wire dwb_valid; + wire dfqq_req; assign pref_pop = !dwb_valid && !dfqq_req && !dram_req_delay && pref_valid; VX_prefetcher #( @@ -105,10 +108,8 @@ module VX_cache_dram_req_arb ); - wire dfqq_req; wire[31:0] dfqq_req_addr; - wire dfqq_empty; - wire dwb_valid; + wire dfqq_empty; wire dfqq_pop = !dwb_valid && dfqq_req && !dram_req_delay; // If no dwb, and dfqq has valids, then pop wire dfqq_push = (|per_bank_dram_fill_req); @@ -139,8 +140,8 @@ module VX_cache_dram_req_arb assign dram_req = dwb_valid || dfqq_req || pref_pop; - assign dram_req_write = dwb_valid; - assign dram_req_read = (dfqq_req && !dwb_valid) || pref_pop; + assign dram_req_write = dwb_valid && dram_req; + assign dram_req_read = ((dfqq_req && !dwb_valid) || pref_pop) && dram_req; assign dram_req_addr = (dwb_valid ? per_bank_dram_wb_req_addr[dwb_bank] : (dfqq_req ? dfqq_req_addr : pref_addr)) & `BASE_ADDR_MASK; assign dram_req_size = BANK_LINE_SIZE_BYTES; assign {dram_req_data} = dwb_valid ? 
{per_bank_dram_wb_req_data[dwb_bank] }: 0; diff --git a/rtl/VX_cache/VX_cache_wb_sel_merge.v b/rtl/VX_cache/VX_cache_wb_sel_merge.v index 148b443b..12cd04c7 100644 --- a/rtl/VX_cache/VX_cache_wb_sel_merge.v +++ b/rtl/VX_cache/VX_cache_wb_sel_merge.v @@ -105,15 +105,34 @@ module VX_cache_wb_sel_merge core_wb_pc = 0; core_wb_address = 0; for (this_bank = 0; this_bank < NUMBER_BANKS; this_bank = this_bank + 1) begin - if (((FUNC_ID == `LLFUNC_ID) && found_bank && per_bank_wb_valid[this_bank] && ((this_bank == main_bank_index) || (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index]))) || ((FUNC_ID != `LLFUNC_ID) && ((this_bank == main_bank_index) || (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index])) && found_bank && (per_bank_wb_valid[this_bank]) && (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index]) && (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index]))) begin - core_wb_valid[per_bank_wb_tid[this_bank]] = 1; - core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank]; - core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank]; - core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank]; - per_bank_wb_pop_unqual[this_bank] = 1; + if ((FUNC_ID == `LLFUNC_ID) || (FUNC_ID == `L3FUNC_ID)) begin + + if (found_bank && !core_wb_valid[per_bank_wb_tid[this_bank]] && per_bank_wb_valid[this_bank] && ((this_bank == main_bank_index) || (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index]))) begin + core_wb_valid[per_bank_wb_tid[this_bank]] = 1; + core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank]; + core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank]; + core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank]; + per_bank_wb_pop_unqual[this_bank] = 1; + end else begin + per_bank_wb_pop_unqual[this_bank] = 0; + end + end else begin - per_bank_wb_pop_unqual[this_bank] = 0; + + + if (((this_bank == 
main_bank_index) || (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index])) && found_bank && !core_wb_valid[per_bank_wb_tid[this_bank]] && (per_bank_wb_valid[this_bank]) && (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index]) && (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index])) begin + core_wb_valid[per_bank_wb_tid[this_bank]] = 1; + core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank]; + core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank]; + core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank]; + per_bank_wb_pop_unqual[this_bank] = 1; + end else begin + per_bank_wb_pop_unqual[this_bank] = 0; + + end + end + end end endgenerate diff --git a/rtl/VX_cache/VX_tag_data_access.v b/rtl/VX_cache/VX_tag_data_access.v index 39c54ba1..f66cfb85 100644 --- a/rtl/VX_cache/VX_tag_data_access.v +++ b/rtl/VX_cache/VX_tag_data_access.v @@ -290,6 +290,6 @@ module VX_tag_data_access assign readtag_st1e = use_read_tag_st1e; assign fill_sent = miss_st1e; assign fill_saw_dirty_st1e = real_writefill && dirty_st1e; - assign invalidate_line = is_snp_st1e && miss_st1e; + assign invalidate_line = snoop_hit; endmodule \ No newline at end of file diff --git a/rtl/VX_cache/VX_tag_data_structure.v b/rtl/VX_cache/VX_tag_data_structure.v index 7ceffcc0..1cf9b7a9 100644 --- a/rtl/VX_cache/VX_tag_data_structure.v +++ b/rtl/VX_cache/VX_tag_data_structure.v @@ -88,9 +88,9 @@ module VX_tag_data_structure if (reset) begin for (l = 0; l < `BANK_LINE_COUNT; l=l+1) begin valid[l] <= 0; - tag [l] <= 0; + // tag [l] <= 0; dirty[l] <= 0; - data [l] <= 0; + // data [l] <= 0; end end else if (!stall_bank_pipe) begin if (going_to_write) begin diff --git a/rtl/VX_define.v b/rtl/VX_define.v index 870a5414..df54f8b0 100644 --- a/rtl/VX_define.v +++ b/rtl/VX_define.v @@ -253,6 +253,15 @@ `define DFFSQ_SIZE 32 `endif +// Prefetcher +`ifndef DPRFQ_SIZE +`define DPRFQ_SIZE 32 +`endif + +`ifndef DPRFQ_STRIDE 
+`define DPRFQ_STRIDE 0 +`endif + // Fill Invalidator Size {Fill invalidator must be active} `ifndef DFILL_INVALIDAOR_SIZE `define DFILL_INVALIDAOR_SIZE 32 @@ -361,6 +370,15 @@ `define IFFSQ_SIZE 8 `endif +// Prefetcher +`ifndef IPRFQ_SIZE +`define IPRFQ_SIZE 32 +`endif + +`ifndef IPRFQ_STRIDE +`define IPRFQ_STRIDE 0 +`endif + // Fill Invalidator Size {Fill invalidator must be active} `ifndef IFILL_INVALIDAOR_SIZE `define IFILL_INVALIDAOR_SIZE 32 @@ -467,6 +485,15 @@ `define SFFSQ_SIZE 16 `endif +// Prefetcher +`ifndef SPRFQ_SIZE +`define SPRFQ_SIZE 4 +`endif + +`ifndef SPRFQ_STRIDE +`define SPRFQ_STRIDE 0 +`endif + // Fill Invalidator Size {Fill invalidator must be active} `ifndef SFILL_INVALIDAOR_SIZE `define SFILL_INVALIDAOR_SIZE 32 @@ -572,6 +599,15 @@ `define LLFFSQ_SIZE 32 `endif +// Prefetcher +`ifndef LLPRFQ_SIZE +`define LLPRFQ_SIZE 32 +`endif + +`ifndef LLPRFQ_STRIDE +`define LLPRFQ_STRIDE 0 +`endif + // Fill Invalidator Size {Fill invalidator must be active} `ifndef LLFILL_INVALIDAOR_SIZE `define LLFILL_INVALIDAOR_SIZE 32 @@ -677,6 +713,15 @@ `define L3FFSQ_SIZE 8 `endif +// Prefetcher +`ifndef L3PRFQ_SIZE +`define L3PRFQ_SIZE 32 +`endif + +`ifndef L3PRFQ_STRIDE +`define L3PRFQ_STRIDE 0 +`endif + // Fill Invalidator Size {Fill invalidator must be active} `ifndef L3FILL_INVALIDAOR_SIZE `define L3FILL_INVALIDAOR_SIZE 32 diff --git a/rtl/VX_dmem_controller.v b/rtl/VX_dmem_controller.v index c8f7b761..596da8cd 100644 --- a/rtl/VX_dmem_controller.v +++ b/rtl/VX_dmem_controller.v @@ -95,6 +95,8 @@ module VX_dmem_controller ( .DFQQ_SIZE (`SDFQQ_SIZE), .LLVQ_SIZE (`SLLVQ_SIZE), .FFSQ_SIZE (`SFFSQ_SIZE), + .PRFQ_SIZE (`SPRFQ_SIZE), + .PRFQ_STRIDE (`SPRFQ_STRIDE), .FILL_INVALIDAOR_SIZE (`SFILL_INVALIDAOR_SIZE), .SIMULATED_DRAM_LATENCY_CYCLES(`SSIMULATED_DRAM_LATENCY_CYCLES) ) @@ -153,6 +155,7 @@ module VX_dmem_controller ( // Snoop Request .snp_req (0), .snp_req_addr (0), + .snp_req_delay (), // Snoop Forward .snp_fwd (), @@ -177,6 +180,8 @@ module 
VX_dmem_controller ( .DFQQ_SIZE (`DDFQQ_SIZE), .LLVQ_SIZE (`DLLVQ_SIZE), .FFSQ_SIZE (`DFFSQ_SIZE), + .PRFQ_SIZE (`DPRFQ_SIZE), + .PRFQ_STRIDE (`DPRFQ_STRIDE), .FILL_INVALIDAOR_SIZE (`DFILL_INVALIDAOR_SIZE), .SIMULATED_DRAM_LATENCY_CYCLES(`DSIMULATED_DRAM_LATENCY_CYCLES) ) @@ -263,6 +268,8 @@ module VX_dmem_controller ( .DFQQ_SIZE (`IDFQQ_SIZE), .LLVQ_SIZE (`ILLVQ_SIZE), .FFSQ_SIZE (`IFFSQ_SIZE), + .PRFQ_SIZE (`IPRFQ_SIZE), + .PRFQ_STRIDE (`IPRFQ_STRIDE), .FILL_INVALIDAOR_SIZE (`IFILL_INVALIDAOR_SIZE), .SIMULATED_DRAM_LATENCY_CYCLES(`ISIMULATED_DRAM_LATENCY_CYCLES) ) diff --git a/rtl/VX_generic_queue_ll.v b/rtl/VX_generic_queue_ll.v index 75215486..5349f649 100644 --- a/rtl/VX_generic_queue_ll.v +++ b/rtl/VX_generic_queue_ll.v @@ -6,45 +6,47 @@ module VX_generic_queue_ll parameter SIZE = 277 ) ( - input wire clk, - input wire reset, - input wire push, - input wire[DATAW-1:0] in_data, + input wire clk, + input wire reset, + input wire push, + input wire [DATAW-1:0] in_data, - input wire pop, - output wire[DATAW-1:0] out_data, - output wire empty, - output wire full -); + input wire pop, + output wire [DATAW-1:0] out_data, + output wire empty, + output wire full +); /* verilator lint_off WIDTH */ if (SIZE == 0) begin + assign empty = 1; assign out_data = 0; assign full = 0; - end else begin - `ifdef QUEUE_FORCE_MLAB + end else begin // (SIZE > 0) + + `ifdef QUEUE_FORCE_MLAB (* syn_ramstyle = "mlab" *) reg[DATAW-1:0] data[SIZE-1:0]; - `else - reg[DATAW-1:0] data[SIZE-1:0]; - `endif + `else + reg[ DATAW-1:0] data[SIZE-1:0]; + `endif - reg[DATAW-1:0] curr_r, head_r; - reg[$clog2(SIZE+1)-1:0] size_r; - reg[$clog2(SIZE)-1:0] wr_ctr_r; - reg[$clog2(SIZE)-1:0] rd_ptr_r, rd_next_ptr_r; - reg empty_r, full_r, bypass_r; - wire reading, writing; + reg [DATAW-1:0] head_r; + reg [$clog2(SIZE+1)-1:0] size_r; + wire reading; + wire writing; assign reading = pop && !empty; assign writing = push && !full; if (SIZE == 1) begin + always @(posedge clk) begin if (reset) begin - size_r <= 
0; + size_r <= 0; + head_r <= 0; end else begin if (writing && !reading) begin size_r <= 1; @@ -59,9 +61,19 @@ module VX_generic_queue_ll end assign out_data = head_r; - assign empty = (size_r == 0); - assign full = (size_r != 0) && !pop; - end else begin + assign empty = (size_r == 0); + assign full = (size_r != 0) && !pop; + + end else begin // (SIZE > 1) + + reg [DATAW-1:0] curr_r; + reg [$clog2(SIZE)-1:0] wr_ctr_r; + reg [$clog2(SIZE)-1:0] rd_ptr_r; + reg [$clog2(SIZE)-1:0] rd_next_ptr_r; + reg empty_r; + reg full_r; + reg bypass_r; + always @(posedge clk) begin if (reset) begin wr_ctr_r <= 0; @@ -99,9 +111,10 @@ module VX_generic_queue_ll always @(posedge clk) begin if (reset) begin - rd_ptr_r <= 0; + curr_r <= 0; + rd_ptr_r <= 0; rd_next_ptr_r <= 1; - bypass_r <= 0; + bypass_r <= 0; end else begin if (reading) begin if (SIZE == 2) begin @@ -123,7 +136,6 @@ module VX_generic_queue_ll assign empty = empty_r; assign full = full_r; end - end /* verilator lint_on WIDTH */ diff --git a/rtl/Vortex_Cluster.v b/rtl/Vortex_Cluster.v index 08aeb7ea..5b78b8cd 100644 --- a/rtl/Vortex_Cluster.v +++ b/rtl/Vortex_Cluster.v @@ -230,6 +230,8 @@ module Vortex_Cluster .DFQQ_SIZE (`LLDFQQ_SIZE), .LLVQ_SIZE (`LLLLVQ_SIZE), .FFSQ_SIZE (`LLFFSQ_SIZE), + .PRFQ_SIZE (`LLPRFQ_SIZE), + .PRFQ_STRIDE (`LLPRFQ_STRIDE), .FILL_INVALIDAOR_SIZE (`LLFILL_INVALIDAOR_SIZE), .SIMULATED_DRAM_LATENCY_CYCLES(`LLSIMULATED_DRAM_LATENCY_CYCLES) ) diff --git a/rtl/Vortex_SOC.v b/rtl/Vortex_SOC.v index 718828a0..7dc5e6de 100644 --- a/rtl/Vortex_SOC.v +++ b/rtl/Vortex_SOC.v @@ -234,6 +234,8 @@ module Vortex_SOC ( .DFQQ_SIZE (`L3DFQQ_SIZE), .LLVQ_SIZE (`L3LLVQ_SIZE), .FFSQ_SIZE (`L3FFSQ_SIZE), + .PRFQ_SIZE (`L3PRFQ_SIZE), + .PRFQ_STRIDE (`L3PRFQ_STRIDE), .FILL_INVALIDAOR_SIZE (`L3FILL_INVALIDAOR_SIZE), .SIMULATED_DRAM_LATENCY_CYCLES(`L3SIMULATED_DRAM_LATENCY_CYCLES) ) diff --git a/rtl/byte_enabled_simple_dual_port_ram.v b/rtl/byte_enabled_simple_dual_port_ram.v index 73b923f4..b4dcf5fc 100644 --- 
a/rtl/byte_enabled_simple_dual_port_ram.v +++ b/rtl/byte_enabled_simple_dual_port_ram.v @@ -25,9 +25,9 @@ module byte_enabled_simple_dual_port_ram always @(posedge clk, posedge reset) begin // TODO Clearing ram not currently supported on FPGA. if (reset) begin -`ifdef ASIC +// `ifdef ASIC for (ini = 0; ini < 32; ini = ini + 1) GPR[ini] <= 0; -`endif +// `endif end else if(we) begin integer thread_ind; diff --git a/rtl/simulate/simulator.cpp b/rtl/simulate/simulator.cpp index ba458e7e..4d7e4d8a 100644 --- a/rtl/simulate/simulator.cpp +++ b/rtl/simulate/simulator.cpp @@ -357,8 +357,8 @@ void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) { // #if NUMBER_CORES != 1 // send snoops for L2 flush - this->send_snoops(mem_addr, size); - this->wait(PIPELINE_FLUSH_LATENCY); + // this->send_snoops(mem_addr, size); + // this->wait(PIPELINE_FLUSH_LATENCY); // #endif }