diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so b/benchmarks/new_opencl/compiler/lib/libOpenCL.so deleted file mode 100644 index 70df4b33..00000000 Binary files a/benchmarks/new_opencl/compiler/lib/libOpenCL.so and /dev/null differ diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so b/benchmarks/new_opencl/compiler/lib/libOpenCL.so new file mode 120000 index 00000000..a2c38614 --- /dev/null +++ b/benchmarks/new_opencl/compiler/lib/libOpenCL.so @@ -0,0 +1 @@ +libOpenCL.so.2 \ No newline at end of file diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 deleted file mode 100644 index 70df4b33..00000000 Binary files a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 and /dev/null differ diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 new file mode 120000 index 00000000..e03f1782 --- /dev/null +++ b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2 @@ -0,0 +1 @@ +libOpenCL.so.2.5.0 \ No newline at end of file diff --git a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 index 70df4b33..10ff7140 100644 Binary files a/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 and b/benchmarks/new_opencl/compiler/lib/libOpenCL.so.2.5.0 differ diff --git a/benchmarks/new_opencl/runtime/lib/libOpenCL.so b/benchmarks/new_opencl/runtime/lib/libOpenCL.so index e6ea2f3b..8509aed5 100644 Binary files a/benchmarks/new_opencl/runtime/lib/libOpenCL.so and b/benchmarks/new_opencl/runtime/lib/libOpenCL.so differ diff --git a/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2 b/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2 index e6ea2f3b..8509aed5 100644 Binary files a/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2 and b/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2 differ diff --git a/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2.5.0 
b/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2.5.0 index e6ea2f3b..8509aed5 100644 Binary files a/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2.5.0 and b/benchmarks/new_opencl/runtime/lib/libOpenCL.so.2.5.0 differ diff --git a/benchmarks/new_opencl/vecadd/kernel.pocl b/benchmarks/new_opencl/vecadd/kernel.pocl index a010a9fc..2dab3524 100644 Binary files a/benchmarks/new_opencl/vecadd/kernel.pocl and b/benchmarks/new_opencl/vecadd/kernel.pocl differ diff --git a/driver/hw/Makefile b/driver/hw/Makefile index a5333bb0..7ac422aa 100644 --- a/driver/hw/Makefile +++ b/driver/hw/Makefile @@ -1,35 +1,36 @@ -BUILD_DIR=build_ase +ASE_BUILD_DIR=build_ase +FPGA_BUILD_DIR=build_fpga all: ase fpga ase: setup-ase - make -C $(BUILD_DIR) + make -C $(ASE_BUILD_DIR) fpga: setup-fpga - cd build_fpga && qsub-synth + cd $(FPGA_BUILD_DIR) && qsub-synth -setup-ase: build_ase/Makefile +setup-ase: $(ASE_BUILD_DIR)/Makefile -setup-fpga: build_fpga/build/dcp.qpf +setup-fpga: $(FPGA_BUILD_DIR)/build/dcp.qpf -build_ase/Makefile: - afu_sim_setup --s sources.txt build_ase +$(ASE_BUILD_DIR)/Makefile: + afu_sim_setup --s sources.txt $(ASE_BUILD_DIR) -build_fpga/build/dcp.qpf: - afu_synth_setup -s sources.txt build_fpga +$(FPGA_BUILD_DIR)/build/dcp.qpf: + afu_synth_setup -s sources.txt $(FPGA_BUILD_DIR) run-ase: - cd build_ase && make sim + cd $(ASE_BUILD_DIR) && make sim wave: - vsim -view build_ase/work/vsim.wlf -do wave.do + vsim -view $(ASE_BUILD_DIR)/work/vsim.wlf -do wave.do run-fpga: # TODO clean-ase: - rm -rf build_ase + rm -rf $(ASE_BUILD_DIR) clean-fpga: - rm -rf build_fpga \ No newline at end of file + rm -rf $(FPGA_BUILD_DIR) \ No newline at end of file diff --git a/driver/hw/sources.txt b/driver/hw/sources.txt index be07bef5..26201f2d 100644 --- a/driver/hw/sources.txt +++ b/driver/hw/sources.txt @@ -68,6 +68,7 @@ vortex_afu.json ../../rtl/VX_cache/VX_cache_miss_resrv.v ../../rtl/VX_cache/VX_fill_invalidator.v ../../rtl/VX_cache/VX_tag_data_structure.v 
+../../rtl/VX_cache/VX_prefetcher.v ../../rtl/cache/VX_generic_pe.v ../../rtl/cache/cache_set.v ../../rtl/cache/VX_d_cache.v diff --git a/driver/hw/vortex_afu.json b/driver/hw/vortex_afu.json index c8adc2e0..d42414fd 100644 --- a/driver/hw/vortex_afu.json +++ b/driver/hw/vortex_afu.json @@ -14,7 +14,7 @@ "cmd-type-read": 1, "cmd-type-write": 2, "cmd-type-run": 3, - "cmd-type-snoop": 4, + "cmd-type-clflush": 4, "afu-top-interface": { diff --git a/driver/hw/vortex_afu.sv b/driver/hw/vortex_afu.sv index c24dab2f..ec13c173 100644 --- a/driver/hw/vortex_afu.sv +++ b/driver/hw/vortex_afu.sv @@ -34,7 +34,9 @@ module vortex_afu #( ); localparam AVS_RD_QUEUE_SIZE = 16; -localparam VX_SNOOPING_DELAY = 300; + +localparam VX_SNOOP_DELAY = 300; +localparam VX_SNOOP_LEVELS = 2; localparam AFU_ID_L = 16'h0002; // AFU ID Lower localparam AFU_ID_H = 16'h0004; // AFU ID Higher @@ -42,7 +44,7 @@ localparam AFU_ID_H = 16'h0004; // AFU ID Higher localparam CMD_TYPE_READ = `AFU_IMAGE_CMD_TYPE_READ; localparam CMD_TYPE_WRITE = `AFU_IMAGE_CMD_TYPE_WRITE; localparam CMD_TYPE_RUN = `AFU_IMAGE_CMD_TYPE_RUN; -localparam CMD_TYPE_SNOOP = `AFU_IMAGE_CMD_TYPE_SNOOP; +localparam CMD_TYPE_CLFLUSH = `AFU_IMAGE_CMD_TYPE_CLFLUSH; localparam MMIO_CSR_CMD = `AFU_IMAGE_MMIO_CSR_CMD; localparam MMIO_CSR_STATUS = `AFU_IMAGE_MMIO_CSR_STATUS; @@ -52,13 +54,12 @@ localparam MMIO_CSR_DATA_SIZE = `AFU_IMAGE_MMIO_CSR_DATA_SIZE; logic [127:0] afu_id = `AFU_ACCEL_UUID; -typedef enum logic[2:0] { +typedef enum logic[3:0] { STATE_IDLE, STATE_READ, STATE_WRITE, STATE_RUN, - STATE_SNOOP1, - STATE_SNOOP2 + STATE_CLFLUSH } state_t; state_t state; @@ -192,7 +193,8 @@ logic [31:0] cci_write_ctr; logic [31:0] avs_read_ctr; logic [31:0] avs_write_ctr; logic [31:0] vx_snoop_ctr; -logic [31:0] vx_snoop_delay; +logic [9:0] vx_snoop_delay; +logic [1:0] vx_snoop_level; logic vx_reset; always_ff @(posedge clk) @@ -210,21 +212,21 @@ begin STATE_IDLE: begin case (csr_cmd) CMD_TYPE_READ: begin - $display("%t: CMD READ: ia=%h da=%h 
sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_READ; end CMD_TYPE_WRITE: begin - $display("%t: CMD WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_WRITE; end CMD_TYPE_RUN: begin - $display("%t: CMD START", $time); + $display("%t: STATE START", $time); vx_reset <= 1; state <= STATE_RUN; end - CMD_TYPE_SNOOP: begin - $display("%t: CMD SNOOP: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); - state <= STATE_SNOOP1; + CMD_TYPE_CLFLUSH: begin + $display("%t: STATE CFLUSH: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); + state <= STATE_CLFLUSH; end endcase end @@ -246,19 +248,13 @@ begin STATE_RUN: begin if (vx_ebreak) begin + // TODO: Add delay stage before returning to IDLE state <= STATE_IDLE; end end - STATE_SNOOP1: begin - if (vx_snoop_delay >= VX_SNOOPING_DELAY) - begin - state <= STATE_SNOOP2; - end - end - - STATE_SNOOP2: begin - if (vx_snoop_delay >= VX_SNOOPING_DELAY) + STATE_CLFLUSH: begin + if (vx_snoop_level >= VX_SNOOP_LEVELS) begin state <= STATE_IDLE; end @@ -320,7 +316,7 @@ begin end end - STATE_RUN: begin + STATE_RUN, STATE_CLFLUSH: begin if (vx_dram_req_read && !vx_dram_req_delay) begin @@ -348,15 +344,20 @@ begin end // Vortex DRAM requests stalling -assign vx_dram_req_delay = !((STATE_RUN == state) - && !avs_waitrequest - && !avs_raq_full - && !avs_rdq_full); -// Vortex DRAM fill response +logic vortex_enabled; + always_comb begin - vx_dram_fill_rsp = (STATE_RUN == state) && !avs_rdq_empty && vx_dram_fill_accept; + vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state); + vx_dram_req_delay = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full; +end + +// Vortex DRAM fill response + +always_comb +begin + vx_dram_fill_rsp = vortex_enabled && 
!avs_rdq_empty && vx_dram_fill_accept; vx_dram_fill_rsp_addr = (avs_raq_dout << 6); {>>{vx_dram_fill_rsp_data}} = avs_rdq_dout; end @@ -520,35 +521,39 @@ begin vx_snp_req <= 0; vx_snoop_ctr <= 0; vx_snoop_delay <= 0; + vx_snoop_level <= 0; end else begin if (STATE_IDLE == state) begin - vx_snoop_ctr <= 0; + vx_snoop_ctr <= 0; vx_snoop_delay <= 0; + vx_snoop_level <= 0; end vx_snp_req <= 0; - if ((STATE_SNOOP1 == state - || STATE_SNOOP2 == state) + if ((STATE_CLFLUSH == state) && vx_snoop_ctr < csr_data_size - && vx_snp_req_delay) + && vx_snoop_level < VX_SNOOP_LEVELS + && !vx_snp_req_delay) begin - vx_snp_req <= 1; - vx_snoop_ctr <= vx_snoop_ctr + 1; + vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6; + vx_snp_req <= 1; + vx_snoop_ctr <= vx_snoop_ctr + 1; end - if ((vx_snoop_ctr >= csr_data_size) - && (vx_snoop_delay < VX_SNOOPING_DELAY)) + if ((vx_snoop_ctr == csr_data_size) + && (vx_snoop_delay < VX_SNOOP_DELAY)) begin vx_snoop_delay <= vx_snoop_delay + 1; end - if (vx_snoop_delay >= VX_SNOOPING_DELAY) + if (vx_snoop_delay == VX_SNOOP_DELAY) begin - vx_snoop_ctr <= 0; + vx_snoop_ctr <= 0; vx_snoop_delay <= 0; + vx_snoop_level <= vx_snoop_level + 1; end end end diff --git a/driver/hw/wave.do b/driver/hw/wave.do index 58548f72..3c39919a 100644 --- a/driver/hw/wave.do +++ b/driver/hw/wave.do @@ -27,12 +27,17 @@ add wave -noupdate -label avs_raq_full /ase_top/ase_top_generic/platform_shim_cc add wave -noupdate -label avs_rdq_full /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_full add wave -noupdate -label avs_raq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_empty add wave -noupdate -label avs_rdq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_empty +add wave -noupdate -label vortex_enabled /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vortex_enabled +add wave -noupdate -label vx_reset 
/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset +add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read add wave -noupdate -label vx_dram_req_write /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_write add wave -noupdate -label vx_dram_req_delay /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_delay -add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read -add wave -noupdate -label vx_reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset +add wave -noupdate -label vx_dram_req_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_addr +add wave -noupdate -label vx_draw_req_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_data add wave -noupdate -label out_dram_fill_rsp /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_rsp add wave -noupdate -label out_dram_fill_accept /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_accept +add wave -noupdate -label vx_draw_fill_rsp_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_data +add wave -noupdate -label vx_dram_fill_rsp_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_addr add wave -noupdate -label llc_snp_req /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req add wave -noupdate -label llc_snp_req_delay 
/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req_delay add wave -noupdate -label out_break /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_ebreak @@ -45,7 +50,7 @@ add wave -noupdate -label warp_stalled {/ase_top/ase_top_generic/platform_shim_c add wave -noupdate -label warp_lock {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/warp_lock} add wave -noupdate -label use_active {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/use_active} TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {66234495 ps} 0} +WaveRestoreCursors {{Cursor 2} {293228800 ps} 0} quietly wave cursor active 1 configure wave -namecolwidth 195 configure wave -valuecolwidth 100 @@ -61,4 +66,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ps update -WaveRestoreZoom {66041656 ps} {66406344 ps} +WaveRestoreZoom {293046456 ps} {293411144 ps} diff --git a/driver/sw/opae/vortex.cpp b/driver/sw/opae/vortex.cpp index 11d29ee3..62a80ccd 100755 --- a/driver/sw/opae/vortex.cpp +++ b/driver/sw/opae/vortex.cpp @@ -22,8 +22,8 @@ #define CMD_TYPE_READ AFU_IMAGE_CMD_TYPE_READ #define CMD_TYPE_WRITE AFU_IMAGE_CMD_TYPE_WRITE -#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN -#define CMD_TYPE_SNOOP AFU_IMAGE_CMD_TYPE_SNOOP +#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN +#define CMD_TYPE_CLFLUSH AFU_IMAGE_CMD_TYPE_CLFLUSH #define MMIO_CSR_CMD (AFU_IMAGE_MMIO_CSR_CMD * 4) #define MMIO_CSR_STATUS (AFU_IMAGE_MMIO_CSR_STATUS * 4) @@ -313,7 +313,7 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 
0, MMIO_CSR_DATA_SIZE, size)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_SNOOP)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH)); // Wait for the write operation to finish if (vx_ready_wait(hdevice, -1) != 0) diff --git a/driver/tests/basic/Makefile b/driver/tests/basic/Makefile index d04e756f..786b0491 100644 --- a/driver/tests/basic/Makefile +++ b/driver/tests/basic/Makefile @@ -1,3 +1,21 @@ +RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops) +VX_RT_PATH ?= $(wildcard ../../../runtime) + +VX_CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc +VX_CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ +VX_DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump +VX_CPY = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy + +VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c +VX_STR = $(VX_RT_PATH)/startup/vx_start.S +VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s +VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c +VX_API = $(VX_RT_PATH)/vx_api/vx_api.c +VX_FIO = $(VX_RT_PATH)/fileio/fileio.s + +VX_CFLAGS = -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections + +VX_SRCS = kernel.c CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors @@ -11,6 +29,18 @@ SRCS = basic.cpp all: $(PROJECT) +kernel.dump: kernel.elf + $(VX_DMP) -D kernel.elf > kernel.dump + +kernel.hex: kernel.elf + $(VX_CPY) -O ihex kernel.elf kernel.hex + +kernel.bin: kernel.elf + $(VX_CPY) -O binary kernel.elf kernel.bin + +kernel.elf: $(SRCS) + $(VX_CC) $(VX_CFLAGS) $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_SRCS) -I$(VX_RT_PATH) -o kernel.elf + $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../sw/dummy -lvortex -o $@ @@ -18,7 +48,7 @@ run-fpga: $(PROJECT) LD_LIBRARY_PATH=../../sw/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) run-ase: $(PROJECT) - ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) 
./$(PROJECT) -t 1 + ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) run-rtlsim: $(PROJECT) LD_LIBRARY_PATH=../../sw/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) diff --git a/driver/tests/basic/basic b/driver/tests/basic/basic index 1ec3ae80..abc005ef 100755 Binary files a/driver/tests/basic/basic and b/driver/tests/basic/basic differ diff --git a/driver/tests/basic/basic.cpp b/driver/tests/basic/basic.cpp index 9e13183b..684d655d 100755 --- a/driver/tests/basic/basic.cpp +++ b/driver/tests/basic/basic.cpp @@ -27,11 +27,11 @@ uint64_t shuffle(int i, uint64_t value) { return (value << i) | (value & ((1 << i)-1));; } -int run_test_0(vx_buffer_h sbuf, - vx_buffer_h dbuf, - uint32_t address, - uint64_t value, - int num_blocks) { +int run_memcopy_test(vx_buffer_h sbuf, + vx_buffer_h dbuf, + uint32_t address, + uint64_t value, + int num_blocks) { int ret; int errors = 0; @@ -73,8 +73,29 @@ int run_test_0(vx_buffer_h sbuf, return 0; } -int run_test_1(vx_device_h device, const char* program) { +int run_kernel_test(vx_device_h device, + vx_buffer_h sbuf, + vx_buffer_h dbuf, + const char* program) { int ret; + int errors = 0; + + uint64_t seed = 0x0badf00d40ff40ff; + int num_blocks = 4; + + unsigned src_dev_addr = 0x10000000; + unsigned dest_dev_addr = 0x20000000; + + // write sbuf data + for (int i = 0; i < 8 * num_blocks; ++i) { + ((uint64_t*)vx_host_ptr(sbuf))[i] = shuffle(i, seed); + } + + // write buffer to local memory + std::cout << "write buffer to local memory" << std::endl; + ret = vx_copy_to_dev(sbuf, src_dev_addr, 64 * num_blocks, 0); + if (ret != 0) + return ret; // upload program std::cout << "upload program" << std::endl; @@ -97,6 +118,37 @@ int run_test_1(vx_device_h device, const char* program) { return ret; } + // flush the caches + std::cout << "flush the caches" << std::endl; + ret = vx_flush_caches(device, dest_dev_addr, 64 * num_blocks); + if (ret != 0) { + return ret; + } + + // read buffer from local memory + std::cout << "read 
buffer from local memory" << std::endl; + ret = vx_copy_from_dev(dbuf, dest_dev_addr, 64 * num_blocks, 0); + if (ret != 0) + return ret; + + // verify result + std::cout << "verify result" << std::endl; + for (int i = 0; i < 8 * num_blocks; ++i) { + auto curr = ((uint64_t*)vx_host_ptr(dbuf))[i]; + auto ref = shuffle(i, seed); + if (curr != ref) { + std::cout << "error @ " << std::hex << (dest_dev_addr + 64 * i) + << ": actual " << curr << ", expected " << ref << std::endl; + ++errors; + } + } + + if (errors != 0) { + std::cout << "Found " << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + return 0; } @@ -147,27 +199,15 @@ int main(int argc, char *argv[]) { // run tests if (0 == test || -1 == test) { - std::cout << "run test suite 0" << std::endl; + std::cout << "run memcopy test" << std::endl; - ret = run_test_0(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1); + ret = run_memcopy_test(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1); if (ret != 0) { cleanup(); return ret; } - ret = run_test_0(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 2); - if (ret != 0) { - cleanup(); - return ret; - } - - ret = run_test_0(sbuf, dbuf, 0x20000000, 0xff00ff00ff00ff00, 4); - if (ret != 0) { - cleanup(); - return ret; - } - - ret = run_test_0(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8); + ret = run_memcopy_test(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8); if (ret != 0) { cleanup(); return ret; @@ -175,17 +215,8 @@ int main(int argc, char *argv[]) { } if (1 == test || -1 == test) { - std::cout << "run test suite 1" << std::endl; - ret = run_test_1(device, "rv32ui-p-lw.bin"); - if (ret != 0) { - cleanup(); - return ret; - } - } - - if (2 == test || -1 == test) { - std::cout << "run test suite 1" << std::endl; - ret = run_test_1(device, "rv32ui-p-sw.bin"); + std::cout << "run kernel test" << std::endl; + ret = run_kernel_test(device, sbuf, dbuf, "kernel.bin"); if (ret != 0) { cleanup(); return ret; diff --git 
a/driver/tests/basic/rv32ui-p-sw.bin b/driver/tests/basic/kernel.bin old mode 100644 new mode 100755 similarity index 60% rename from driver/tests/basic/rv32ui-p-sw.bin rename to driver/tests/basic/kernel.bin index 4a3db3b8..55e20d29 Binary files a/driver/tests/basic/rv32ui-p-sw.bin and b/driver/tests/basic/kernel.bin differ diff --git a/driver/tests/basic/kernel.c b/driver/tests/basic/kernel.c new file mode 100644 index 00000000..c79d1056 --- /dev/null +++ b/driver/tests/basic/kernel.c @@ -0,0 +1,9 @@ +#include <stdint.h> + +void main() { + int64_t* x = (int64_t*)0x10000000; + int64_t* y = (int64_t*)0x20000000; + for (int i = 0; i < 8 * 4; ++i) { + y[i] = x[i]; + } +} \ No newline at end of file diff --git a/driver/tests/demo/Makefile b/driver/tests/demo/Makefile index eeaa2756..34a283dc 100644 --- a/driver/tests/demo/Makefile +++ b/driver/tests/demo/Makefile @@ -46,7 +46,7 @@ run-fpga: $(PROJECT) LD_LIBRARY_PATH=../../sw/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16 run-ase: $(PROJECT) - ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16 + ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 1 run-rtlsim: $(PROJECT) LD_LIBRARY_PATH=../../sw/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16 diff --git a/driver/tests/demo/kernel.bin b/driver/tests/demo/kernel.bin index cbc44de8..8cf7ac8b 100755 Binary files a/driver/tests/demo/kernel.bin and b/driver/tests/demo/kernel.bin differ diff --git a/rtl/VX_cache/VX_cache_dram_req_arb.v b/rtl/VX_cache/VX_cache_dram_req_arb.v index 0db1f560..b720dc63 100644 --- a/rtl/VX_cache/VX_cache_dram_req_arb.v +++ b/rtl/VX_cache/VX_cache_dram_req_arb.v @@ -82,6 +82,9 @@ module VX_cache_dram_req_arb wire pref_pop; wire pref_valid; wire[31:0] 
pref_addr; + + wire dwb_valid; + wire dfqq_req; assign pref_pop = !dwb_valid && !dfqq_req && !dram_req_delay && pref_valid; VX_prefetcher #( @@ -105,10 +108,8 @@ module VX_cache_dram_req_arb ); - wire dfqq_req; wire[31:0] dfqq_req_addr; - wire dfqq_empty; - wire dwb_valid; + wire dfqq_empty; wire dfqq_pop = !dwb_valid && dfqq_req && !dram_req_delay; // If no dwb, and dfqq has valids, then pop wire dfqq_push = (|per_bank_dram_fill_req); diff --git a/rtl/VX_generic_queue_ll.v b/rtl/VX_generic_queue_ll.v index 75215486..5349f649 100644 --- a/rtl/VX_generic_queue_ll.v +++ b/rtl/VX_generic_queue_ll.v @@ -6,45 +6,47 @@ module VX_generic_queue_ll parameter SIZE = 277 ) ( - input wire clk, - input wire reset, - input wire push, - input wire[DATAW-1:0] in_data, + input wire clk, + input wire reset, + input wire push, + input wire [DATAW-1:0] in_data, - input wire pop, - output wire[DATAW-1:0] out_data, - output wire empty, - output wire full -); + input wire pop, + output wire [DATAW-1:0] out_data, + output wire empty, + output wire full +); /* verilator lint_off WIDTH */ if (SIZE == 0) begin + assign empty = 1; assign out_data = 0; assign full = 0; - end else begin - `ifdef QUEUE_FORCE_MLAB + end else begin // (SIZE > 0) + + `ifdef QUEUE_FORCE_MLAB (* syn_ramstyle = "mlab" *) reg[DATAW-1:0] data[SIZE-1:0]; - `else - reg[DATAW-1:0] data[SIZE-1:0]; - `endif + `else + reg[ DATAW-1:0] data[SIZE-1:0]; + `endif - reg[DATAW-1:0] curr_r, head_r; - reg[$clog2(SIZE+1)-1:0] size_r; - reg[$clog2(SIZE)-1:0] wr_ctr_r; - reg[$clog2(SIZE)-1:0] rd_ptr_r, rd_next_ptr_r; - reg empty_r, full_r, bypass_r; - wire reading, writing; + reg [DATAW-1:0] head_r; + reg [$clog2(SIZE+1)-1:0] size_r; + wire reading; + wire writing; assign reading = pop && !empty; assign writing = push && !full; if (SIZE == 1) begin + always @(posedge clk) begin if (reset) begin - size_r <= 0; + size_r <= 0; + head_r <= 0; end else begin if (writing && !reading) begin size_r <= 1; @@ -59,9 +61,19 @@ module 
VX_generic_queue_ll end assign out_data = head_r; - assign empty = (size_r == 0); - assign full = (size_r != 0) && !pop; - end else begin + assign empty = (size_r == 0); + assign full = (size_r != 0) && !pop; + + end else begin // (SIZE > 1) + + reg [DATAW-1:0] curr_r; + reg [$clog2(SIZE)-1:0] wr_ctr_r; + reg [$clog2(SIZE)-1:0] rd_ptr_r; + reg [$clog2(SIZE)-1:0] rd_next_ptr_r; + reg empty_r; + reg full_r; + reg bypass_r; + always @(posedge clk) begin if (reset) begin wr_ctr_r <= 0; @@ -99,9 +111,10 @@ module VX_generic_queue_ll always @(posedge clk) begin if (reset) begin - rd_ptr_r <= 0; + curr_r <= 0; + rd_ptr_r <= 0; rd_next_ptr_r <= 1; - bypass_r <= 0; + bypass_r <= 0; end else begin if (reading) begin if (SIZE == 2) begin @@ -123,7 +136,6 @@ module VX_generic_queue_ll assign empty = empty_r; assign full = full_r; end - end /* verilator lint_on WIDTH */