Patch summary (free-form description placed ahead of the first "diff --git" header).
  driver/hw/sources.txt: add rtl/VX_cache/VX_prefetcher.v to the source list.
  driver/hw/vortex_afu.json, vortex_afu.sv, driver/sw/opae/vortex.cpp: rename the "snoop" command to "clflush" (command code 4 is unchanged).
  driver/hw/vortex_afu.sv: STATE_SNOOP1/STATE_SNOOP2 are folded into one STATE_CLFLUSH. The snoop sweep over csr_data_size entries is repeated VX_SNOOP_LEVELS times, with a VX_SNOOP_DELAY-cycle wait after each sweep, and the state returns to STATE_IDLE once vx_snoop_level reaches VX_SNOOP_LEVELS. The DRAM request/fill path is enabled in both STATE_RUN and STATE_CLFLUSH through the new vortex_enabled signal.
  driver/hw/vortex_afu.sv: the delay counter is advanced only while STATE_CLFLUSH is active, so a csr_data_size of 0 cannot make vx_snoop_delay / vx_snoop_level run in other states and override the STATE_IDLE clears (the later non-blocking assignment would win).
  driver/hw/vortex_afu.sv: the simulation log tag for the new command is spelled "CLFLUSH", matching CMD_TYPE_CLFLUSH / STATE_CLFLUSH.
  driver/hw/wave.do: add the vortex_enabled signal to the wave view; cursor/zoom positions updated.
  driver/tests/basic: run_memcpy_test is renamed run_memcopy_test; the snoop test loads snooping.bin and flushes 0x10000000 with size 64*8.
  rtl/VX_generic_queue_ll.v: port list reformatted, the SIZE > 1 registers are declared inside that branch, and head_r / curr_r are cleared on reset.
diff --git a/driver/hw/sources.txt b/driver/hw/sources.txt index be07bef5..26201f2d 100644 --- a/driver/hw/sources.txt +++ b/driver/hw/sources.txt @@ -68,6 +68,7 @@ vortex_afu.json ../../rtl/VX_cache/VX_cache_miss_resrv.v ../../rtl/VX_cache/VX_fill_invalidator.v ../../rtl/VX_cache/VX_tag_data_structure.v +../../rtl/VX_cache/VX_prefetcher.v ../../rtl/cache/VX_generic_pe.v ../../rtl/cache/cache_set.v ../../rtl/cache/VX_d_cache.v diff --git a/driver/hw/vortex_afu.json b/driver/hw/vortex_afu.json index c8adc2e0..d42414fd 100644 --- a/driver/hw/vortex_afu.json +++ b/driver/hw/vortex_afu.json @@ -14,7 +14,7 @@ "cmd-type-read": 1, "cmd-type-write": 2, "cmd-type-run": 3, - "cmd-type-snoop": 4, + "cmd-type-clflush": 4, "afu-top-interface": { diff --git a/driver/hw/vortex_afu.sv b/driver/hw/vortex_afu.sv index 0824125e..ec13c173 100644 --- a/driver/hw/vortex_afu.sv +++ b/driver/hw/vortex_afu.sv @@ -34,7 +34,9 @@ module vortex_afu #( ); localparam AVS_RD_QUEUE_SIZE = 16; -localparam VX_SNOOPING_DELAY = 300; + +localparam VX_SNOOP_DELAY = 300; +localparam VX_SNOOP_LEVELS = 2; localparam AFU_ID_L = 16'h0002; // AFU ID Lower localparam AFU_ID_H = 16'h0004; // AFU ID Higher @@ -42,7 +44,7 @@ localparam AFU_ID_H = 16'h0004; // AFU ID Higher localparam CMD_TYPE_READ = `AFU_IMAGE_CMD_TYPE_READ; localparam CMD_TYPE_WRITE = `AFU_IMAGE_CMD_TYPE_WRITE; localparam CMD_TYPE_RUN = `AFU_IMAGE_CMD_TYPE_RUN; -localparam CMD_TYPE_SNOOP = `AFU_IMAGE_CMD_TYPE_SNOOP; +localparam CMD_TYPE_CLFLUSH = `AFU_IMAGE_CMD_TYPE_CLFLUSH; localparam MMIO_CSR_CMD = `AFU_IMAGE_MMIO_CSR_CMD; localparam MMIO_CSR_STATUS = `AFU_IMAGE_MMIO_CSR_STATUS; @@ -52,13 +54,12 @@ localparam MMIO_CSR_DATA_SIZE = `AFU_IMAGE_MMIO_CSR_DATA_SIZE; logic [127:0] afu_id = `AFU_ACCEL_UUID; -typedef enum logic[2:0] { +typedef enum logic[3:0] { STATE_IDLE, STATE_READ, STATE_WRITE, STATE_RUN, - STATE_SNOOP1, - STATE_SNOOP2 + STATE_CLFLUSH } state_t; state_t state; @@ -192,7 +193,8 @@ logic [31:0] cci_write_ctr; logic [31:0] 
avs_read_ctr; logic [31:0] avs_write_ctr; logic [31:0] vx_snoop_ctr; -logic [31:0] vx_snoop_delay; +logic [9:0] vx_snoop_delay; +logic [1:0] vx_snoop_level; logic vx_reset; always_ff @(posedge clk) @@ -210,21 +212,21 @@ begin STATE_IDLE: begin case (csr_cmd) CMD_TYPE_READ: begin - $display("%t: CMD READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_READ; end CMD_TYPE_WRITE: begin - $display("%t: CMD WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_WRITE; end CMD_TYPE_RUN: begin - $display("%t: CMD START", $time); + $display("%t: STATE START", $time); vx_reset <= 1; state <= STATE_RUN; end - CMD_TYPE_SNOOP: begin - $display("%t: CMD SNOOP: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); - state <= STATE_SNOOP1; + CMD_TYPE_CLFLUSH: begin + $display("%t: STATE CLFLUSH: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); + state <= STATE_CLFLUSH; end endcase end @@ -251,16 +253,8 @@ begin end end - STATE_SNOOP1: begin - if (vx_snoop_delay >= VX_SNOOPING_DELAY) - begin - // TODO: Allow both RUN and SNOOP states to use the AVS bus - state <= STATE_SNOOP2; - end - end - - STATE_SNOOP2: begin - if (vx_snoop_delay >= VX_SNOOPING_DELAY) + STATE_CLFLUSH: begin + if (vx_snoop_level >= VX_SNOOP_LEVELS) begin state <= STATE_IDLE; end @@ -322,7 +316,7 @@ begin end end - STATE_RUN: begin + STATE_RUN, STATE_CLFLUSH: begin if (vx_dram_req_read && !vx_dram_req_delay) begin @@ -350,15 +344,20 @@ begin end // Vortex DRAM requests stalling -assign vx_dram_req_delay = !((STATE_RUN == state) - && !avs_waitrequest - && !avs_raq_full - && !avs_rdq_full); -// Vortex DRAM fill response +logic vortex_enabled; + always_comb begin - vx_dram_fill_rsp = (STATE_RUN == state) && !avs_rdq_empty && vx_dram_fill_accept; 
+ vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state); + vx_dram_req_delay = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full; +end + +// Vortex DRAM fill response + +always_comb +begin + vx_dram_fill_rsp = vortex_enabled && !avs_rdq_empty && vx_dram_fill_accept; vx_dram_fill_rsp_addr = (avs_raq_dout << 6); {>>{vx_dram_fill_rsp_data}} = avs_rdq_dout; end @@ -522,35 +521,39 @@ begin vx_snp_req <= 0; vx_snoop_ctr <= 0; vx_snoop_delay <= 0; + vx_snoop_level <= 0; end else begin if (STATE_IDLE == state) begin - vx_snoop_ctr <= 0; + vx_snoop_ctr <= 0; vx_snoop_delay <= 0; + vx_snoop_level <= 0; end vx_snp_req <= 0; - if ((STATE_SNOOP1 == state - || STATE_SNOOP2 == state) + if ((STATE_CLFLUSH == state) && vx_snoop_ctr < csr_data_size - && vx_snp_req_delay) + && vx_snoop_level < VX_SNOOP_LEVELS + && !vx_snp_req_delay) begin - vx_snp_req <= 1; - vx_snoop_ctr <= vx_snoop_ctr + 1; + vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6; + vx_snp_req <= 1; + vx_snoop_ctr <= vx_snoop_ctr + 1; end - if ((vx_snoop_ctr >= csr_data_size) - && (vx_snoop_delay < VX_SNOOPING_DELAY)) + if ((STATE_CLFLUSH == state) && (vx_snoop_ctr == csr_data_size) + && (vx_snoop_delay < VX_SNOOP_DELAY)) begin vx_snoop_delay <= vx_snoop_delay + 1; end - if (vx_snoop_delay >= VX_SNOOPING_DELAY) + if (vx_snoop_delay == VX_SNOOP_DELAY) begin - vx_snoop_ctr <= 0; + vx_snoop_ctr <= 0; vx_snoop_delay <= 0; + vx_snoop_level <= vx_snoop_level + 1; end end end diff --git a/driver/hw/wave.do b/driver/hw/wave.do index b8d71935..3c39919a 100644 --- a/driver/hw/wave.do +++ b/driver/hw/wave.do @@ -27,6 +27,7 @@ add wave -noupdate -label avs_raq_full /ase_top/ase_top_generic/platform_shim_cc add wave -noupdate -label avs_rdq_full /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_full add wave -noupdate -label avs_raq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_empty add wave -noupdate -label avs_rdq_empty 
/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_empty +add wave -noupdate -label vortex_enabled /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vortex_enabled add wave -noupdate -label vx_reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read add wave -noupdate -label vx_dram_req_write /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_write @@ -49,7 +50,7 @@ add wave -noupdate -label warp_stalled {/ase_top/ase_top_generic/platform_shim_c add wave -noupdate -label warp_lock {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/warp_lock} add wave -noupdate -label use_active {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/use_active} TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {620643200 ps} 0} +WaveRestoreCursors {{Cursor 2} {293228800 ps} 0} quietly wave cursor active 1 configure wave -namecolwidth 195 configure wave -valuecolwidth 100 @@ -65,4 +66,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ps update -WaveRestoreZoom {620460856 ps} {620825544 ps} +WaveRestoreZoom {293046456 ps} {293411144 ps} diff --git a/driver/sw/opae/vortex.cpp b/driver/sw/opae/vortex.cpp index 11d29ee3..62a80ccd 100755 --- a/driver/sw/opae/vortex.cpp +++ b/driver/sw/opae/vortex.cpp @@ -22,8 +22,8 @@ #define CMD_TYPE_READ AFU_IMAGE_CMD_TYPE_READ #define CMD_TYPE_WRITE AFU_IMAGE_CMD_TYPE_WRITE -#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN -#define CMD_TYPE_SNOOP AFU_IMAGE_CMD_TYPE_SNOOP +#define 
CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN +#define CMD_TYPE_CLFLUSH AFU_IMAGE_CMD_TYPE_CLFLUSH #define MMIO_CSR_CMD (AFU_IMAGE_MMIO_CSR_CMD * 4) #define MMIO_CSR_STATUS (AFU_IMAGE_MMIO_CSR_STATUS * 4) @@ -313,7 +313,7 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_SNOOP)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH)); // Wait for the write operation to finish if (vx_ready_wait(hdevice, -1) != 0) diff --git a/driver/tests/basic/basic b/driver/tests/basic/basic index 1ec3ae80..3a7a0309 100755 Binary files a/driver/tests/basic/basic and b/driver/tests/basic/basic differ diff --git a/driver/tests/basic/basic.cpp b/driver/tests/basic/basic.cpp index c5cf8fdc..c73535c8 100755 --- a/driver/tests/basic/basic.cpp +++ b/driver/tests/basic/basic.cpp @@ -27,7 +27,7 @@ uint64_t shuffle(int i, uint64_t value) { return (value << i) | (value & ((1 << i)-1));; } -int run_memcpy_test(vx_buffer_h sbuf, +int run_memcopy_test(vx_buffer_h sbuf, vx_buffer_h dbuf, uint32_t address, uint64_t value, @@ -105,7 +105,7 @@ int run_snoop_test(vx_device_h device) { // upload program std::cout << "upload program" << std::endl; - ret = vx_upload_kernel_file(device, "rv32ui-p-lw.bin"); + ret = vx_upload_kernel_file(device, "snooping.bin"); if (ret != 0) { return ret; } @@ -124,9 +124,9 @@ int run_snoop_test(vx_device_h device) { return ret; } - // send snooping request + // flush the caches std::cout << "flush the caches" << std::endl; - ret = vx_flush_caches(device, 0x80002000, 64); + ret = vx_flush_caches(device, 0x10000000, 64*8); if (ret != 0) { return ret; } @@ -181,15 +181,15 @@ int main(int argc, char *argv[]) { // run tests if (0 == test || -1 == test) { - std::cout << "run memcpy test" << std::endl; + std::cout << 
"run memcopy test" << std::endl; - ret = run_memcpy_test(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1); + ret = run_memcopy_test(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1); if (ret != 0) { cleanup(); return ret; } - ret = run_memcpy_test(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8); + ret = run_memcopy_test(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8); if (ret != 0) { cleanup(); return ret; diff --git a/driver/tests/basic/snooping.bin b/driver/tests/basic/snooping.bin new file mode 100644 index 00000000..c4b15898 Binary files /dev/null and b/driver/tests/basic/snooping.bin differ diff --git a/rtl/VX_generic_queue_ll.v b/rtl/VX_generic_queue_ll.v index 75215486..5349f649 100644 --- a/rtl/VX_generic_queue_ll.v +++ b/rtl/VX_generic_queue_ll.v @@ -6,45 +6,47 @@ module VX_generic_queue_ll parameter SIZE = 277 ) ( - input wire clk, - input wire reset, - input wire push, - input wire[DATAW-1:0] in_data, + input wire clk, + input wire reset, + input wire push, + input wire [DATAW-1:0] in_data, - input wire pop, - output wire[DATAW-1:0] out_data, - output wire empty, - output wire full -); + input wire pop, + output wire [DATAW-1:0] out_data, + output wire empty, + output wire full +); /* verilator lint_off WIDTH */ if (SIZE == 0) begin + assign empty = 1; assign out_data = 0; assign full = 0; - end else begin - `ifdef QUEUE_FORCE_MLAB + end else begin // (SIZE > 0) + + `ifdef QUEUE_FORCE_MLAB (* syn_ramstyle = "mlab" *) reg[DATAW-1:0] data[SIZE-1:0]; - `else - reg[DATAW-1:0] data[SIZE-1:0]; - `endif + `else + reg[ DATAW-1:0] data[SIZE-1:0]; + `endif - reg[DATAW-1:0] curr_r, head_r; - reg[$clog2(SIZE+1)-1:0] size_r; - reg[$clog2(SIZE)-1:0] wr_ctr_r; - reg[$clog2(SIZE)-1:0] rd_ptr_r, rd_next_ptr_r; - reg empty_r, full_r, bypass_r; - wire reading, writing; + reg [DATAW-1:0] head_r; + reg [$clog2(SIZE+1)-1:0] size_r; + wire reading; + wire writing; assign reading = pop && !empty; assign writing = push && !full; if (SIZE == 1) begin + always @(posedge clk) begin if 
(reset) begin - size_r <= 0; + size_r <= 0; + head_r <= 0; end else begin if (writing && !reading) begin size_r <= 1; @@ -59,9 +61,19 @@ module VX_generic_queue_ll end assign out_data = head_r; - assign empty = (size_r == 0); - assign full = (size_r != 0) && !pop; - end else begin + assign empty = (size_r == 0); + assign full = (size_r != 0) && !pop; + + end else begin // (SIZE > 1) + + reg [DATAW-1:0] curr_r; + reg [$clog2(SIZE)-1:0] wr_ctr_r; + reg [$clog2(SIZE)-1:0] rd_ptr_r; + reg [$clog2(SIZE)-1:0] rd_next_ptr_r; + reg empty_r; + reg full_r; + reg bypass_r; + always @(posedge clk) begin if (reset) begin wr_ctr_r <= 0; @@ -99,9 +111,10 @@ module VX_generic_queue_ll always @(posedge clk) begin if (reset) begin - rd_ptr_r <= 0; + curr_r <= 0; + rd_ptr_r <= 0; rd_next_ptr_r <= 1; - bypass_r <= 0; + bypass_r <= 0; end else begin if (reading) begin if (SIZE == 2) begin @@ -123,7 +136,6 @@ module VX_generic_queue_ll assign empty = empty_r; assign full = full_r; end - end /* verilator lint_on WIDTH */