diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 82f48f97..3ffe4f49 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -15,7 +15,7 @@ extern int vx_dev_caps(int caps_id) { case VX_CAPS_MAX_THREADS: return NUM_THREADS; case VX_CAPS_CACHE_LINESIZE: - return GLOBAL_BLOCK_SIZE; + return 64; case VX_CAPS_LOCAL_MEM_SIZE: return 0xffffffff; case VX_CAPS_ALLOC_BASE_ADDR: diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index 62a80ccd..43816c30 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -46,9 +47,14 @@ typedef struct vx_buffer_ { size_t size; } vx_buffer_t; -static size_t align_size(size_t size) { - uint32_t cache_block_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - return cache_block_size * ((size + cache_block_size - 1) / cache_block_size); +inline size_t align_size(size_t size, size_t alignment) { + assert(alignment != 0 && 0 == (alignment & (alignment - 1))); // alignment must be a non-zero power of two + return (size + alignment - 1) & ~(alignment - 1); // round size up to the next multiple of alignment +} + +inline bool is_aligned(size_t addr, size_t alignment) { + assert(alignment != 0 && 0 == (alignment & (alignment - 1))); // alignment must be a non-zero power of two + return 0 == (addr & (alignment - 1)); // true when addr is a multiple of alignment } /////////////////////////////////////////////////////////////////////////////// @@ -132,9 +138,11 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) return -1; vx_device_t *device = ((vx_device_t*)hdevice); - - size_t asize = align_size(size); + + int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); + + size_t asize = align_size(size, line_size); if (device->mem_allocation + asize > dev_mem_size) return -1; @@ -158,7 +166,9 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb vx_device_t *device = ((vx_device_t*)hdevice); - size_t asize = align_size(size); + int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); + + size_t asize = align_size(size, 
line_size); res = fpgaPrepareBuffer(device->fpga, asize, &host_ptr, &wsid, 0); if (FPGA_OK != res) { @@ -249,20 +259,35 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si || 0 >= size) return -1; - vx_buffer_t* buffer = ((vx_buffer_t*)hbuffer); + vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); + int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); + size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); + + // check alignment + if (!is_aligned(dev_maddr, line_size)) + return -1; + if (!is_aligned(size, line_size)) + return -1; + if (!is_aligned(buffer->io_addr + src_offset, line_size)) + return -1; + // bound checking if (size + src_offset > buffer->size) return -1; + if (dev_maddr + size > dev_mem_size) + return -1; // Ensure ready for new command if (vx_ready_wait(buffer->hdevice, -1) != 0) return -1; - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, buffer->io_addr + src_offset)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size)); + auto ls_shift = (int)std::log2(line_size); + + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + src_offset) >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, (dev_maddr >> ls_shift) )); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_WRITE)); // Wait for the write operation to finish @@ -277,20 +302,35 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, || 0 >= size) return -1; - vx_buffer_t* buffer = ((vx_buffer_t*)hbuffer); + vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); + int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); + size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); + 
+ // check alignment + if (!is_aligned(dev_maddr, line_size)) + return -1; + if (!is_aligned(size, line_size)) + return -1; + if (!is_aligned(buffer->io_addr + dest_offset, line_size)) + return -1; + // bound checking if (size + dest_offset > buffer->size) return -1; + if (dev_maddr + size > dev_mem_size) + return -1; // Ensure ready for new command if (vx_ready_wait(buffer->hdevice, -1) != 0) return -1; - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, buffer->io_addr + dest_offset)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size)); + auto ls_shift = (int)std::log2(line_size); + + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + dest_offset) >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, (dev_maddr) >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_READ)); // Wait for the write operation to finish @@ -307,12 +347,22 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { vx_device_t* device = ((vx_device_t*)hdevice); + int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); + + // check alignment + if (!is_aligned(dev_maddr, line_size)) + return -1; + if (!is_aligned(size, line_size)) + return -1; + // Ensure ready for new command if (vx_ready_wait(hdevice, -1) != 0) return -1; - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size)); + auto ls_shift = (int)std::log2(line_size); + + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH)); // Wait for the write operation to 
finish diff --git a/hw/opae/README b/hw/opae/README index 5030af65..90307f37 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -38,13 +38,20 @@ make clean make ./basic -#ASE build instructions +# +# ASE build instructions +# + +# Acquire a server node for running ASE simulations +qsub-sim + +# modify "vsim_run.tcl" to dump VCD trace vcd file vortex.vcd vcd add -r /*/Vortex/hw/rtl/* run -all -#compress +# compress VCD trace tar -zcvf vortex.vcd.tar.gz work/vortex.vcd -# decompress +# decompress VCD trace tar -zxvf vortex.vcd.tar.gz vortex.vcd \ No newline at end of file diff --git a/hw/opae/run_ase.sh b/hw/opae/run_ase.sh new file mode 100755 index 00000000..0cd953db --- /dev/null +++ b/hw/opae/run_ase.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +SCRIPT_DIR=$PWD +PROGRAM=$(basename "$1") +PROGRAM_DIR=$(dirname "$1") + +# Export ASE_WORKDIR variable +export ASE_WORKDIR="$SCRIPT_DIR/build_ase/work" + +shift 1 + +# cleanup incomplete runs +rm -rf "$ASE_WORKDIR/.app_lock.pid" "$ASE_WORKDIR/.ase_ready.pid" + +# Start Simulator in background +pushd "$SCRIPT_DIR/build_ase" || exit 1 +make sim & +popd + +# Wait for simulator readiness +# When .ase_ready.pid is created in $ASE_WORKDIR, ASE is ready for simulation +while [ ! -f "$ASE_WORKDIR/.ase_ready.pid" ] +do + sleep 1 +done + +# run application, forwarding the remaining arguments unchanged +pushd "$PROGRAM_DIR" || exit 1 +ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$LD_LIBRARY_PATH ./"$PROGRAM" "$@" +popd \ No newline at end of file diff --git a/hw/opae/sources.txt b/hw/opae/sources.txt index 9f0c2827..73f6f739 100644 --- a/hw/opae/sources.txt +++ b/hw/opae/sources.txt @@ -1,6 +1,6 @@ vortex_afu.json -+define+GLOBAL_BLOCK_SIZE_BYTES=64 ++define+GLOBAL_BLOCK_SIZE=64 +incdir+. 
+incdir+../rtl @@ -12,31 +12,42 @@ vortex_afu.json ../rtl/VX_user_config.vh ../rtl/VX_config.vh ../rtl/VX_define.vh -../rtl/cache/VX_cache_config.vh -../rtl/interfaces/VX_exec_unit_req_if.v +../rtl/cache/VX_cache_config.vh +../rtl/cache/VX_cache.v +../rtl/cache/VX_bank.v +../rtl/cache/VX_cache_core_rsp_merge.v +../rtl/cache/VX_cache_core_req_bank_sel.v +../rtl/cache/VX_cache_dram_req_arb.v +../rtl/cache/VX_cache_dfq_queue.v +../rtl/cache/VX_cache_req_queue.v +../rtl/cache/VX_cache_miss_resrv.v +../rtl/cache/VX_fill_invalidator.v +../rtl/cache/VX_snp_fwd_arb.v +../rtl/cache/VX_tag_data_access.v +../rtl/cache/VX_tag_data_structure.v +../rtl/cache/VX_prefetcher.v + ../rtl/interfaces/VX_branch_rsp_if.v -../rtl/interfaces/VX_inst_meta_if.v -../rtl/interfaces/VX_join_if.v -../rtl/interfaces/VX_inst_exec_wb_if.v -../rtl/interfaces/VX_cache_dram_req_if.v -../rtl/interfaces/VX_cache_dram_rsp_if.v ../rtl/interfaces/VX_cache_core_req_if.v ../rtl/interfaces/VX_cache_core_rsp_if.v +../rtl/interfaces/VX_cache_dram_req_if.v +../rtl/interfaces/VX_cache_dram_rsp_if.v +../rtl/interfaces/VX_cache_snp_req_if.v +../rtl/interfaces/VX_csr_req_if.v +../rtl/interfaces/VX_exec_unit_req_if.v ../rtl/interfaces/VX_frE_to_bckE_req_if.v ../rtl/interfaces/VX_gpr_data_if.v -../rtl/interfaces/VX_csr_wb_if.v -../rtl/interfaces/VX_csr_req_if.v -../rtl/interfaces/VX_lsu_req_if.v -../rtl/interfaces/VX_cache_snp_req_rsp_if.v ../rtl/interfaces/VX_gpr_jal_if.v -../rtl/interfaces/VX_gpgpu_inst_req_if.v -../rtl/interfaces/VX_wstall_if.v -../rtl/interfaces/VX_wb_if.v ../rtl/interfaces/VX_gpr_read_if.v +../rtl/interfaces/VX_gpu_inst_req_if.v +../rtl/interfaces/VX_inst_meta_if.v ../rtl/interfaces/VX_jal_rsp_if.v +../rtl/interfaces/VX_join_if.v +../rtl/interfaces/VX_lsu_req_if.v ../rtl/interfaces/VX_warp_ctl_if.v -../rtl/interfaces/VX_inst_mem_wb_if.v +../rtl/interfaces/VX_wb_if.v +../rtl/interfaces/VX_wstall_if.v ../rtl/libs/VX_generic_register.v ../rtl/libs/VX_mult.v @@ -58,40 +69,28 @@ vortex_afu.json 
../rtl/VX_warp.v ../rtl/VX_icache_stage.v ../rtl/VX_gpr_wrapper.v -../rtl/VX_gpgpu_inst.v +../rtl/VX_gpu_inst.v ../rtl/VX_writeback.v ../rtl/VX_csr_pipe.v +../rtl/VX_csr_data.v +../rtl/VX_csr_wrapper.v ../rtl/VX_warp_sched.v ../rtl/VX_gpr.v ../rtl/VX_gpr_ram.v ../rtl/VX_gpr_stage.v ../rtl/VX_dmem_ctrl.v ../rtl/VX_alu_unit.v -../rtl/VX_csr_data.v ../rtl/VX_lsu_unit.v ../rtl/VX_decode.v ../rtl/VX_inst_multiplex.v -../rtl/VX_csr_wrapper.v ../rtl/VX_lsu_addr_gen.v +../rtl/VX_dcache_io_arb.v +../rtl/VX_dram_arb.v ../rtl/pipe_regs/VX_f_d_reg.v ../rtl/pipe_regs/VX_i_d_reg.v ../rtl/pipe_regs/VX_d_e_reg.v -../rtl/cache/VX_snp_fwd_arb.v -../rtl/cache/VX_cache_dram_req_arb.v -../rtl/cache/VX_cache_dfq_queue.v -../rtl/cache/VX_cache_wb_sel_merge.v -../rtl/cache/VX_tag_data_access.v -../rtl/cache/VX_cache.v -../rtl/cache/VX_cache_core_req_bank_sel.v -../rtl/cache/VX_cache_req_queue.v -../rtl/cache/VX_bank.v -../rtl/cache/VX_cache_miss_resrv.v -../rtl/cache/VX_fill_invalidator.v -../rtl/cache/VX_tag_data_structure.v -../rtl/cache/VX_prefetcher.v - ccip_interface_reg.sv ccip_std_afu.sv vortex_afu.sv \ No newline at end of file diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 44590398..3fa56557 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -28,12 +28,14 @@ module vortex_afu #( output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select ); +localparam DRAM_ADDR_WIDTH = (32 - `CLOG2(`GLOBAL_BLOCK_SIZE)); + localparam AVS_RD_QUEUE_SIZE = 16; localparam CCI_RD_WINDOW_SIZE = 8; localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE; -localparam VX_SNOOP_DELAY = 300; +localparam VX_SNOOP_DELAY = 1000; localparam VX_SNOOP_LEVELS = 2; localparam AFU_ID_L = 16'h0002; // AFU ID Lower @@ -60,38 +62,39 @@ typedef enum logic[3:0] { STATE_CLFLUSH } state_t; -typedef logic [`LOG2UP(CCI_RD_WINDOW_SIZE)-1:0] t_cci_rdq_tag; +typedef logic [`LOG2UP(CCI_RD_WINDOW_SIZE)-1:0] t_cci_rdq_tag; typedef logic [$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:0] 
t_cci_rdq_data; state_t state; -// Vortex signals ///////////////////////////////////////////////////////////// +// Vortex ports /////////////////////////////////////////////////////////////// -logic vx_dram_req_read; -logic vx_dram_req_write; -logic [31:0] vx_dram_req_addr; +logic vx_dram_req_read; +logic vx_dram_req_write; +logic [DRAM_ADDR_WIDTH-1:0] vx_dram_req_addr; logic [`GLOBAL_BLOCK_SIZE-1:0] vx_dram_req_data; -logic vx_dram_req_ready; +logic [`L3DRAM_TAG_WIDTH-1:0] vx_dram_req_tag; +logic vx_dram_req_ready; -logic vx_dram_rsp_ready; -logic vx_dram_rsp_valid; -logic [31:0] vx_dram_rsp_addr; +logic vx_dram_rsp_valid; logic [`GLOBAL_BLOCK_SIZE-1:0] vx_dram_rsp_data; +logic [`L3DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag; +logic vx_dram_rsp_ready; -logic vx_snp_req; -logic [31:0] vx_snp_req_addr; -logic vx_snp_req_ready; +logic vx_snp_req_valid; +logic [DRAM_ADDR_WIDTH-1:0] vx_snp_req_addr; +logic vx_snp_req_ready; -logic vx_ebreak; +logic vx_busy; // AVS Queues ///////////////////////////////////////////////////////////////// -logic avs_raq_push; -t_local_mem_addr avs_raq_din; -logic avs_raq_pop; -t_local_mem_addr avs_raq_dout; -logic avs_raq_empty; -logic avs_raq_full; +logic avs_rtq_push; +t_local_mem_addr avs_rtq_din; +logic avs_rtq_pop; +t_local_mem_addr avs_rtq_dout; +logic avs_rtq_empty; +logic avs_rtq_full; logic avs_rdq_push; t_local_mem_data avs_rdq_din; @@ -105,7 +108,7 @@ logic avs_rdq_full; logic [2:0] csr_cmd; t_ccip_clAddr csr_io_addr; t_local_mem_addr csr_mem_addr; -logic [31:0] csr_data_size; +logic [DRAM_ADDR_WIDTH-1:0] csr_data_size; // MMIO controller //////////////////////////////////////////////////////////// @@ -137,16 +140,16 @@ begin begin case (mmioHdr.address) MMIO_CSR_IO_ADDR: begin - csr_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data >> 6); - $display("%t: CSR_IO_ADDR: 0x%h", $time, t_ccip_clAddr'(cp2af_sRxPort.c0.data >> 6)); + csr_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data); + $display("%t: CSR_IO_ADDR: 0x%h", $time, 
t_ccip_clAddr'(cp2af_sRxPort.c0.data)); end MMIO_CSR_MEM_ADDR: begin - csr_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data >> 6); - $display("%t: CSR_MEM_ADDR: 0x%h", $time, t_local_mem_addr'(cp2af_sRxPort.c0.data >> 6)); + csr_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data); + $display("%t: CSR_MEM_ADDR: 0x%h", $time, t_local_mem_addr'(cp2af_sRxPort.c0.data)); end MMIO_CSR_DATA_SIZE: begin - csr_data_size <= $bits(csr_data_size)'((cp2af_sRxPort.c0.data + 63) >> 6); - $display("%t: CSR_DATA_SIZE: %0d", $time, $bits(csr_data_size)'((cp2af_sRxPort.c0.data + 63) >> 6)); + csr_data_size <= $bits(csr_data_size)'(cp2af_sRxPort.c0.data); + $display("%t: CSR_DATA_SIZE: %0d", $time, $bits(csr_data_size)'(cp2af_sRxPort.c0.data)); end MMIO_CSR_CMD: begin csr_cmd <= $bits(csr_cmd)'(cp2af_sRxPort.c0.data); @@ -195,12 +198,12 @@ end // COMMAND FSM //////////////////////////////////////////////////////////////// -logic [31:0] cci_write_ctr; -logic [31:0] avs_read_ctr; -logic [31:0] avs_write_ctr; -logic [31:0] vx_snoop_ctr; -logic [9:0] vx_snoop_delay; -logic vx_reset; +logic [DRAM_ADDR_WIDTH-1:0] cci_write_ctr; +logic [DRAM_ADDR_WIDTH-1:0] avs_read_ctr; +logic [DRAM_ADDR_WIDTH-1:0] avs_write_ctr; +logic [DRAM_ADDR_WIDTH-1:0] snp_req_ctr; +logic [9:0] snp_req_delay; +logic vx_reset; always_ff @(posedge clk) begin @@ -248,13 +251,13 @@ begin end STATE_RUN: begin - if (vx_ebreak) begin + if (!vx_busy) begin state <= STATE_IDLE; end end STATE_CLFLUSH: begin - if (vx_snoop_delay >= VX_SNOOP_DELAY) begin + if (snp_req_delay >= VX_SNOOP_DELAY) begin state <= STATE_IDLE; end end @@ -268,11 +271,12 @@ end logic cci_rdq_empty; t_cci_rdq_data cci_rdq_dout; logic cci_rdq_pop; +logic [`L3DRAM_TAG_WIDTH-1:0] dram_req_tag; t_ccip_clAddr next_avs_address; always_comb begin - next_avs_address = csr_mem_addr + {avs_write_ctr[31:$bits(t_cci_rdq_tag)], t_cci_rdq_tag'(cci_rdq_dout)}; + next_avs_address = csr_mem_addr + {avs_write_ctr[DRAM_ADDR_WIDTH-1:$bits(t_cci_rdq_tag)], 
t_cci_rdq_tag'(cci_rdq_dout)}; cci_rdq_pop = (state == STATE_WRITE && !cci_rdq_empty && !avs_waitrequest @@ -285,9 +289,7 @@ begin begin mem_bank_select <= 0; avs_burstcount <= 1; - avs_byteenable <= 64'hffffffffffffffff; - avs_address <= 0; - avs_writedata <= 0; + avs_byteenable <= 64'hffffffffffffffff; avs_read <= 0; avs_write <= 0; avs_read_ctr <= 0; @@ -305,7 +307,7 @@ begin end STATE_READ: begin - if (!avs_raq_full + if (!avs_rtq_full && !avs_rdq_full && !avs_waitrequest && avs_read_ctr < csr_data_size) @@ -332,18 +334,19 @@ begin if (vx_dram_req_read && vx_dram_req_ready) begin - avs_address <= (vx_dram_req_addr >> 6); - avs_read <= 1; - $display("%t: AVS Rd Req: addr=%h", $time, vx_dram_req_addr >> 6); + avs_address <= vx_dram_req_addr; + dram_req_tag <= vx_dram_req_tag; + avs_read <= 1; + $display("%t: AVS Rd Req: addr=%h", $time, vx_dram_req_addr); end if (vx_dram_req_write && vx_dram_req_ready) begin - avs_writedata <= vx_dram_req_data; - avs_address <= (vx_dram_req_addr >> 6); + avs_address <= vx_dram_req_addr; + avs_writedata <= vx_dram_req_data; avs_write <= 1; - $display("%t: AVS Wr Req: addr=%h", $time, vx_dram_req_addr >> 6); + $display("%t: AVS Wr Req: addr=%h", $time, vx_dram_req_addr); end end endcase @@ -362,7 +365,7 @@ logic vortex_enabled; always_comb begin vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state); - vx_dram_req_ready = vortex_enabled && !avs_waitrequest && !avs_raq_full && !avs_rdq_full; + vx_dram_req_ready = vortex_enabled && !avs_waitrequest && !avs_rtq_full && !avs_rdq_full; end // Vortex DRAM fill response @@ -370,7 +373,7 @@ end always_comb begin vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty && vx_dram_rsp_ready; - vx_dram_rsp_addr = (avs_raq_dout << 6); + vx_dram_rsp_tag = avs_rtq_dout; vx_dram_rsp_data = avs_rdq_dout; end @@ -380,9 +383,9 @@ logic cci_wr_req; always_comb begin - avs_raq_pop = vx_dram_rsp_valid || cci_wr_req; - avs_raq_din = avs_address; - avs_raq_push = avs_read; + avs_rtq_pop = 
vx_dram_rsp_valid || cci_wr_req; + avs_rtq_din = dram_req_tag; + avs_rtq_push = avs_read; end VX_generic_queue #( @@ -391,19 +394,19 @@ VX_generic_queue #( ) avs_rd_req_queue ( .clk (clk), .reset (SoftReset), - .push (avs_raq_push), - .data_in (avs_raq_din), - .pop (avs_raq_pop), - .data_out (avs_raq_dout), - .empty (avs_raq_empty), - .full (avs_raq_full) + .push (avs_rtq_push), + .data_in (avs_rtq_din), + .pop (avs_rtq_pop), + .data_out (avs_rtq_dout), + .empty (avs_rtq_empty), + .full (avs_rtq_full) ); // AVS data read response queue /////////////////////////////////////////////// always_comb begin - avs_rdq_pop = avs_raq_pop; + avs_rdq_pop = avs_rtq_pop; avs_rdq_din = avs_readdata; avs_rdq_push = avs_readdatavalid; end @@ -426,7 +429,7 @@ VX_generic_queue #( t_ccip_c0_ReqMemHdr cci_read_hdr; -logic [31:0] cci_read_ctr; +logic [DRAM_ADDR_WIDTH-1:0] cci_read_ctr; t_cci_rdq_tag cci_rdq_ctr; logic cci_rdq_full; @@ -562,29 +565,29 @@ end always_ff @(posedge clk) begin if (SoftReset) begin - vx_snp_req <= 0; - vx_snoop_ctr <= 0; - vx_snoop_delay <= 0; + vx_snp_req_valid <= 0; + snp_req_ctr <= 0; + snp_req_delay <= 0; end else begin if (STATE_IDLE == state) begin - vx_snoop_ctr <= 0; - vx_snoop_delay <= 0; + snp_req_ctr <= 0; + snp_req_delay <= 0; end - vx_snp_req <= 0; + vx_snp_req_valid <= 0; if ((STATE_CLFLUSH == state) - && vx_snoop_ctr < csr_data_size + && (snp_req_ctr < csr_data_size) && vx_snp_req_ready) - begin - vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6; - vx_snp_req <= 1; - vx_snoop_ctr <= vx_snoop_ctr + 1; + begin + vx_snp_req_addr <= csr_mem_addr + snp_req_ctr; + vx_snp_req_valid <= 1; + snp_req_ctr <= snp_req_ctr + 1; end - if (vx_snoop_ctr == csr_data_size) begin - vx_snoop_delay <= vx_snoop_delay + 1; + if (snp_req_ctr == csr_data_size) begin + snp_req_delay <= snp_req_delay + 1; end end end @@ -600,21 +603,22 @@ Vortex_Socket #() vx_socket ( .dram_req_read (vx_dram_req_read), .dram_req_addr (vx_dram_req_addr), .dram_req_data 
(vx_dram_req_data), + .dram_req_tag (vx_dram_req_tag), .dram_req_ready (vx_dram_req_ready), // DRAM Rsp .dram_rsp_valid (vx_dram_rsp_valid), - .dram_rsp_addr (vx_dram_rsp_addr), .dram_rsp_data (vx_dram_rsp_data), + .dram_rsp_tag (vx_dram_rsp_tag), .dram_rsp_ready (vx_dram_rsp_ready), // Cache Snooping Req - .llc_snp_req_valid (vx_snp_req), - .llc_snp_req_addr (vx_snp_req_addr), - .llc_snp_req_ready (vx_snp_req_ready), + .snp_req_valid (vx_snp_req_valid), + .snp_req_addr (vx_snp_req_addr), + .snp_req_ready (vx_snp_req_ready), - // program exit signal - .ebreak (vx_ebreak) + // status + .busy (vx_busy) ); endmodule diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index ac39b969..266e75a4 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -52,6 +52,16 @@ /////////////////////////////////////////////////////////////////////////////// +`define BYTE_EN_NO 3'h7 +`define BYTE_EN_LB 3'h0 +`define BYTE_EN_LH 3'h1 +`define BYTE_EN_LW 3'h2 +`define BYTE_EN_HB 3'h4 +`define BYTE_EN_HH 3'h5 +`define BYTE_EN_BITS 3 + +/////////////////////////////////////////////////////////////////////////////// + `define INST_R 7'd051 `define INST_L 7'd003 `define INST_ALU 7'd019 @@ -62,7 +72,7 @@ `define INST_JAL 7'd111 `define INST_JALR 7'd103 `define INST_SYS 7'd115 -`define INST_GPGPU 7'h06b +`define INST_GPGPU 7'd107 `define RS2_IMMED 1 `define RS2_REG 0 diff --git a/hw/rtl/VX_dram_arb.v b/hw/rtl/VX_dram_arb.v index 22a42df5..5ebbbd6e 100644 --- a/hw/rtl/VX_dram_arb.v +++ b/hw/rtl/VX_dram_arb.v @@ -1,4 +1,4 @@ -`include "VX_define.vh" +`include "VX_cache_config.vh" module VX_dram_arb #( parameter BANK_LINE_SIZE = 1, diff --git a/hw/rtl/VX_warp.v b/hw/rtl/VX_warp.v index 2433d950..53e4e549 100644 --- a/hw/rtl/VX_warp.v +++ b/hw/rtl/VX_warp.v @@ -19,26 +19,16 @@ module VX_warp ( output wire[`NUM_THREADS-1:0] valid ); - reg [31:0] real_PC; - logic [31:0] temp_PC; - logic [31:0] use_PC; reg [`NUM_THREADS-1:0] valid_t; - reg [`NUM_THREADS-1:0] valid_zero; - - integer i; - 
initial begin - real_PC = 0; - for (i = 1; i < `NUM_THREADS; i=i+1) begin - valid_t[i] = 0; // Thread 1 active - valid_zero[i] = 0; - end - valid_t = 1; - valid_zero[0] = 0; - end + reg [31:0] real_PC; + reg [31:0] temp_PC; + reg [31:0] use_PC; always @(posedge clk) begin - if (remove) begin - valid_t <= valid_zero; + if (reset) begin + valid_t <= {{(`NUM_THREADS-1){1'b0}},1'b1}; // Thread 1 active + end else if (remove) begin + valid_t <= 0; end else if (change_mask) begin valid_t <= thread_mask; end @@ -46,7 +36,7 @@ module VX_warp ( genvar i; generate - for (i = 0; i < `NUM_THREADS; i = i+1) begin : valid_assign + for (i = 0; i < `NUM_THREADS; i++) begin : valid_assign assign valid[i] = change_mask ? thread_mask[i] : stall ? 1'b0 : valid_t[i]; end endgenerate @@ -54,8 +44,7 @@ module VX_warp ( always @(*) begin if (jal == 1'b1) begin temp_PC = jal_dest; - // $display("LINKING TO %h", temp_PC); - end else if (branch_dir == 1'b1) begin + end else if (branch_dir) begin temp_PC = branch_dest; end else begin temp_PC = real_PC; @@ -68,8 +57,7 @@ module VX_warp ( always @(posedge clk) begin if (reset) begin real_PC <= 0; - end else if (wspawn == 1'b1) begin - // $display("Inside warp ***** Spawn @ %H",wspawn_pc); + end else if (wspawn) begin real_PC <= wspawn_pc; end else if (!stall) begin real_PC <= use_PC + 32'h4; diff --git a/hw/rtl/cache/VX_cache_config.vh b/hw/rtl/cache/VX_cache_config.vh index 5cc6d422..f450ee84 100644 --- a/hw/rtl/cache/VX_cache_config.vh +++ b/hw/rtl/cache/VX_cache_config.vh @@ -3,14 +3,6 @@ `include "VX_define.vh" -`define BYTE_EN_NO 3'h7 -`define BYTE_EN_LB 3'h0 -`define BYTE_EN_LH 3'h1 -`define BYTE_EN_LW 3'h2 -`define BYTE_EN_HB 3'h4 -`define BYTE_EN_HH 3'h5 -`define BYTE_EN_BITS 3 - // data tid tag read write base addr `define MRVQ_METADATA_WIDTH (`WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH + `BYTE_EN_BITS + `BYTE_EN_BITS + `BASE_ADDR_BITS) diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 
acaf1d1c..47baeb3a 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -53,9 +53,9 @@ module VX_cache_core_rsp_merge #( output wire [NUM_BANKS-1:0] per_bank_core_rsp_pop, // Core Writeback - output reg [NUM_REQUESTS-1:0] core_rsp_valid, - output reg [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] core_rsp_data, - output reg [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, + output reg [NUM_REQUESTS-1:0] core_rsp_valid, + output reg [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] core_rsp_data, + output reg [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire core_rsp_ready ); @@ -81,7 +81,6 @@ module VX_cache_core_rsp_merge #( always @(*) begin core_rsp_valid = 0; core_rsp_data = 0; - core_rsp_tag = 0; for (i = 0; i < NUM_BANKS; i = i + 1) begin if (found_bank && per_bank_core_rsp_valid[i] diff --git a/hw/rtl/libs/VX_divide.v b/hw/rtl/libs/VX_divide.v index 1098fecf..5f338589 100644 --- a/hw/rtl/libs/VX_divide.v +++ b/hw/rtl/libs/VX_divide.v @@ -66,7 +66,7 @@ module VX_divide #( reg [WIDTHD-1:0] denom_pipe [0:PIPELINE-1]; genvar i; - for (i = 0; i < PIPELINE-1; i = i+1) begin : pipe_stages + for (i = 0; i < PIPELINE-1; i++) begin : pipe_stages always @(posedge clock or posedge aclr) begin if (aclr) begin numer_pipe[i+1] <= 0; diff --git a/hw/rtl/libs/VX_mult.v b/hw/rtl/libs/VX_mult.v index 3d689bfa..287d7116 100644 --- a/hw/rtl/libs/VX_mult.v +++ b/hw/rtl/libs/VX_mult.v @@ -84,7 +84,7 @@ module VX_mult #( reg [WIDTHB-1:0] datab_pipe [0:PIPELINE-1]; genvar i; - for (i = 0; i < PIPELINE-1; i = i+1) begin : pipe_stages + for (i = 0; i < PIPELINE-1; i++) begin : pipe_stages always @(posedge clock or posedge aclr) begin if (aclr) begin dataa_pipe[i+1] <= 0; diff --git a/hw/unit_tests/VX_divide_tb.v b/hw/unit_tests/VX_divide_tb.v index 08342eac..1aa85c7c 100644 --- a/hw/unit_tests/VX_divide_tb.v +++ b/hw/unit_tests/VX_divide_tb.v @@ -19,7 +19,7 @@ module VX_tb_divide(); genvar i; generate - for (i = 0; i < 8; i = i+1) 
begin : div_loop + for (i = 0; i < 8; i++) begin : div_loop VX_divide#( .WIDTHN(32), .WIDTHD(32),