diff --git a/Makefile b/Makefile index 84235936..410dc008 100644 --- a/Makefile +++ b/Makefile @@ -4,10 +4,12 @@ all: $(MAKE) -C driver $(MAKE) -C runtime $(MAKE) -C simX + $(MAKE) -C benchmarks/opencl clean: $(MAKE) -C hw clean $(MAKE) -C driver clean $(MAKE) -C simX clean $(MAKE) -C runtime clean + $(MAKE) -C benchmarks/opencl clean diff --git a/benchmarks/opencl/sgemm/main.cc b/benchmarks/opencl/sgemm/main.cc index 4678a7a7..81775741 100644 --- a/benchmarks/opencl/sgemm/main.cc +++ b/benchmarks/opencl/sgemm/main.cc @@ -106,7 +106,7 @@ int main (int argc, char **argv) { size_t kernel_size; cl_int binary_status; - srand(time(NULL)); + srand(50); // read kernel binary from file if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) diff --git a/benchmarks/opencl/sgemm/sgemm b/benchmarks/opencl/sgemm/sgemm deleted file mode 100755 index d8d18df9..00000000 Binary files a/benchmarks/opencl/sgemm/sgemm and /dev/null differ diff --git a/driver/opae/Makefile b/driver/opae/Makefile index a2ebdd05..17572e34 100644 --- a/driver/opae/Makefile +++ b/driver/opae/Makefile @@ -1,11 +1,14 @@ OPAE_HOME ?= /tools/opae/1.4.0 +#CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I../include -I$(OPAE_HOME)/include -I../../hw LDFLAGS += -L$(OPAE_HOME)/lib +#SCOPE=1 + # stack execution protection LDFLAGS +=-z noexecstack @@ -21,9 +24,6 @@ CXXFLAGS += -fPIC # Dump perf stats CXXFLAGS += -DDUMP_PERF_STATS -# Enable scope analyzer -#CXXFLAGS += -DSCOPE - LDFLAGS += -shared FPGA_LIBS += -luuid -lopae-c @@ -32,8 +32,6 @@ ASE_LIBS += -luuid -lopae-c-ase VLSIM_LIBS += -lopae-c-vlsim -LIB_DIR=../lib - ASE_DIR = ase VLSIM_DIR = vlsim @@ -46,7 +44,14 @@ PROJECT_VLSIM = $(VLSIM_DIR)/libvortex.so AFU_JSON_INFO = vortex_afu.h -SRCS = vortex.cpp vx_scope.cpp ../common/vx_utils.cpp +SRCS = vortex.cpp ../common/vx_utils.cpp + +# Enable scope analyzer +ifdef SCOPE + CXXFLAGS += -DSCOPE 
+ SRCS += vx_scope.cpp + SET_SCOPE = SCOPE=1 +endif all: vlsim @@ -57,14 +62,14 @@ json: ../../hw/opae/vortex_afu.json fpga: $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT) -ase: $(SRCS) $(ASE_DIR) +asesim: $(SRCS) $(ASE_DIR) $(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $(PROJECT_ASE) -vlsim: $(SRCS) opae-vlsim - $(CXX) $(CXXFLAGS) -L./vlsim -DUSE_VLSIM $(SRCS) $(LDFLAGS) $(VLSIM_LIBS) -o $(PROJECT_VLSIM) +vlsim: $(SRCS) vlsim-hw + $(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -L./vlsim $(VLSIM_LIBS) -o $(PROJECT_VLSIM) -opae-vlsim: - $(MAKE) -C vlsim +vlsim-hw: + $(SET_SCOPE) $(MAKE) -C vlsim vortex.o: vortex.cpp $(CXX) $(CXXFLAGS) -c vortex.cpp -o $@ diff --git a/driver/opae/vlsim/.gitignore b/driver/opae/vlsim/.gitignore new file mode 100644 index 00000000..541b1f36 --- /dev/null +++ b/driver/opae/vlsim/.gitignore @@ -0,0 +1 @@ +/obj_dir/* \ No newline at end of file diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index 0cae2dfd..d0f17edc 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -12,6 +12,8 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSRQ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE +DBG_PRINT_FLAGS += -DDBG_CORE_REQ_INFO +DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO @@ -22,6 +24,7 @@ CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1 #DEBUG=1 +#SCOPE=1 CFLAGS += -fPIC @@ -34,7 +37,9 @@ LDFLAGS += -shared -pthread TOP = vortex_afu_shim -RTL_DIR = ../../../hw/rtl +RTL_DIR=../../../hw/rtl + +SCRIPT_DIR=../../../hw/scripts SRCS = fpga.cpp opae_sim.cpp SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp @@ -42,7 +47,7 @@ SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include 
-I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) +VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) VL_FLAGS += -Wno-DECLFILENAME VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += verilator.vlt @@ -53,29 +58,42 @@ VL_FLAGS += verilator.vlt # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --assert --trace $(DBG_FLAGS) + VL_FLAGS += -DVCD_OUTPUT --assert --trace-fst --trace-threads 1 $(DBG_FLAGS) CFLAGS += -DVCD_OUTPUT $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CFLAGS += -DNDEBUG endif +# Enable scope analyzer +ifdef SCOPE + VL_FLAGS += -DSCOPE + CFLAGS += -DSCOPE + SCOPE_VH = $(RTL_DIR)/scope-defs.vh +endif + +# use our OPAE shim VL_FLAGS += -DNOPAE CFLAGS += -DNOPAE -# Enable scope analyzer -#VL_FLAGS += -DSCOPE -#CFLAGS += -DSCOPE +# use DPI FPU +#VL_FLAGS += -DFPU_FAST RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip PROJECT = libopae-c-vlsim.so all: $(PROJECT) + +# generate scope data +scope: $(RTL_DIR)/scope-defs.vh + +$(RTL_DIR)/scope-defs.vh: $(SCRIPT_DIR)/scope.json + $(SCRIPT_DIR)/scope.py $(RTL_INCLUDE) $(CONFIGS) -cc ../scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json -$(PROJECT): $(SRCS) +$(PROJECT): $(SRCS) $(SCOPE_VH) verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT) OPT_FAST="-O0 -g" make -j -C obj_dir -f V$(TOP).mk clean: - rm -rf $(PROJECT) obj_dir + rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh diff --git a/driver/opae/vlsim/opae_sim.cpp b/driver/opae/vlsim/opae_sim.cpp index dcfbd9f2..dfddb482 100644 --- a/driver/opae/vlsim/opae_sim.cpp +++ b/driver/opae/vlsim/opae_sim.cpp @@ -31,9 +31,9 @@ opae_sim::opae_sim() { #ifdef 
VCD_OUTPUT Verilated::traceEverOn(true); - trace_ = new VerilatedVcdC(); + trace_ = new VerilatedFstC(); vortex_afu_->trace(trace_, 99); - trace_->open("trace.vcd"); + trace_->open("trace.fst"); #endif this->reset(); @@ -85,6 +85,19 @@ void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) { *ioaddr = host_buffers_[wsid].ioaddr; } +void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { + std::lock_guard guard(mutex_); + + vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid = 1; + vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; + vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; + vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; + this->step(); + vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid = 0; + assert(vortex_afu_->af2cp_sTxPort_c2_mmioRdValid); + *value = vortex_afu_->af2cp_sTxPort_c2_data; +} + void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { std::lock_guard guard(mutex_); @@ -94,20 +107,7 @@ void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, &value, 8); this->step(); - assert(!vortex_afu_->vcp2af_sRxPort_c0_mmioWrValid); -} - -void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { - std::lock_guard guard(mutex_); - - vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid = 1; - vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; - vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; - vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; - this->step(); - assert(!vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid); - assert(vortex_afu_->af2cp_sTxPort_c2_mmioRdValid); - *value = vortex_afu_->af2cp_sTxPort_c2_data; + vortex_afu_->vcp2af_sRxPort_c0_mmioWrValid = 0; } void opae_sim::flush() { @@ -117,24 +117,45 @@ void opae_sim::flush() { /////////////////////////////////////////////////////////////////////////////// void opae_sim::reset() { - 
vortex_afu_->reset = 1; - this->step(); - vortex_afu_->reset = 0; + + host_buffers_.clear(); + dram_reads_.clear(); + cci_reads_.clear(); + cci_writes_.clear(); + vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0; + vortex_afu_->vcp2af_sRxPort_c1_rspValid = 0; + vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull = 0; + vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull = 0; + vortex_afu_->avs_readdatavalid = 0; + vortex_afu_->avs_waitrequest = 0; + vortex_afu_->reset = 1; + + vortex_afu_->clk = 0; + this->eval(); + vortex_afu_->clk = 1; + this->eval(); + + vortex_afu_->reset = 0; + // Turn on assertion after reset Verilated::assertOn(true); } void opae_sim::step() { - vortex_afu_->clk = 0; - this->eval(); - - vortex_afu_->clk = 1; - this->eval(); this->sRxPort_bus(); this->sTxPort_bus(); this->avs_bus(); + + vortex_afu_->clk = 0; + this->eval(); + vortex_afu_->clk = 1; + this->eval(); + +#ifndef NDEBUG + fflush(stdout); +#endif } void opae_sim::eval() { @@ -145,100 +166,104 @@ void opae_sim::eval() { ++timestamp; } -void opae_sim::sRxPort_bus() { +void opae_sim::sRxPort_bus() { + // check mmio request + bool mmio_req_enabled = vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid + || vortex_afu_->vcp2af_sRxPort_c0_mmioWrValid; + // schedule CCI read responses - int cci_rd_index = -1; - for (int i = 0; i < cci_reads_.size(); i++) { - if (cci_reads_[i].cycles_left > 0) { - cci_reads_[i].cycles_left -= 1; - } - if ((cci_rd_index == -1) - && (cci_reads_[i].cycles_left == 0)) { - cci_rd_index = i; + std::list::iterator cci_rd_it(cci_reads_.end()); + for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) { + if (it->cycles_left > 0) + it->cycles_left -= 1; + if ((cci_rd_it == ie) && (it->cycles_left == 0)) { + cci_rd_it = it; } } // schedule CCI write responses - int cci_wr_index = -1; - for (int i = 0; i < cci_writes_.size(); i++) { - if (cci_writes_[i].cycles_left > 0) { - cci_writes_[i].cycles_left -= 1; + std::list::iterator cci_wr_it(cci_writes_.end()); + for (auto it = 
cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) { + if (it->cycles_left > 0) + it->cycles_left -= 1; + if ((cci_wr_it == ie) && (it->cycles_left == 0)) { + cci_wr_it = it; } - if ((cci_wr_index == -1) - && (cci_writes_[i].cycles_left == 0)) { - cci_wr_index = i; - } - } - - // send CCI read response - vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0; - if (cci_rd_index != -1) { - vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1; - memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_reads_[cci_rd_index].block.data(), CACHE_BLOCK_SIZE); - vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_reads_[cci_rd_index].mdata; - cci_reads_.erase(cci_reads_.begin() + cci_rd_index); } // send CCI write response vortex_afu_->vcp2af_sRxPort_c1_rspValid = 0; - if (cci_wr_index != -1) { + if (cci_wr_it != cci_writes_.end()) { vortex_afu_->vcp2af_sRxPort_c1_rspValid = 1; - vortex_afu_->vcp2af_sRxPort_c1_hdr_mdata = cci_writes_[cci_wr_index].mdata; - cci_writes_.erase(cci_writes_.begin() + cci_wr_index); + vortex_afu_->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata; + cci_writes_.erase(cci_wr_it); } - // mmio - vortex_afu_->vcp2af_sRxPort_c0_mmioWrValid = 0; - vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid = 0; + // send CCI read response (ensure mmio disabled) + vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0; + if (!mmio_req_enabled + && (cci_rd_it != cci_reads_.end())) { + vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1; + memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->block.data(), CACHE_BLOCK_SIZE); + vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; + /*printf("*** [vlsim] read-rsp: addr=%ld, mdata=%d, data=", cci_rd_it->addr, cci_rd_it->mdata); + for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) + printf("%02x", cci_rd_it->block[CACHE_BLOCK_SIZE-1-i]); + printf("\n");*/ + fflush(stdout); + cci_reads_.erase(cci_rd_it); + } } void opae_sim::sTxPort_bus() { - // check read queue size - vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= CCI_RQ_SIZE); - - // check write 
queue size - vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= CCI_WQ_SIZE); - // process read requests - if (vortex_afu_->af2cp_sTxPort_c0_valid && !vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull) { + if (vortex_afu_->af2cp_sTxPort_c0_valid) { + assert(!vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull); cci_rd_req_t cci_req; cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); + cci_req.addr = vortex_afu_->af2cp_sTxPort_c0_hdr_address; cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata; auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); memcpy(cci_req.block.data(), host_ptr, CACHE_BLOCK_SIZE); - cci_reads_.push_back(cci_req); + //printf("*** [vlsim] read-req: addr=%ld, mdata=%d\n", vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); + fflush(stdout); + cci_reads_.emplace_back(cci_req); } // process write requests - if (vortex_afu_->af2cp_sTxPort_c1_valid && !vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull) { + if (vortex_afu_->af2cp_sTxPort_c1_valid) { + assert(!vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull); cci_wr_req_t cci_req; cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); cci_req.mdata = vortex_afu_->af2cp_sTxPort_c1_hdr_mdata; auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE); memcpy(host_ptr, vortex_afu_->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE); - cci_writes_.push_back(cci_req); + cci_writes_.emplace_back(cci_req); } + + // check queues overflow + vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1)); + vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1)); } void opae_sim::avs_bus() { // schedule DRAM read responses - int dram_rd_index = -1; - for (int i = 0; i < dram_reads_.size(); i++) { - if (dram_reads_[i].cycles_left > 0) { - dram_reads_[i].cycles_left -= 1; + std::list::iterator dram_rd_it(dram_reads_.end()); + for (auto it = dram_reads_.begin(), ie = dram_reads_.end(); it != ie; ++it) 
{ + if (it->cycles_left > 0) { + it->cycles_left -= 1; } - if ((dram_rd_index == -1) - && (dram_reads_[i].cycles_left == 0)) { - dram_rd_index = i; + if ((it != ie) && (it->cycles_left == 0)) { + dram_rd_it = it; } } // send DRAM response vortex_afu_->avs_readdatavalid = 0; - if (dram_rd_index != -1) { + if (dram_rd_it != dram_reads_.end()) { vortex_afu_->avs_readdatavalid = 1; - memcpy(vortex_afu_->avs_readdata, dram_reads_[dram_rd_index].block.data(), CACHE_BLOCK_SIZE); - dram_reads_.erase(dram_reads_.begin() + dram_rd_index); + memcpy(vortex_afu_->avs_readdata, dram_rd_it->block.data(), CACHE_BLOCK_SIZE); + dram_reads_.erase(dram_rd_it); } // handle DRAM stalls @@ -271,7 +296,7 @@ void opae_sim::avs_bus() { dram_req.cycles_left = DRAM_LATENCY; unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE); ram_.read(base_addr, CACHE_BLOCK_SIZE, dram_req.block.data()); - dram_reads_.push_back(dram_req); + dram_reads_.emplace_back(dram_req); } } diff --git a/driver/opae/vlsim/opae_sim.h b/driver/opae/vlsim/opae_sim.h index 9a4906eb..58b57757 100644 --- a/driver/opae/vlsim/opae_sim.h +++ b/driver/opae/vlsim/opae_sim.h @@ -5,7 +5,7 @@ #include "verilated.h" #ifdef VCD_OUTPUT -#include +#include #endif #include @@ -13,7 +13,7 @@ #include #include -#include +#include #include #define CACHE_BLOCK_SIZE 64 @@ -41,18 +41,19 @@ private: typedef struct { int cycles_left; std::array block; - unsigned tag; + uint32_t tag; } dram_rd_req_t; typedef struct { int cycles_left; std::array block; - unsigned mdata; + uint64_t addr; + uint32_t mdata; } cci_rd_req_t; typedef struct { int cycles_left; - unsigned mdata; + uint32_t mdata; } cci_wr_req_t; typedef struct { @@ -76,17 +77,17 @@ private: std::unordered_map host_buffers_; - std::vector dram_reads_; + std::list dram_reads_; - std::vector cci_reads_; + std::list cci_reads_; - std::vector cci_writes_; + std::list cci_writes_; std::mutex mutex_; RAM ram_; Vvortex_afu_shim *vortex_afu_; #ifdef VCD_OUTPUT - VerilatedVcdC *trace_; 
+ VerilatedFstC *trace_; #endif }; \ No newline at end of file diff --git a/driver/opae/vlsim/vortex_afu_shim.sv b/driver/opae/vlsim/vortex_afu_shim.sv index 4977979a..cf5735e4 100644 --- a/driver/opae/vlsim/vortex_afu_shim.sv +++ b/driver/opae/vlsim/vortex_afu_shim.sv @@ -87,7 +87,7 @@ t_if_ccip_Tx af2cp_sTxPort; vortex_afu #( .NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS) -) vortex_afu ( +) afu ( .clk(clk), .reset(reset), .cp2af_sRxPort(cp2af_sRxPort), diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index 41e69baa..bc57aa39 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -31,7 +31,7 @@ fpga_result res = _expr; \ if (res == FPGA_OK) \ break; \ - printf("OPAE Error: '%s' returned %d, %s!\n", \ + printf("[VXDRV] Error: '%s' returned %d, %s!\n", \ #_expr, (int)res, fpgaErrStr(res)); \ return -1; \ } while (false) @@ -118,7 +118,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { *value = STARTUP_ADDR; break; default: - fprintf(stderr, "invalid caps id: %d\n", caps_id); + fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); return -1; } @@ -156,7 +156,7 @@ extern int vx_dev_open(vx_device_h* hdevice) { fpgaDestroyProperties(&filter); if (num_matches < 1) { - fprintf(stderr, "Accelerator %s not found!\n", AFU_ACCEL_UUID); + fprintf(stderr, "[VXDRV] Error: accelerator %s not found!\n", AFU_ACCEL_UUID); return -1; } @@ -197,9 +197,10 @@ extern int vx_dev_open(vx_device_h* hdevice) { fpgaClose(accel_handle); return ret; } - - fprintf(stdout, "DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n", + #ifndef NDEBUG + fprintf(stdout, "[VXDRV] DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n", device->implementation_id, device->num_cores, device->num_warps, device->num_threads); + #endif } #ifdef SCOPE @@ -236,18 +237,18 @@ extern int vx_dev_close(vx_device_h hdevice) { int ret = vx_get_perf(hdevice, core_id, &instrs, &cycles); assert(ret == 0); float IPC = 
(float)(double(instrs) / double(cycles)); - fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); + fprintf(stdout, "[VXDRV] PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); total_instrs += instrs; total_cycles = std::max(total_cycles, cycles); } float IPC = (float)(double(total_instrs) / double(total_cycles)); - fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); + fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); } else { uint64_t instrs, cycles; int ret = vx_get_perf(hdevice, 0, &instrs, &cycles); float IPC = (float)(double(instrs) / double(cycles)); assert(ret == 0); - fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); + fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); } #endif @@ -373,7 +374,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &data)); if (0 == data || 0 == timeout) { if (data != 0) { - fprintf(stdout, "ready-wait timed out: status=%ld\n", data); + fprintf(stdout, "[VXDRV] ready-wait timed out: status=%ld\n", data); } break; } @@ -509,12 +510,6 @@ extern int vx_start(vx_device_h hdevice) { // start execution CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN)); -#ifdef SCOPE - sleep(15); - vx_scope_stop(device->fpga, 0); - exit(0); -#endif - return 0; } @@ -547,7 +542,7 @@ extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* valu // Ensure ready for new command if (vx_ready_wait(hdevice, -1) != 0) - return -1; + return -1; // write CSR value CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id)); diff --git a/driver/opae/vx_scope.cpp b/driver/opae/vx_scope.cpp index 41709e37..55cce38f 100644 --- a/driver/opae/vx_scope.cpp +++ b/driver/opae/vx_scope.cpp @@ -4,6 +4,9 @@ #include #include 
#include +#include +#include +#include #ifdef USE_VLSIM #include "vlsim/fpga.h" @@ -14,6 +17,9 @@ #include #include "vx_scope.h" #include "vortex_afu.h" +#include "scope-defs.h" + +#define SCOPE_FRAME_WIDTH 1768 #define CHECK_RES(_expr) \ do { \ @@ -28,140 +34,72 @@ #define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4) #define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4) -struct scope_signal_t { - int width; - const char* name; -}; +#define CMD_GET_VALID 0 +#define CMD_GET_DATA 1 +#define CMD_GET_WIDTH 2 +#define CMD_GET_COUNT 3 +#define CMD_SET_DELAY 4 +#define CMD_SET_STOP 5 +#define CMD_GET_OFFSET 6 -constexpr int ilog2(int n) { - return (n > 1) ? 1 + ilog2(n >> 1) : 0; -} +static constexpr int num_modules = sizeof(scope_modules) / sizeof(scope_module_t); -static constexpr int NW_BITS = ilog2(NUM_WARPS); - -#ifdef EXT_F_ENABLE -static constexpr int NR_BITS = ilog2(64); -#else -static constexpr int NR_BITS = ilog2(32); -#endif - -static constexpr int EX_BITS = 3; -static constexpr int OP_BITS = 4; -static constexpr int MOD_BITS = 3; - -static constexpr int ICORE_TAG_WIDTH = NW_BITS; -static constexpr int DCORE_TAG_WIDTH = ilog2(LSUQ_SIZE); - -static constexpr scope_signal_t scope_signals[] = { - - { 32, "dram_req_addr" }, - { 1, "dram_req_rw" }, - { 16, "dram_req_byteen" }, - { 128, "dram_req_data" }, - { 29, "dram_req_tag" }, - { 128, "dram_rsp_data" }, - { 29, "dram_rsp_tag" }, - - { 32, "snp_req_addr" }, - { 1, "snp_req_invalidate" }, - { 16, "snp_req_tag" }, - { 16, "snp_rsp_tag" }, - - { NW_BITS, "icache_req_wid" }, - { 32, "icache_req_addr" }, - { ICORE_TAG_WIDTH, "icache_req_tag" }, - { 32, "icache_rsp_data" }, - { ICORE_TAG_WIDTH, "icache_rsp_tag" }, - - { NW_BITS, "dcache_req_wid" }, - { 32, "dcache_req_pc" }, - { NUM_THREADS * 32, "dcache_req_addr" }, - { 1, "dcache_req_rw" }, - { NUM_THREADS * 4, "dcache_req_byteen" }, - { NUM_THREADS * 32, "dcache_req_data" }, - { DCORE_TAG_WIDTH, "dcache_req_tag" }, - { NUM_THREADS * 32, "dcache_rsp_data" }, - 
{ DCORE_TAG_WIDTH, "dcache_rsp_tag" }, - - { NW_BITS, "issue_wid" }, - { NUM_THREADS, "issue_tmask" }, - { 32, "issue_pc" }, - { EX_BITS, "issue_ex_type" }, - { OP_BITS, "issue_op_type" }, - { MOD_BITS, "issue_op_mod" }, - { 1, "issue_wb" }, - { NR_BITS, "issue_rd" }, - { NR_BITS, "issue_rs1" }, - { NR_BITS, "issue_rs2" }, - { NR_BITS, "issue_rs3" }, - { 32, "issue_imm" }, - { 1, "issue_rs1_is_pc" }, - { 1, "issue_rs2_is_imm" }, - - { NW_BITS, "gpr_rsp_wid" }, - { 32, "gpr_rsp_pc" }, - { NUM_THREADS * 32, "gpr_rsp_a" }, - { NUM_THREADS * 32, "gpr_rsp_b" }, - { NUM_THREADS * 32, "gpr_rsp_c" }, - - { NW_BITS, "writeback_wid" }, - { 32, "writeback_pc" }, - { NR_BITS, "writeback_rd" }, - { NUM_THREADS * 32, "writeback_data" }, - - { 32, "bank_addr_st0" }, - { 32, "bank_addr_st1" }, - { 32, "bank_addr_st2" }, - { 1, "scope_bank_is_mrvq_st1" }, - { 1, "scope_bank_miss_st1" }, - { 1, "scope_bank_dirty_st1" }, - { 1, "scope_bank_force_miss_st1" }, - - /////////////////////////////////////////////////////////////////////////// - - { 1, "dram_req_valid" }, - { 1, "dram_req_ready" }, - { 1, "dram_rsp_valid" }, - { 1, "dram_rsp_ready" }, - - { 1, "snp_req_valid" }, - { 1, "snp_req_ready" }, - { 1, "snp_rsp_valid" }, - { 1, "snp_rsp_ready" }, - - { 1, "icache_req_valid" }, - { 1, "icache_req_ready" }, - { 1, "icache_rsp_valid" }, - { 1, "icache_rsp_ready" }, - - { NUM_THREADS, "dcache_req_valid" }, - { 1, "dcache_req_ready" }, - { NUM_THREADS, "dcache_rsp_valid" }, - { 1, "dcache_rsp_ready" }, - - { 1, "bank_valid_st0" }, - { 1, "bank_valid_st1" }, - { 1, "bank_valid_st2" }, - { 1, "bank_stall_pipe" }, - - { 1, "issue_valid" }, - { 1, "issue_ready" }, - { 1, "gpr_rsp_valid" }, - { 1, "writeback_valid" }, - { 1, "scoreboard_delay" }, - { 1, "gpr_delay" }, - { 1, "execute_delay" }, - { 1, "busy" }, -}; - -static constexpr int num_signals = sizeof(scope_signals) / sizeof(scope_signal_t); +static constexpr int num_taps = sizeof(scope_taps) / sizeof(scope_tap_t); constexpr int 
calcFrameWidth(int index = 0) { - return (index < num_signals) ? (scope_signals[index].width + calcFrameWidth(index + 1)) : 0; + return (index < num_taps) ? (scope_taps[index].width + calcFrameWidth(index + 1)) : 0; } static constexpr int fwidth = calcFrameWidth(); -static_assert(fwidth == 1766, "invalid size"); + +#ifdef HANG_TIMEOUT +static std::thread g_timeout_thread; +static std::mutex g_timeout_mutex; + +static void timeout_callback(fpga_handle fpga) { + std::this_thread::sleep_for(std::chrono::seconds{60}); + vx_scope_stop(fpga, HANG_TIMEOUT); + fpgaClose(fpga); + exit(0); +} +#endif + +uint64_t print_clock(std::ofstream& ofs, uint64_t delta, uint64_t timestamp) { + while (delta != 0) { + ofs << '#' << timestamp++ << std::endl; + ofs << "b0 0" << std::endl; + ofs << '#' << timestamp++ << std::endl; + ofs << "b1 0" << std::endl; + --delta; + } + return timestamp; +} + +void dump_taps(std::ofstream& ofs, int module) { + for (int i = 0; i < num_taps; ++i) { + auto& tap = scope_taps[i]; + if (tap.module != module) + continue; + ofs << "$var reg " << tap.width << " " << (i + 1) << " " << tap.name << " $end" << std::endl; + } +} + +void dump_module(std::ofstream& ofs, int parent) { + for (auto& module : scope_modules) { + if (module.parent != parent) + continue; + if (module.name[0] == '*') { + ofs << "$var reg 1 0 clk $end" << std::endl; + } else { + ofs << "$scope module " << module.name << " $end" << std::endl; + } + dump_module(ofs, module.index); + dump_taps(ofs, module.index); + if (module.name[0] != '*') { + ofs << "$upscope $end" << std::endl; + } + } +} int vx_scope_start(fpga_handle hfpga, uint64_t delay) { if (nullptr == hfpga) @@ -169,36 +107,55 @@ int vx_scope_start(fpga_handle hfpga, uint64_t delay) { if (delay != uint64_t(-1)) { // set start delay - uint64_t cmd_delay = ((delay << 3) | 4); + uint64_t cmd_delay = ((delay << 3) | CMD_SET_DELAY); CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_delay)); std::cout << "scope start delay: " << 
delay << std::endl; } +#ifdef HANG_TIMEOUT + g_timeout_thread = std::thread(timeout_callback, hfpga); + g_timeout_thread.detach(); +#endif + return 0; } int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { +#ifdef HANG_TIMEOUT + if (!g_timeout_mutex.try_lock()) + return 0; +#endif + if (nullptr == hfpga) return -1; if (delay != uint64_t(-1)) { // stop recording - uint64_t cmd_stop = ((delay << 3) | 5); + uint64_t cmd_stop = ((delay << 3) | CMD_SET_STOP); CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_stop)); std::cout << "scope stop delay: " << delay << std::endl; } std::ofstream ofs("vx_scope.vcd"); + ofs << "$version Generated by Vortex Scope $end" << std::endl; ofs << "$timescale 1 ns $end" << std::endl; - ofs << "$var reg 1 0 clk $end" << std::endl; - + ofs << "$scope module TOP $end" << std::endl; + + dump_module(ofs, -1); + dump_taps(ofs, -1); + ofs << "$upscope $end" << std::endl; ofs << "enddefinitions $end" << std::endl; - - uint64_t frame_width, max_frames, data_valid; + + uint64_t frame_width, max_frames, data_valid, offset, delta; + uint64_t timestamp = 0; + uint64_t frame_offset = 0; + uint64_t frame_no = 0; + int signal_id = 0; + int signal_offset = 0; // wait for recording to terminate - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID)); do { CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid)); if (data_valid) @@ -208,65 +165,50 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { std::cout << "scope trace dump begin..." 
<< std::endl; - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 2)); + // get frame width + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_WIDTH)); CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &frame_width)); - std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl; - - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 3)); - CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &max_frames)); - std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl; - - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 1)); + std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl; if (fwidth != (int)frame_width) { std::cerr << "invalid frame_width: expecting " << std::dec << fwidth << "!" << std::endl; std::abort(); } + + // get max frames + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_COUNT)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &max_frames)); + std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl; + + // get offset + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_OFFSET)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &offset)); + + // get data + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_DATA)); + + // print clock header + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta)); + timestamp = print_clock(ofs, offset + delta + 2, timestamp); + signal_id = num_taps; + std::vector signal_data(frame_width+1); - - uint64_t frame_offset = 0; - uint64_t frame_no = 0; - uint64_t timestamp = 0; - int signal_id = 0; - int signal_offset = 0; - - auto print_header = [&] () { - ofs << '#' << timestamp++ << std::endl; - ofs << "b0 0" << std::endl; - ofs << '#' << timestamp++ << std::endl; - ofs << "b1 0" << std::endl; - - uint64_t delta; - fpga_result res = fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta); - assert(res == FPGA_OK); - - while (delta != 0) { - ofs << '#' << timestamp++ << std::endl; - 
ofs << "b0 0" << std::endl; - ofs << '#' << timestamp++ << std::endl; - ofs << "b1 0" << std::endl; - --delta; - } - - signal_id = num_signals; - }; - - print_header(); do { if (frame_no == (max_frames-1)) { // verify last frame is valid - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID)); CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid)); assert(data_valid == 1); - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 1)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_DATA)); } + // read next data words uint64_t word; CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &word)); do { - int signal_width = scope_signals[signal_id-1].width; + int signal_width = scope_taps[signal_id-1].width; int word_offset = frame_offset % 64; signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0'; @@ -285,17 +227,26 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { assert(0 == signal_offset); frame_offset = 0; ++frame_no; - if (frame_no != max_frames) { - print_header(); - } + + if (frame_no != max_frames) { + // print clock header + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta)); + timestamp = print_clock(ofs, delta + 1, timestamp); + signal_id = num_taps; + if (0 == (frame_no % 100)) { + std::cout << "*** " << frame_no << " frames, timestamp=" << timestamp << std::endl; + } + } } + } while ((frame_offset % 64) != 0); + } while (frame_no != max_frames); std::cout << "scope trace dump done! 
- " << (timestamp/2) << " cycles" << std::endl; // verify data not valid - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID)); CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid)); assert(data_valid == 0); diff --git a/driver/opae/vx_scope.h b/driver/opae/vx_scope.h index f2d5518e..2bb09c4a 100644 --- a/driver/opae/vx_scope.h +++ b/driver/opae/vx_scope.h @@ -1,5 +1,7 @@ #pragma once +#define HANG_TIMEOUT 60 + int vx_scope_start(fpga_handle hfpga, uint64_t delay = -1); int vx_scope_stop(fpga_handle hfpga, uint64_t delay = -1); \ No newline at end of file diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 1f0c36cf..1a66b335 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -12,6 +12,8 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSRQ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE +DBG_PRINT_FLAGS += -DDBG_CORE_REQ_INFO +DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO @@ -53,7 +55,7 @@ VL_FLAGS += verilator.vlt # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --assert --trace $(DBG_FLAGS) + VL_FLAGS += -DVCD_OUTPUT --assert --trace-fst --trace-threads 1 $(DBG_FLAGS) CFLAGS += -DVCD_OUTPUT $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG diff --git a/driver/tests/basic/kernel.bin b/driver/tests/basic/kernel.bin index 87fd4c93..21381a65 100755 Binary files a/driver/tests/basic/kernel.bin and b/driver/tests/basic/kernel.bin differ diff --git a/driver/tests/basic/kernel.dump b/driver/tests/basic/kernel.dump index f5e2da2a..76eccc8c 100644 --- a/driver/tests/basic/kernel.dump +++ b/driver/tests/basic/kernel.dump @@ -74,7 +74,7 @@ Disassembly of section .text: 800000e0: 0005006b 0x5006b 800000e4: 00002197 auipc gp,0x2 800000e8: c8418193 addi gp,gp,-892 # 80001d68 <__global_pointer$> -800000ec: f14025f3 csrr a1,mhartid +800000ec: 022025f3 csrr 
a1,0x22 800000f0: 00a59593 slli a1,a1,0xa 800000f4: 02002673 csrr a2,0x20 800000f8: 00261613 slli a2,a2,0x2 @@ -122,7 +122,7 @@ Disassembly of section .text: 80000158: 00008067 ret 8000015c : -8000015c: f1402573 csrr a0,mhartid +8000015c: 02202573 csrr a0,0x22 80000160: 00008067 ret 80000164 : @@ -458,13 +458,12 @@ Disassembly of section .comment: Disassembly of section .riscv.attributes: 00000000 <.riscv.attributes>: - 0: 2041 jal 80 <_start-0x7fffff80> + 0: 2541 jal 680 <_start-0x7ffff980> 2: 0000 unimp 4: 7200 flw fs0,32(a2) 6: 7369 lui t1,0xffffa 8: 01007663 bgeu zero,a6,14 <_start-0x7fffffec> - c: 0016 c.slli zero,0x5 - e: 0000 unimp + c: 0000001b 0x1b 10: 1004 addi s1,sp,32 12: 7205 lui tp,0xfffe1 14: 3376 fld ft6,376(sp) @@ -473,4 +472,4 @@ Disassembly of section .riscv.attributes: 1a: 5f30 lw a2,120(a4) 1c: 326d jal fffff9c6 <__global_pointer$+0x7fffdc5e> 1e: 3070 fld fa2,224(s0) - ... + 20: 665f 7032 0030 0x307032665f diff --git a/driver/tests/basic/kernel.elf b/driver/tests/basic/kernel.elf index 90887d0e..31205987 100755 Binary files a/driver/tests/basic/kernel.elf and b/driver/tests/basic/kernel.elf differ diff --git a/driver/tests/dogfood/dogfood.cpp b/driver/tests/dogfood/dogfood.cpp index 3cbd5fea..5d2e6016 100644 --- a/driver/tests/dogfood/dogfood.cpp +++ b/driver/tests/dogfood/dogfood.cpp @@ -90,16 +90,20 @@ vx_buffer_h dst_buf = nullptr; static void show_usage() { std::cout << "Vortex Driver Test." 
<< std::endl; - std::cout << "Usage: [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl; + std::cout << "Usage: [-t:testid] [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "n:s:e:k:ch?")) != -1) { + while ((c = getopt(argc, argv, "n:t:s:e:k:ch?")) != -1) { switch (c) { case 'n': count = atoi(optarg); break; + case 't': + testid_s = atoi(optarg); + testid_e = atoi(optarg); + break; case 's': testid_s = atoi(optarg); break; diff --git a/hw/Makefile b/hw/Makefile index e4e79593..635b4d15 100644 --- a/hw/Makefile +++ b/hw/Makefile @@ -5,5 +5,5 @@ build_config: $(MAKE) -C simulate clean: - rm ./rtl/VX_user_config.vh ./VX_config.h + rm -f ./rtl/VX_user_config.vh ./VX_config.h $(MAKE) -C simulate clean \ No newline at end of file diff --git a/hw/opae/README b/hw/opae/README index 303f48df..e05b1df2 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -60,9 +60,9 @@ qsub-sim make ase # tests -./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n 256 +./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n16 ./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16 -./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n1 -s4 -e4 +./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n16 ./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd # modify "vsim_run.tcl" to dump VCD trace @@ -76,11 +76,12 @@ tar -zcvf output_files_1c.tar.gz `find ./build_fpga_1c -type f \( -iname \*.rpt # compress VCD trace tar -zcvf vortex.vcd.tar.gz ./build_ase_1c/work/vortex.vcd tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd -tar -zcvf trace.vcd.tar.gz trace.vcd +tar -zcvf trace.fst.tar.gz trace.fst run.log tar -zcvf run.log.tar.gz run.log tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd tar -zcvf vortex.vcd.tar.gz build_ase_1c/work/vortex.vcd tar -zcvf run.log.tar.gz build_ase_1c/work/run.log +tar -zcvf 
vx_scope.vcd.tar.gz vx_scope.vcd # decompress VCD trace tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz @@ -96,7 +97,7 @@ kill -9 # fixing device resource busy issue when deleting /build_ase_1c/ lsof +D build_ase_1c -# quick off cache synthesis +# quick off synthesis make -C pipeline clean && make -C pipeline > pipeline/build.log 2>&1 & make -C cache clean && make -C cache > cache/build.log 2>&1 & make -C core clean && make -C core > core/build.log 2>&1 & diff --git a/hw/opae/ccip_std_afu.sv b/hw/opae/ccip_std_afu.sv index f56bb80f..1590e82f 100644 --- a/hw/opae/ccip_std_afu.sv +++ b/hw/opae/ccip_std_afu.sv @@ -104,7 +104,7 @@ module ccip_std_afu #( vortex_afu #( .NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS) - ) vortex_afu_inst ( + ) afu ( .clk (clk), .reset (reset_T1), diff --git a/hw/opae/vortex_afu.qsf b/hw/opae/vortex_afu.qsf index 96b1c98b..c24f3549 100644 --- a/hw/opae/vortex_afu.qsf +++ b/hw/opae/vortex_afu.qsf @@ -1,9 +1,26 @@ # Analysis & Synthesis Assignments set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009 -# set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON +set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON set_global_assignment -name VERILOG_MACRO QUARTUS set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG set_global_assignment -name MESSAGE_DISABLE 16818 -set_global_assignment -name VERILOG_MACRO FPU_FAST \ No newline at end of file +set_global_assignment -name VERILOG_MACRO FPU_FAST + +set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 +set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 +set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)" +set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON +set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON +set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON +set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON 
+set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON +set_global_assignment -name POWER_USE_TA_VALUE 65 +set_global_assignment -name SEED 1 +set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON +set_global_assignment -name FITTER_EFFORT "STANDARD FIT" +set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" +set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED +set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM +set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" \ No newline at end of file diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 23ce2ea9..41110da1 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -74,111 +74,112 @@ localparam MMIO_CSR_ADDR = `AFU_IMAGE_MMIO_CSR_ADDR; localparam MMIO_CSR_DATA = `AFU_IMAGE_MMIO_CSR_DATA; localparam MMIO_CSR_READ = `AFU_IMAGE_MMIO_CSR_READ; -logic [127:0] afu_id = `AFU_ACCEL_UUID; +localparam CCI_RD_RQ_TAGW = $clog2(CCI_RD_WINDOW_SIZE); +localparam CCI_RD_RQ_DATAW = $bits(t_ccip_clData) + CCI_RD_RQ_TAGW; -typedef enum logic[3:0] { - STATE_IDLE, - STATE_READ, - STATE_WRITE, - STATE_START, - STATE_RUN, - STATE_CLFLUSH, - STATE_CSR_READ, - STATE_CSR_WRITE -} state_t; - -typedef logic [$clog2(CCI_RD_WINDOW_SIZE)-1:0] t_cci_rdq_tag; -typedef logic [$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:0] t_cci_rdq_data; - -state_t state; +localparam STATE_IDLE = 0; +localparam STATE_READ = 1; +localparam STATE_WRITE = 2; +localparam STATE_START = 3; +localparam STATE_RUN = 4; +localparam STATE_CLFLUSH = 5; +localparam STATE_CSR_READ = 6; +localparam STATE_CSR_WRITE = 7; +localparam STATE_MAX_VALUE = 8; +localparam STATE_WIDTH = $clog2(STATE_MAX_VALUE); `ifdef SCOPE -`SCOPE_SIGNALS_DECL +`SCOPE_DECL_SIGNALS `endif +wire [127:0] afu_id = `AFU_ACCEL_UUID; + +reg [STATE_WIDTH-1:0] state; + // Vortex ports /////////////////////////////////////////////////////////////// -logic vx_dram_req_valid; -logic vx_dram_req_rw; -logic 
[`VX_DRAM_BYTEEN_WIDTH-1:0] vx_dram_req_byteen; -logic [`VX_DRAM_ADDR_WIDTH-1:0] vx_dram_req_addr; -logic [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_req_data; -logic [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_req_tag; -logic vx_dram_req_ready; +wire vx_dram_req_valid; +wire vx_dram_req_rw; +wire [`VX_DRAM_BYTEEN_WIDTH-1:0] vx_dram_req_byteen; +wire [`VX_DRAM_ADDR_WIDTH-1:0] vx_dram_req_addr; +wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_req_data; +wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_req_tag; +wire vx_dram_req_ready; -logic vx_dram_rsp_valid; -logic [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data; -logic [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag; -logic vx_dram_rsp_ready; +wire vx_dram_rsp_valid; +wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data; +wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag; +wire vx_dram_rsp_ready; -logic vx_snp_req_valid; -logic [`VX_DRAM_ADDR_WIDTH-1:0] vx_snp_req_addr; -logic vx_snp_req_invalidate = 0; -logic [`VX_SNP_TAG_WIDTH-1:0] vx_snp_req_tag; -logic vx_snp_req_ready; +reg vx_snp_req_valid; +reg [`VX_DRAM_ADDR_WIDTH-1:0] vx_snp_req_addr; +wire vx_snp_req_invalidate = 0; +reg [`VX_SNP_TAG_WIDTH-1:0] vx_snp_req_tag; +wire vx_snp_req_ready; -logic vx_snp_rsp_valid; +reg vx_snp_rsp_valid; `DEBUG_BEGIN -logic [`VX_SNP_TAG_WIDTH-1:0] vx_snp_rsp_tag; +reg [`VX_SNP_TAG_WIDTH-1:0] vx_snp_rsp_tag; `DEBUG_END -logic vx_snp_rsp_ready; +reg vx_snp_rsp_ready; -logic vx_csr_io_req_valid; -logic [`VX_CSR_ID_WIDTH-1:0] vx_csr_io_req_coreid; -logic [11:0] vx_csr_io_req_addr; -logic vx_csr_io_req_rw; -logic [31:0] vx_csr_io_req_data; -logic vx_csr_io_req_ready; +wire vx_csr_io_req_valid; +wire [`VX_CSR_ID_WIDTH-1:0] vx_csr_io_req_coreid; +wire [11:0] vx_csr_io_req_addr; +wire vx_csr_io_req_rw; +wire [31:0] vx_csr_io_req_data; +wire vx_csr_io_req_ready; -logic vx_csr_io_rsp_valid; -logic [31:0] vx_csr_io_rsp_data; -logic vx_csr_io_rsp_ready; +wire vx_csr_io_rsp_valid; +wire [31:0] vx_csr_io_rsp_data; +wire vx_csr_io_rsp_ready; -logic vx_reset; -logic vx_busy; +reg vx_reset; +wire 
vx_busy; // AVS Queues ///////////////////////////////////////////////////////////////// -logic avs_rtq_push; -logic avs_rtq_pop; +wire avs_rtq_push; +wire avs_rtq_pop; `DEBUG_BEGIN -logic avs_rtq_empty; -logic avs_rtq_full; +wire avs_rtq_empty; +wire avs_rtq_full; `DEBUG_BEGIN -logic avs_rdq_push; -logic avs_rdq_pop; +wire avs_rdq_push; +wire avs_rdq_pop; t_local_mem_data avs_rdq_dout; -logic avs_rdq_empty; +wire avs_rdq_empty; `DEBUG_BEGIN -logic avs_rdq_full; +wire avs_rdq_full; `DEBUG_END // CMD variables ////////////////////////////////////////////////////////////// t_ccip_clAddr cmd_io_addr; -logic[DRAM_ADDR_WIDTH-1:0] cmd_mem_addr; -logic[DRAM_ADDR_WIDTH-1:0] cmd_data_size; +reg [DRAM_ADDR_WIDTH-1:0] cmd_mem_addr; +reg [DRAM_ADDR_WIDTH-1:0] cmd_data_size; `ifdef SCOPE -logic [63:0] cmd_scope_rdata; -logic [63:0] cmd_scope_wdata; -logic cmd_scope_read; -logic cmd_scope_write; +wire [63:0] cmd_scope_rdata; +wire [63:0] cmd_scope_wdata; +wire cmd_scope_read; +wire cmd_scope_write; `endif -logic [`VX_CSR_ID_WIDTH-1:0] cmd_csr_core; -logic [11:0] cmd_csr_addr; -logic [31:0] cmd_csr_rdata; -logic [31:0] cmd_csr_wdata; +reg [`VX_CSR_ID_WIDTH-1:0] cmd_csr_core; +reg [11:0] cmd_csr_addr; +reg [31:0] cmd_csr_rdata; +reg [31:0] cmd_csr_wdata; // MMIO controller //////////////////////////////////////////////////////////// `IGNORE_WARNINGS_BEGIN -t_ccip_c0_ReqMmioHdr mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr); +t_ccip_c0_ReqMmioHdr mmio_hdr; `IGNORE_WARNINGS_END +assign mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr); -`STATIC_ASSERT(($bits(t_ccip_c0_ReqMmioHdr)-$bits(mmio_hdr.address)) == 12, "Oops!") +`STATIC_ASSERT(($bits(t_ccip_c0_ReqMmioHdr)-$bits(mmio_hdr.address)) == 12, ("Oops!")) t_if_ccip_c2_Tx mmio_tx; assign af2cp_sTxPort.c2 = mmio_tx; @@ -192,6 +193,10 @@ assign cmd_scope_write = cp2af_sRxPort.c0.mmioWrValid && (MMIO_SCOPE_WRITE == mm `DEBUG_BEGIN wire cp2af_sRxPort_c0_mmioWrValid = cp2af_sRxPort.c0.mmioWrValid; wire 
cp2af_sRxPort_c0_mmioRdValid = cp2af_sRxPort.c0.mmioRdValid; +wire cp2af_sRxPort_c0_rspValid = cp2af_sRxPort.c0.rspValid; +wire cp2af_sRxPort_c1_rspValid = cp2af_sRxPort.c1.rspValid; +wire cp2af_sRxPort_c0TxAlmFull = cp2af_sRxPort.c0TxAlmFull; +wire cp2af_sRxPort_c1TxAlmFull = cp2af_sRxPort.c1TxAlmFull; wire[$bits(mmio_hdr.address)-1:0] mmio_hdr_address = mmio_hdr.address; wire[$bits(mmio_hdr.length)-1:0] mmio_hdr_length = mmio_hdr.length; wire[$bits(mmio_hdr.tid)-1:0] mmio_hdr_tid = mmio_hdr.tid; @@ -200,75 +205,93 @@ wire[$bits(cp2af_sRxPort.c0.hdr.mdata)-1:0] cp2af_sRxPort_c0_hdr_mdata = cp2af_s wire [2:0] cmd_type = (cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_hdr.address)) ? 3'(cp2af_sRxPort.c0.data) : 3'h0; -always_ff @(posedge clk) -begin +`ifdef SCOPE +reg scope_start; +`endif + +// disable assertions until reset +`ifndef VERILATOR +initial begin + $assertoff; +end +`endif + +always @(posedge clk) begin if (reset) begin + `ifndef VERILATOR + $asserton; // enable assertions + `endif + mmio_tx.hdr <= 0; mmio_tx.data <= 0; mmio_tx.mmioRdValid <= 0; cmd_io_addr <= 0; cmd_mem_addr <= 0; cmd_data_size <= 0; + `ifdef SCOPE + scope_start <= 0; + `endif end else begin - mmio_tx.mmioRdValid <= 0; - // serve MMIO write request if (cp2af_sRxPort.c0.mmioWrValid) begin + `ifdef SCOPE + scope_start <= 1; + `endif case (mmio_hdr.address) MMIO_IO_ADDR: begin cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_IO_ADDR: 0x%0h", $time, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_IO_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); `endif end MMIO_MEM_ADDR: begin cmd_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_MEM_ADDR: 0x%0h", $time, t_local_mem_addr'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, t_local_mem_addr'(cp2af_sRxPort.c0.data)); `endif end 
MMIO_DATA_SIZE: begin cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_DATA_SIZE: %0d", $time, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_DATA_SIZE: addr=%0h, data=%0d", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data)); `endif end MMIO_CMD_TYPE: begin `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_CMD_TYPE: %0d", $time, $bits(cmd_type)'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_CMD_TYPE: addr=%0h, data=%0d", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data)); `endif end `ifdef SCOPE MMIO_SCOPE_WRITE: begin `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_SCOPE_WRITE: %0h", $time, 64'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_SCOPE_WRITE: addr=%0h, data=%0h", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)); `endif end `endif MMIO_CSR_CORE: begin cmd_csr_core <= $bits(cmd_csr_core)'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_CSR_CORE: %0h", $time, $bits(cmd_csr_core)'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_CSR_CORE: addr=%0h, %0h", $time, mmio_hdr.address, $bits(cmd_csr_core)'(cp2af_sRxPort.c0.data)); `endif end MMIO_CSR_ADDR: begin cmd_csr_addr <= $bits(cmd_csr_addr)'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_CSR_ADDR: %0h", $time, $bits(cmd_csr_addr)'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_CSR_ADDR: addr=%0h, %0h", $time, mmio_hdr.address, $bits(cmd_csr_addr)'(cp2af_sRxPort.c0.data)); `endif end MMIO_CSR_DATA: begin cmd_csr_wdata <= $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_CSR_DATA: %0h", $time, $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_CSR_DATA: addr=%0h, %0h", $time, mmio_hdr.address, $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data)); `endif end default: begin `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_WR: addr=%0h, data=%0h", $time, mmio_hdr.address, $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data)); + $display("%t: 
Unknown MMIO Wr: addr=%0h, data=%0h", $time, mmio_hdr.address, $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data)); `endif end endcase @@ -296,28 +319,29 @@ begin MMIO_STATUS: begin mmio_tx.data <= 64'(state); `ifdef DBG_PRINT_OPAE - if (state != state_t'(mmio_tx.data)) begin - $display("%t: MMIO_STATUS: state=%0d", $time, state); + if (state != STATE_WIDTH'(mmio_tx.data)) begin + $display("%t: MMIO_STATUS: addr=%0h, state=%0d", $time, mmio_hdr.address, state); end `endif end MMIO_CSR_READ: begin mmio_tx.data <= 64'(cmd_csr_rdata); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_CSR_READ: data=%0h", $time, cmd_csr_rdata); + $display("%t: MMIO_CSR_READ: addr=%0h, data=%0h", $time, mmio_hdr.address, cmd_csr_rdata); `endif end `ifdef SCOPE MMIO_SCOPE_READ: begin mmio_tx.data <= cmd_scope_rdata; `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_SCOPE_READ: data=%0h", $time, cmd_scope_rdata); + $display("%t: MMIO_SCOPE_READ: addr=%0h, data=%0h", $time, mmio_hdr.address, cmd_scope_rdata); `endif end `endif default: begin - `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_RD: addr=%0h", $time, mmio_hdr.address); + mmio_tx.data <= 64'h0; + `ifdef DBG_PRINT_OPAE + $display("%t: Unknown MMIO Rd: addr=%0h", $time, mmio_hdr.address); `endif end endcase @@ -328,14 +352,13 @@ end // COMMAND FSM //////////////////////////////////////////////////////////////// -logic cmd_read_done; -logic cmd_write_done; -logic cmd_clflush_done; -logic cmd_csr_done; -logic cmd_run_done; +wire cmd_read_done; +wire cmd_write_done; +wire cmd_clflush_done; +wire cmd_csr_done; +wire cmd_run_done; -always_ff @(posedge clk) -begin +always @(posedge clk) begin if (reset) begin state <= STATE_IDLE; vx_reset <= 0; @@ -458,27 +481,28 @@ end // AVS Controller ///////////////////////////////////////////////////////////// -logic vortex_enabled; -logic cci_rdq_empty; -t_cci_rdq_data cci_rdq_dout; +wire vortex_enabled; +wire cci_rdq_empty; +wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_dout; -logic cci_dram_rd_req_fire; -logic 
cci_dram_wr_req_fire; -logic vx_dram_rd_req_fire; +wire cci_dram_rd_req_fire; +wire cci_dram_wr_req_fire; +wire vx_dram_rd_req_fire; `DEBUG_BEGIN -logic vx_dram_wr_req_fire; +wire vx_dram_wr_req_fire; `DEBUG_END -logic vx_dram_rd_rsp_fire; +wire vx_dram_rd_rsp_fire; t_local_mem_byte_mask vx_dram_req_byteen_; -logic [$clog2(AVS_RD_QUEUE_SIZE+1)-1:0] avs_pending_reads, avs_pending_reads_next; -logic [DRAM_LINE_LW-1:0] vx_dram_req_offset, vx_dram_rsp_offset; -logic [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr, cci_dram_wr_req_addr; +reg [$clog2(AVS_RD_QUEUE_SIZE+1)-1:0] avs_pending_reads; +wire [$clog2(AVS_RD_QUEUE_SIZE+1)-1:0] avs_pending_reads_next; +wire [DRAM_LINE_LW-1:0] vx_dram_req_offset, vx_dram_rsp_offset; +reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr, cci_dram_wr_req_addr; -logic cci_dram_rd_req_enable, cci_dram_wr_req_enable; -logic vx_dram_req_enable, vx_dram_rd_req_enable, vx_dram_wr_req_enable; +wire cci_dram_rd_req_enable, cci_dram_wr_req_enable; +wire vx_dram_req_enable, vx_dram_rd_req_enable, vx_dram_wr_req_enable; -logic [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_ctr, cci_dram_wr_req_ctr; +reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_ctr, cci_dram_wr_req_ctr; assign vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state); @@ -503,8 +527,8 @@ assign vx_dram_wr_req_fire = vx_dram_wr_req_enable && !avs_waitrequest; assign vx_dram_rd_rsp_fire = vx_dram_rsp_valid && vx_dram_rsp_ready; assign avs_pending_reads_next = avs_pending_reads - + (((cci_dram_rd_req_fire || vx_dram_rd_req_fire) && !avs_rdq_pop) ? 1 : - (~(cci_dram_rd_req_fire || vx_dram_rd_req_fire) && avs_rdq_pop) ? -1 : 0); + + $bits(avs_pending_reads)'(((cci_dram_rd_req_fire || vx_dram_rd_req_fire) && !avs_rdq_pop) ? 1 : + (~(cci_dram_rd_req_fire || vx_dram_rd_req_fire) && avs_rdq_pop) ? 
-1 : 0); if (`VX_DRAM_LINE_WIDTH != DRAM_LINE_WIDTH) begin assign vx_dram_req_offset = ((DRAM_LINE_LW)'(vx_dram_req_addr[(DRAM_LINE_LW-VX_DRAM_LINE_LW)-1:0])) << VX_DRAM_LINE_LW; @@ -514,11 +538,10 @@ end else begin assign vx_dram_req_byteen_ = vx_dram_req_byteen; end -always_comb -begin +always @(*) begin case (state) CMD_MEM_READ: avs_address = cci_dram_rd_req_addr; - CMD_MEM_WRITE: avs_address = cci_dram_wr_req_addr + ((DRAM_ADDR_WIDTH)'(t_cci_rdq_tag'(cci_rdq_dout))); + CMD_MEM_WRITE: avs_address = cci_dram_wr_req_addr + (DRAM_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout))); default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH]; endcase @@ -529,8 +552,8 @@ begin endcase case (state) - CMD_MEM_WRITE: avs_writedata = cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)]; - default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset; + CMD_MEM_WRITE: avs_writedata = cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW]; + default: avs_writedata = DRAM_LINE_WIDTH'(vx_dram_req_data) << vx_dram_req_offset; endcase end @@ -539,8 +562,7 @@ assign avs_write = cci_dram_wr_req_enable || vx_dram_wr_req_enable; assign cmd_write_done = (cci_dram_wr_req_ctr >= cmd_data_size); -always_ff @(posedge clk) -begin +always @(posedge clk) begin if (reset) begin mem_bank_select <= 0; @@ -565,16 +587,16 @@ begin end if (cci_dram_rd_req_fire) begin - cci_dram_rd_req_addr <= cci_dram_rd_req_addr + 1; - cci_dram_rd_req_ctr <= cci_dram_rd_req_ctr - 1; + cci_dram_rd_req_addr <= cci_dram_rd_req_addr + DRAM_ADDR_WIDTH'(1); + cci_dram_rd_req_ctr <= cci_dram_rd_req_ctr - DRAM_ADDR_WIDTH'(1); `ifdef DBG_PRINT_OPAE $display("%t: AVS Rd Req: addr=%0h, rem=%0d, pending=%0d", $time, `DRAM_TO_BYTE_ADDR(avs_address), (cci_dram_rd_req_ctr - 1), avs_pending_reads_next); `endif end if (cci_dram_wr_req_fire) begin - cci_dram_wr_req_addr <= cci_dram_wr_req_addr + ((t_cci_rdq_tag'(cci_dram_wr_req_ctr) == 
$bits(t_cci_rdq_tag)'(CCI_RD_WINDOW_SIZE-1)) ? DRAM_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : DRAM_ADDR_WIDTH'(0)); - cci_dram_wr_req_ctr <= cci_dram_wr_req_ctr + 1; + cci_dram_wr_req_addr <= cci_dram_wr_req_addr + ((CCI_RD_RQ_TAGW'(cci_dram_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? DRAM_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : DRAM_ADDR_WIDTH'(0)); + cci_dram_wr_req_ctr <= cci_dram_wr_req_ctr + DRAM_ADDR_WIDTH'(1); `ifdef DBG_PRINT_OPAE $display("%t: AVS Wr Req: addr=%0h, data=%0h, rem=%0d", $time, `DRAM_TO_BYTE_ADDR(avs_address), avs_writedata, (cci_dram_wr_req_ctr + 1)); `endif @@ -633,7 +655,7 @@ VX_generic_queue #( // AVS data read response queue /////////////////////////////////////////////// -logic cci_wr_req_fire; +wire cci_wr_req_fire; assign avs_rdq_push = avs_readdatavalid; assign avs_rdq_pop = vx_dram_rd_rsp_fire || cci_wr_req_fire; @@ -655,41 +677,46 @@ VX_generic_queue #( // CCI-P Read Request /////////////////////////////////////////////////////////// -logic [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads, cci_pending_reads_next; -logic [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr, cci_rd_req_ctr_next; +reg [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads; +wire [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads_next; +reg [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr; +wire [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr_next; +wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag, cci_rd_rsp_tag; +reg [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_ctr; t_ccip_clAddr cci_rd_req_addr; -t_cci_rdq_tag cci_rd_rsp_ctr; -logic cci_rd_req_fire, cci_rd_rsp_fire; -logic cci_rd_req_enable, cci_rd_req_wait; +wire cci_rd_req_fire, cci_rd_rsp_fire; +reg cci_rd_req_enable, cci_rd_req_wait; -logic cci_rdq_push, cci_rdq_pop; -t_cci_rdq_data cci_rdq_din; +wire cci_rdq_push, cci_rdq_pop; +wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_din; -always_comb begin +always @(*) begin af2cp_sTxPort.c0.hdr = t_ccip_c0_ReqMemHdr'(0); af2cp_sTxPort.c0.hdr.address = cci_rd_req_addr; - af2cp_sTxPort.c0.hdr.mdata = 
t_ccip_mdata'(t_cci_rdq_tag'(cci_rd_req_ctr)); + af2cp_sTxPort.c0.hdr.mdata = t_ccip_mdata'(cci_rd_req_tag); end assign cci_rd_req_fire = af2cp_sTxPort.c0.valid && !cp2af_sRxPort.c0TxAlmFull; assign cci_rd_rsp_fire = (STATE_WRITE == state) && cp2af_sRxPort.c0.rspValid; -assign cci_rd_req_ctr_next = cci_rd_req_ctr + (cci_rd_req_fire ? 1 : 0); +assign cci_rd_req_tag = CCI_RD_RQ_TAGW'(cci_rd_req_ctr); +assign cci_rd_rsp_tag = CCI_RD_RQ_TAGW'(cp2af_sRxPort.c0.hdr.mdata); + +assign cci_rd_req_ctr_next = cci_rd_req_ctr + DRAM_ADDR_WIDTH'(cci_rd_req_fire ? 1 : 0); assign cci_rdq_pop = cci_dram_wr_req_fire; assign cci_rdq_push = cci_rd_rsp_fire; -assign cci_rdq_din = {cp2af_sRxPort.c0.data, t_cci_rdq_tag'(cp2af_sRxPort.c0.hdr.mdata)}; +assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_rd_rsp_tag}; assign cci_pending_reads_next = cci_pending_reads - + ((cci_rd_req_fire && !cci_rdq_pop) ? 1 : - (!cci_rd_req_fire && cci_rdq_pop) ? -1 : 0); + + $bits(cci_pending_reads)'((cci_rd_req_fire && !cci_rdq_pop) ? 1 : + (!cci_rd_req_fire && cci_rdq_pop) ? 
-1 : 0); assign af2cp_sTxPort.c0.valid = cci_rd_req_enable && !cci_rd_req_wait; // Send read requests to CCI -always_ff @(posedge clk) -begin +always @(posedge clk) begin if (reset) begin cci_rd_req_addr <= 0; cci_rd_req_ctr <= 0; @@ -717,21 +744,21 @@ begin if (cci_rd_req_fire) begin cci_rd_req_addr <= cci_rd_req_addr + 1; cci_rd_req_ctr <= cci_rd_req_ctr_next; - if (t_cci_rdq_tag'(cci_rd_req_ctr) == $bits(t_cci_rdq_tag)'(CCI_RD_WINDOW_SIZE-1)) begin - cci_rd_req_wait <= 1; // end current request batch + if (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin + cci_rd_req_wait <= 1; // end current request batch end `ifdef DBG_PRINT_OPAE - $display("%t: CCI Rd Req: addr=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, (cmd_data_size - cci_rd_req_ctr_next), cci_pending_reads_next); + $display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr_next), cci_pending_reads_next); `endif end if (cci_rd_rsp_fire) begin - cci_rd_rsp_ctr <= cci_rd_rsp_ctr + 1; - if (cci_rd_rsp_ctr == $bits(t_cci_rdq_tag)'(CCI_RD_WINDOW_SIZE-1)) begin - cci_rd_req_wait <= 0; // restart new request batch + cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1); + if (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin + cci_rd_req_wait <= 0; // restart new request batch end `ifdef DBG_PRINT_OPAE - $display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d", $time, t_cci_rdq_tag'(cp2af_sRxPort.c0.hdr.mdata), cci_rd_rsp_ctr); + $display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr); `endif end @@ -742,12 +769,11 @@ begin end cci_pending_reads <= cci_pending_reads_next; - end end VX_generic_queue #( - .DATAW($bits(t_ccip_clData) + $bits(t_cci_rdq_tag)), + .DATAW(CCI_RD_RQ_DATAW), .SIZE(CCI_RD_QUEUE_SIZE) ) cci_rd_req_queue ( .clk (clk), @@ -761,14 +787,36 @@ VX_generic_queue #( `UNUSED_PIN (size) ); +`ifdef VERILATOR +`DEBUG_BLOCK( + reg [CCI_RD_WINDOW_SIZE-1:0] dbg_cci_rd_rsp_mask; 
+ always @(posedge clk) begin + if (reset) begin + dbg_cci_rd_rsp_mask <= 0; + end else begin + if (cci_rd_rsp_fire) begin + if (cci_rd_rsp_ctr == 0) begin + dbg_cci_rd_rsp_mask <= (CCI_RD_WINDOW_SIZE'(1) << cci_rd_rsp_tag); + end else begin + assert(!dbg_cci_rd_rsp_mask[cci_rd_rsp_tag]); + dbg_cci_rd_rsp_mask[cci_rd_rsp_tag] <= 1; + end + end + end + end +) +`endif + // CCI-P Write Request ////////////////////////////////////////////////////////// -logic [$clog2(CCI_RW_QUEUE_SIZE+1)-1:0] cci_pending_writes, cci_pending_writes_next; -logic [DRAM_ADDR_WIDTH-1:0] cci_wr_req_ctr; +reg [$clog2(CCI_RW_QUEUE_SIZE+1)-1:0] cci_pending_writes; +wire [$clog2(CCI_RW_QUEUE_SIZE+1)-1:0] cci_pending_writes_next; +reg [DRAM_ADDR_WIDTH-1:0] cci_wr_req_ctr; t_ccip_clAddr cci_wr_req_addr; -logic cci_wr_req_enable, cci_wr_rsp_fire; +reg cci_wr_req_enable; +wire cci_wr_rsp_fire; -always_comb begin +always @(*) begin af2cp_sTxPort.c1.hdr = t_ccip_c1_ReqMemHdr'(0); af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr; af2cp_sTxPort.c1.hdr.sop = 1; // single line write mode @@ -779,15 +827,15 @@ assign cci_wr_req_fire = af2cp_sTxPort.c1.valid && !cp2af_sRxPort.c1TxAlmFull; assign cci_wr_rsp_fire = (STATE_READ == state) && cp2af_sRxPort.c1.rspValid; assign cci_pending_writes_next = cci_pending_writes - + ((cci_wr_req_fire && !cci_wr_rsp_fire) ? 1 : - (!cci_wr_req_fire && cci_wr_rsp_fire) ? -1 : 0); + + $bits(cci_pending_writes)'((cci_wr_req_fire && !cci_wr_rsp_fire) ? 1 : + (!cci_wr_req_fire && cci_wr_rsp_fire) ? 
-1 : 0); assign cmd_read_done = (0 == cci_wr_req_ctr) && (0 == cci_pending_writes); assign af2cp_sTxPort.c1.valid = cci_wr_req_enable && !avs_rdq_empty; // Send write requests to CCI -always_ff @(posedge clk) +always @(posedge clk) begin if (reset) begin cci_wr_req_addr <= 0; @@ -809,10 +857,10 @@ begin if (cci_wr_req_fire) begin assert(cci_wr_req_ctr != 0); - cci_wr_req_addr <= cci_wr_req_addr + 1; - cci_wr_req_ctr <= cci_wr_req_ctr - 1; + cci_wr_req_addr <= cci_wr_req_addr + t_ccip_clAddr'(1); + cci_wr_req_ctr <= cci_wr_req_ctr - DRAM_ADDR_WIDTH'(1); `ifdef DBG_PRINT_OPAE - $display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes_next); + $display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes_next, avs_rdq_dout); `endif end @@ -828,12 +876,12 @@ end // Vortex cache snooping ////////////////////////////////////////////////////// -logic [`VX_DRAM_ADDR_WIDTH-1:0] snp_req_size; -logic [`VX_DRAM_ADDR_WIDTH-1:0] snp_req_baseaddr; -logic [`VX_DRAM_ADDR_WIDTH-1:0] snp_req_ctr, snp_req_ctr_next; -logic [`VX_DRAM_ADDR_WIDTH-1:0] snp_rsp_ctr, snp_rsp_ctr_next; +wire [`VX_DRAM_ADDR_WIDTH-1:0] snp_req_size; +wire [`VX_DRAM_ADDR_WIDTH-1:0] snp_req_baseaddr; +reg [`VX_DRAM_ADDR_WIDTH-1:0] snp_req_ctr, snp_rsp_ctr; +wire [`VX_DRAM_ADDR_WIDTH-1:0] snp_req_ctr_next, snp_rsp_ctr_next; -logic vx_snp_req_fire, vx_snp_rsp_fire; +wire vx_snp_req_fire, vx_snp_rsp_fire; if (`VX_DRAM_LINE_WIDTH != DRAM_LINE_WIDTH) begin assign snp_req_baseaddr = {cmd_mem_addr, (`VX_DRAM_ADDR_WIDTH - DRAM_ADDR_WIDTH)'(0)}; @@ -846,13 +894,12 @@ end assign vx_snp_req_fire = vx_snp_req_valid && vx_snp_req_ready; assign vx_snp_rsp_fire = vx_snp_rsp_valid && vx_snp_rsp_ready; -assign snp_req_ctr_next = vx_snp_req_fire ? (snp_req_ctr + 1) : snp_req_ctr; -assign snp_rsp_ctr_next = vx_snp_rsp_fire ? 
(snp_rsp_ctr - 1) : snp_rsp_ctr; +assign snp_req_ctr_next = vx_snp_req_fire ? (snp_req_ctr + `VX_DRAM_ADDR_WIDTH'(1)) : snp_req_ctr; +assign snp_rsp_ctr_next = vx_snp_rsp_fire ? (snp_rsp_ctr - `VX_DRAM_ADDR_WIDTH'(1)) : snp_rsp_ctr; assign cmd_clflush_done = (0 == snp_rsp_ctr); -always_ff @(posedge clk) -begin +always @(posedge clk) begin if (reset) begin vx_snp_req_valid <= 0; vx_snp_req_addr <= 0; @@ -886,11 +933,11 @@ begin if (vx_snp_req_fire) begin assert(snp_req_ctr < snp_req_size); - vx_snp_req_addr <= vx_snp_req_addr + 1; + vx_snp_req_addr <= vx_snp_req_addr + `VX_DRAM_ADDR_WIDTH'(1); vx_snp_req_tag <= (`VX_SNP_TAG_WIDTH)'(snp_req_ctr_next); snp_req_ctr <= snp_req_ctr_next; `ifdef DBG_PRINT_OPAE - $display("%t: AFU Snp Req: addr=%0h, tag=%0d, rem=%0d", $time, `DRAM_TO_BYTE_ADDR(vx_snp_req_addr), (`VX_SNP_TAG_WIDTH)'(snp_req_ctr_next), (snp_req_size - snp_req_ctr_next)); + $display("%t: AFU Snp Req: addr=%0h, tag=%0d, rem=%0d", $time, `DRAM_TO_BYTE_ADDR(vx_snp_req_addr), (`VX_SNP_TAG_WIDTH)'(vx_snp_req_tag), (snp_req_size - snp_req_ctr_next)); `endif end @@ -907,7 +954,7 @@ end // CSRs/////////////////////////////////////////////////////////////////////// -logic csr_io_req_sent; +reg csr_io_req_sent; assign vx_csr_io_req_valid = !csr_io_req_sent && ((STATE_CSR_READ == state || STATE_CSR_WRITE == state)); @@ -920,8 +967,7 @@ assign vx_csr_io_rsp_ready = 1; assign cmd_csr_done = (STATE_CSR_WRITE == state) ? 
vx_csr_io_req_ready : vx_csr_io_rsp_valid; -always_ff @(posedge clk) -begin +always @(posedge clk) begin if (reset) begin csr_io_req_sent <= 0; cmd_csr_rdata <= 0; @@ -946,11 +992,7 @@ end assign cmd_run_done = !vx_busy; Vortex #() vortex ( - `SCOPE_SIGNALS_ISTAGE_BIND - `SCOPE_SIGNALS_LSU_BIND - `SCOPE_SIGNALS_CACHE_BIND - `SCOPE_SIGNALS_ISSUE_BIND - `SCOPE_SIGNALS_EXECUTE_BIND + `SCOPE_BIND_top_vortex .clk (clk), .reset (reset | vx_reset), @@ -989,10 +1031,10 @@ Vortex #() vortex ( `UNUSED_PIN (io_req_addr), `UNUSED_PIN (io_req_data), `UNUSED_PIN (io_req_tag), - .io_req_ready (1), + .io_req_ready (1'b1), // I/O response - .io_rsp_valid (0), + .io_rsp_valid (1'b0), .io_rsp_data (0), .io_rsp_tag (0), `UNUSED_PIN (io_rsp_ready), @@ -1026,10 +1068,7 @@ end `ifdef SCOPE -localparam SCOPE_DATAW = $bits({`SCOPE_SIGNALS_DATA_LIST `SCOPE_SIGNALS_UPD_LIST}); -localparam SCOPE_SR_DEPTH = 2; - -`STATIC_ASSERT(SCOPE_DATAW == 1766, "invalid size") +`SCOPE_ASSIGN (scope_reset, vx_reset); `SCOPE_ASSIGN (scope_dram_req_valid, vx_dram_req_valid); `SCOPE_ASSIGN (scope_dram_req_addr, {vx_dram_req_addr, 4'b0}); @@ -1060,57 +1099,20 @@ localparam SCOPE_SR_DEPTH = 2; `SCOPE_ASSIGN (scope_busy, vx_busy); -wire scope_changed = (scope_icache_req_valid && scope_icache_req_ready) - || (scope_icache_rsp_valid && scope_icache_rsp_ready) - || ((| scope_dcache_req_valid) && scope_dcache_req_ready) - || ((| scope_dcache_rsp_valid) && scope_dcache_rsp_ready) - || (scope_dram_req_valid && scope_dram_req_ready) - || (scope_dram_rsp_valid && scope_dram_rsp_ready) - || (scope_snp_req_valid && scope_snp_req_ready) - || (scope_snp_rsp_valid && scope_snp_rsp_ready) - || (scope_issue_valid && scope_issue_ready) - || scope_gpr_rsp_valid - || scope_bank_valid_st0 - || scope_bank_valid_st1 - || scope_bank_valid_st2 - || scope_bank_stall_pipe - || scope_scoreboard_delay - || scope_gpr_delay - || scope_execute_delay - || scope_busy; - -wire scope_start = vx_reset; - -wire [SCOPE_DATAW+1:0] 
scope_data_in_st[SCOPE_SR_DEPTH-1:0]; -wire [SCOPE_DATAW+1:0] scope_data_in_ste; -assign scope_data_in_st[0] = {`SCOPE_SIGNALS_DATA_LIST `SCOPE_SIGNALS_UPD_LIST, scope_changed, scope_start}; -assign scope_data_in_ste = scope_data_in_st[SCOPE_SR_DEPTH-1]; - -for (genvar i = 1; i < SCOPE_SR_DEPTH; i++) begin - VX_generic_register #( - .N (SCOPE_DATAW+2) - ) scope_sr ( - .clk (clk), - .reset (reset), - .stall (0), - .flush (0), - .in (scope_data_in_st[i-1]), - .out (scope_data_in_st[i]) - ); -end +wire scope_changed = `SCOPE_TRIGGER; VX_scope #( - .DATAW (SCOPE_DATAW), + .DATAW ($bits({`SCOPE_DATA_LIST,`SCOPE_UPDATE_LIST})), .BUSW (64), .SIZE (4096), - .UPDW ($bits({`SCOPE_SIGNALS_UPD_LIST})) + .UPDW ($bits({`SCOPE_UPDATE_LIST})) ) scope ( .clk (clk), .reset (reset), - .start (scope_data_in_ste[0]), - .stop (0), - .changed (scope_data_in_ste[1]), - .data_in (scope_data_in_ste[SCOPE_DATAW+1:2]), + .start (scope_start), + .stop (1'b0), + .changed (scope_changed), + .data_in ({`SCOPE_DATA_LIST,`SCOPE_UPDATE_LIST}), .bus_in (cmd_scope_wdata), .bus_out (cmd_scope_rdata), .bus_read (cmd_scope_read), diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 47505cbc..6c899096 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -3,11 +3,7 @@ module VX_cluster #( parameter CLUSTER_ID = 0 ) ( - `SCOPE_SIGNALS_ISTAGE_IO - `SCOPE_SIGNALS_LSU_IO - `SCOPE_SIGNALS_CACHE_IO - `SCOPE_SIGNALS_ISSUE_IO - `SCOPE_SIGNALS_EXECUTE_IO + `SCOPE_IO_VX_cluster // Clock input wire clk, @@ -138,11 +134,7 @@ module VX_cluster #( VX_core #( .CORE_ID(i + (CLUSTER_ID * `NUM_CORES)) ) core ( - `SCOPE_SIGNALS_ISTAGE_BIND - `SCOPE_SIGNALS_LSU_BIND - `SCOPE_SIGNALS_CACHE_BIND - `SCOPE_SIGNALS_ISSUE_BIND - `SCOPE_SIGNALS_EXECUTE_BIND + `SCOPE_BIND_VX_cluster_core(i) .clk (clk), .reset (reset), @@ -380,7 +372,7 @@ module VX_cluster #( .SNP_REQ_TAG_WIDTH (`L2SNP_TAG_WIDTH), .SNP_FWD_TAG_WIDTH (`DSNP_TAG_WIDTH) ) l2cache ( - `SCOPE_SIGNALS_CACHE_UNBIND + `SCOPE_BIND_VX_cluster_l2cache .clk 
(clk), .reset (reset), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 26470373..9f4ff5f7 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -59,8 +59,6 @@ `define EXT_F_ENABLE `endif -//`define FPU_FAST - // Device identification `define VENDOR_ID 0 `define ARCHITECTURE_ID 0 diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index f9582daf..aa1032a8 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -3,11 +3,7 @@ module VX_core #( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_ISTAGE_IO - `SCOPE_SIGNALS_LSU_IO - `SCOPE_SIGNALS_CACHE_IO - `SCOPE_SIGNALS_ISSUE_IO - `SCOPE_SIGNALS_EXECUTE_IO + `SCOPE_IO_VX_core // Clock input wire clk, @@ -179,10 +175,7 @@ module VX_core #( VX_pipeline #( .CORE_ID(CORE_ID) ) pipeline ( - `SCOPE_SIGNALS_ISTAGE_BIND - `SCOPE_SIGNALS_LSU_BIND - `SCOPE_SIGNALS_ISSUE_BIND - `SCOPE_SIGNALS_EXECUTE_BIND + `SCOPE_BIND_VX_core_pipeline .clk(clk), .reset(reset), @@ -258,7 +251,7 @@ module VX_core #( VX_mem_unit #( .CORE_ID(CORE_ID) ) mem_unit ( - `SCOPE_SIGNALS_CACHE_BIND + `SCOPE_BIND_VX_core_mem_unit .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 36ba7baf..40b76d83 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -7,7 +7,7 @@ module VX_csr_unit #( input wire reset, VX_cmt_to_csr_if cmt_to_csr_if, - VX_csr_to_issue_if csr_to_issue_if, + VX_csr_to_issue_if csr_to_issue_if, VX_csr_io_req_if csr_io_req_if, VX_csr_io_rsp_if csr_io_rsp_if, @@ -15,8 +15,8 @@ module VX_csr_unit #( VX_csr_req_if csr_req_if, VX_exu_to_cmt_if csr_commit_if ); - VX_csr_req_if csr_pipe_req_if(); - VX_exu_to_cmt_if csr_pipe_rsp_if(); + VX_csr_req_if csr_pipe_req_if(); + VX_exu_to_cmt_if csr_pipe_rsp_if(); wire select_io_req = csr_io_req_if.valid; wire select_io_rsp; diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 17c3d7dd..77a73b9f 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -347,7 +347,7 @@ module VX_decode #( assign decode_if.rd = rd; assign decode_if.rs1 = 
rs1_qual; assign decode_if.rs2 = rs2; - assign decode_if.rs3 = rs3; + assign decode_if.rs3 = 0; `endif assign decode_if.use_rs3 = use_rs3; diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index dcb7e8b7..9c8b19dd 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -6,11 +6,6 @@ /////////////////////////////////////////////////////////////////////////////// -// `define SYNTHESIS 1 -// `define ASIC 1 - -/////////////////////////////////////////////////////////////////////////////// - `define NW_BITS `LOG2UP(`NUM_WARPS) `define NT_BITS `LOG2UP(`NUM_THREADS) @@ -248,7 +243,7 @@ ////////////////////////// Dcache Configurable Knobs ////////////////////////// // Cache ID -`define DCACHE_ID (((`L3_ENABLE && `L2_ENABLE) ? 2 : `L2_ENABLE ? 1 : 0) + (CORE_ID * 3) + 0) +`define DCACHE_ID 32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0 // TAG sharing enable `define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) @@ -277,7 +272,7 @@ ////////////////////////// Icache Configurable Knobs ////////////////////////// // Cache ID -`define ICACHE_ID (((`L3_ENABLE && `L2_ENABLE) ? 2 : `L2_ENABLE ? 1 : 0) + (CORE_ID * 3) + 1) +`define ICACHE_ID 32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1 // Core request address bits `define ICORE_ADDR_WIDTH (32-`CLOG2(`IWORD_SIZE)) @@ -309,7 +304,7 @@ ////////////////////////// SM Configurable Knobs ////////////////////////////// // Cache ID -`define SCACHE_ID (((`L3_ENABLE && `L2_ENABLE) ? 2 : `L2_ENABLE ? 1 : 0) + (CORE_ID * 3) + 2) +`define SCACHE_ID 32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2 // Number of Word requests per cycle {1, 2, 4, 8, ...} `define SNUM_REQUESTS `NUM_THREADS @@ -326,7 +321,7 @@ ////////////////////////// L2cache Configurable Knobs ///////////////////////// // Cache ID -`define L2CACHE_ID (`L3_ENABLE ? 
1 : 0) +`define L2CACHE_ID 32'(`L3_ENABLE) + CLUSTER_ID // Core request tag bits `define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES)) diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index f542d3ce..9586bcd6 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -3,8 +3,7 @@ module VX_execute #( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_LSU_IO - `SCOPE_SIGNALS_EXECUTE_IO + `SCOPE_IO_VX_execute input wire clk, input wire reset, @@ -55,7 +54,7 @@ module VX_execute #( VX_lsu_unit #( .CORE_ID(CORE_ID) ) lsu_unit ( - `SCOPE_SIGNALS_LSU_BIND + `SCOPE_BIND_VX_execute_lsu_unit .clk (clk), .reset (reset), .dcache_req_if (dcache_req_if), @@ -122,6 +121,7 @@ module VX_execute #( VX_gpu_unit #( .CORE_ID(CORE_ID) ) gpu_unit ( + `SCOPE_BIND_VX_execute_gpu_unit .clk (clk), .reset (reset), .gpu_req_if (gpu_req_if), diff --git a/hw/rtl/VX_fetch.v b/hw/rtl/VX_fetch.v index cf0c2e45..fd3b1fdc 100644 --- a/hw/rtl/VX_fetch.v +++ b/hw/rtl/VX_fetch.v @@ -3,7 +3,7 @@ module VX_fetch #( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_ISTAGE_IO + `SCOPE_IO_VX_fetch input wire clk, input wire reset, @@ -29,6 +29,8 @@ module VX_fetch #( VX_warp_sched #( .CORE_ID(CORE_ID) ) warp_sched ( + `SCOPE_BIND_VX_fetch_warp_sched + .clk (clk), .reset (reset), .warp_ctl_if (warp_ctl_if), @@ -43,7 +45,7 @@ module VX_fetch #( VX_icache_stage #( .CORE_ID(CORE_ID) ) icache_stage ( - `SCOPE_SIGNALS_ISTAGE_BIND + `SCOPE_BIND_VX_fetch_icache_stage .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v deleted file mode 100644 index 1d7224ab..00000000 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ /dev/null @@ -1,74 +0,0 @@ -`include "VX_define.vh" - -// control module to support multi-cycle read for fp register - -module VX_gpr_fp_ctrl ( - input wire clk, - input wire reset, - - input wire [`NUM_THREADS-1:0][31:0] rs1_data, - input wire [`NUM_THREADS-1:0][31:0] rs2_data, - VX_gpr_req_if gpr_req_if, - - // outputs - output wire [`NW_BITS+`NR_BITS-1:0] raddr1, - 
VX_gpr_rsp_if gpr_rsp_if -); - - reg [`NUM_THREADS-1:0][31:0] rsp_rs1_data, rsp_rs2_data, rsp_rs3_data; - reg rsp_valid; - reg [31:0] rsp_pc; - reg [`NW_BITS-1:0] rsp_wid; - reg read_rs1; - - wire rs3_delay = gpr_req_if.valid && gpr_req_if.use_rs3 && read_rs1; - wire read_fire = gpr_req_if.valid && gpr_rsp_if.ready; - - always @(posedge clk) begin - if (reset) begin - rsp_valid <= 0; - rsp_pc <= 0; - rsp_rs1_data <= 0; - rsp_rs2_data <= 0; - rsp_rs3_data <= 0; - rsp_wid <= 0; - read_rs1 <= 1; - end else begin - if (rs3_delay) begin - read_rs1 <= 0; - rsp_wid <= gpr_req_if.wid; - end else if (read_fire) begin - read_rs1 <= 1; - end - - rsp_valid <= gpr_req_if.valid; - rsp_wid <= gpr_req_if.wid; - rsp_pc <= gpr_req_if.PC; - - if (read_rs1) begin - rsp_rs1_data <= rs1_data; - end - rsp_rs2_data <= rs2_data; - rsp_rs3_data <= rs1_data; - - assert(read_rs1 || rsp_wid == gpr_req_if.wid); - end - end - - always @(posedge clk) begin - - end - - // outputs - wire [`NR_BITS-1:0] rs1 = read_rs1 ? gpr_req_if.rs1 : gpr_req_if.rs3; - assign raddr1 = {gpr_req_if.wid, rs1}; - assign gpr_req_if.ready = ~rs3_delay; - - assign gpr_rsp_if.valid = rsp_valid; - assign gpr_rsp_if.wid = rsp_wid; - assign gpr_rsp_if.PC = rsp_pc; - assign gpr_rsp_if.rs1_data = rsp_rs1_data; - assign gpr_rsp_if.rs2_data = rsp_rs2_data; - assign gpr_rsp_if.rs3_data = rsp_rs3_data; - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_ram.v b/hw/rtl/VX_gpr_ram.v index 8f1b4483..05833b9d 100644 --- a/hw/rtl/VX_gpr_ram.v +++ b/hw/rtl/VX_gpr_ram.v @@ -10,136 +10,24 @@ module VX_gpr_ram ( output wire [`NUM_THREADS-1:0][31:0] rs1_data, output wire [`NUM_THREADS-1:0][31:0] rs2_data ); - `ifndef ASIC - - reg [`NUM_THREADS-1:0][3:0][7:0] ram [(`NUM_WARPS * `NUM_REGS)-1:0]; - - initial begin // initialize ram: set r0 = 0 - for (integer j = 0; j < `NUM_WARPS; j++) begin - for (integer i = 0; i < `NUM_REGS; i++) begin - ram[j * `NUM_REGS + i] = (i == 0) ? 
{`NUM_THREADS{32'h0}} : {`NUM_THREADS{32'hx}}; - end - end - end - - always @(posedge clk) begin - for (integer i = 0; i < `NUM_THREADS; i++) begin - if (we[i]) begin - ram[waddr][i][0] <= wdata[i][07:00]; - ram[waddr][i][1] <= wdata[i][15:08]; - ram[waddr][i][2] <= wdata[i][23:16]; - ram[waddr][i][3] <= wdata[i][31:24]; - end - end - end - - assign rs1_data = ram[rs1]; - assign rs2_data = ram[rs2]; - - `else - - wire [`NUM_THREADS-1:0][31:0] write_bit_mask; + reg [`NUM_THREADS-1:0][3:0][7:0] mem [(`NUM_WARPS * `NUM_REGS)-1:0]; + reg [`NUM_THREADS-1:0][31:0] q1, q2; + + always @(posedge clk) begin for (integer i = 0; i < `NUM_THREADS; i++) begin - assign write_bit_mask[i] = {32{~we[i]}}; - end - - wire cenb = 0; - wire cena_1 = 0; - wire cena_2 = 0; - - wire [`NUM_THREADS-1:0][31:0] tmp_a; - wire [`NUM_THREADS-1:0][31:0] tmp_b; - - `ifndef SYNTHESIS - for (integer i = 0; i < `NUM_THREADS; i++) begin - for (integer j = 0; j < 32; j++) begin - assign rs1_data[i][j] = ((tmp_a[i][j] === 1'dx) || cena_1) ? 1'b0 : tmp_a[i][j]; - assign rs2_data[i][j] = ((tmp_b[i][j] === 1'dx) || cena_2) ? 
1'b0 : tmp_b[i][j]; + if (we[i]) begin + mem[waddr][i][0] <= wdata[i][07:00]; + mem[waddr][i][1] <= wdata[i][15:08]; + mem[waddr][i][2] <= wdata[i][23:16]; + mem[waddr][i][3] <= wdata[i][31:24]; end end - `else - assign rs1_data = tmp_a; - assign rs2_data = tmp_b; - `endif - for (integer i = 0; i < 'NT; i=i+4) begin - `IGNORE_WARNINGS_BEGIN - rf2_32x128_wm1 first_ram ( - .CENYA(), - .AYA(), - .CENYB(), - .WENYB(), - .AYB(), - .QA(tmp_a[(i+3):(i)]), - .SOA(), - .SOB(), - .CLKA(clk), - .CENA(cena_1), - .AA(rs1[(i+3):(i)]), - .CLKB(clk), - .CENB(cenb), - .WENB(write_bit_mask[(i+3):(i)]), - .AB(waddr[(i+3):(i)]), - .DB(wdata[(i+3):(i)]), - .EMAA(3'b011), - .EMASA(1'b0), - .EMAB(3'b011), - .TENA(1'b1), - .TCENA(1'b0), - .TAA(5'b0), - .TENB(1'b1), - .TCENB(1'b0), - .TWENB(128'b0), - .TAB(5'b0), - .TDB(128'b0), - .RET1N(1'b1), - .SIA(2'b0), - .SEA(1'b0), - .DFTRAMBYP(1'b0), - .SIB(2'b0), - .SEB(1'b0), - .COLLDISN(1'b1) - ); + q1 <= mem[rs1]; + q2 <= mem[rs2]; + end - rf2_`NUM_GPRSx128_wm1 second_ram ( - .CENYA(), - .AYA(), - .CENYB(), - .WENYB(), - .AYB(), - .QA(tmp_b[(i+3):(i)]), - .SOA(), - .SOB(), - .CLKA(clk), - .CENA(cena_2), - .AA(rs2[(i+3):(i)]), - .CLKB(clk), - .CENB(cenb), - .WENB(write_bit_mask[(i+3):(i)]), - .AB(waddr[(i+3):(i)]), - .DB(wdata[(i+3):(i)]), - .EMAA(3'b011), - .EMASA(1'b0), - .EMAB(3'b011), - .TENA(1'b1), - .TCENA(1'b0), - .TAA(5'b0), - .TENB(1'b1), - .TCENB(1'b0), - .TWENB(128'b0), - .TAB(5'b0), - .TDB(128'b0), - .RET1N(1'b1), - .SIA(2'b0), - .SEA(1'b0), - .DFTRAMBYP(1'b0), - .SIB(2'b0), - .SEB(1'b0), - .COLLDISN(1'b1) - ); - `IGNORE_WARNINGS_END - end + assign rs1_data = q1; + assign rs2_data = q2; - `endif - -endmodule +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 129da4c0..9e72c023 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -15,9 +15,15 @@ module VX_gpr_stage #( ); `UNUSED_VAR (reset) - wire [`NUM_THREADS-1:0][31:0] rs1_data; - wire [`NUM_THREADS-1:0][31:0] 
rs2_data; - wire [`NW_BITS+`NR_BITS-1:0] raddr1; + reg rsp_valid; + reg [`NW_BITS-1:0] rsp_wid; + reg [31:0] rsp_pc; + reg rs1_is_zero, rs2_is_zero; + + wire [`NUM_THREADS-1:0][31:0] rs1_data, rs2_data; + wire [`NW_BITS+`NR_BITS-1:0] raddr1, raddr2; + + assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2}; VX_gpr_ram gpr_ram ( .clk (clk), @@ -25,60 +31,77 @@ module VX_gpr_stage #( .waddr ({writeback_if.wid, writeback_if.rd}), .wdata (writeback_if.data), .rs1 (raddr1), - .rs2 ({gpr_req_if.wid, gpr_req_if.rs2}), + .rs2 (raddr2), .rs1_data (rs1_data), .rs2_data (rs2_data) - ); + ); -`ifdef EXT_F_ENABLE - VX_gpr_fp_ctrl VX_gpr_fp_ctrl ( - .clk (clk), - .reset (reset), - .rs1_data (rs1_data), - .rs2_data (rs2_data), - .raddr1 (raddr1), - .gpr_req_if (gpr_req_if), - .gpr_rsp_if (gpr_rsp_if) - ); -`else - reg [`NUM_THREADS-1:0][31:0] rsp_rs1_data, rsp_rs2_data; - reg rsp_valid; - reg [`NW_BITS-1:0] rsp_wid; - reg [31:0] rsp_pc; - always @(posedge clk) begin if (reset) begin - rsp_valid <= 0; - rsp_wid <= 0; - rsp_pc <= 0; - rsp_rs1_data <= 0; - rsp_rs2_data <= 0; + rsp_valid <= 0; + rsp_wid <= 0; + rsp_pc <= 0; + rs1_is_zero <= 0; + rs2_is_zero <= 0; end else begin - rsp_valid <= gpr_req_if.valid; - rsp_wid <= gpr_req_if.wid; - rsp_pc <= gpr_req_if.PC; - rsp_rs1_data <= rs1_data; - rsp_rs2_data <= rs2_data; + rsp_valid <= gpr_req_if.valid; + rsp_wid <= gpr_req_if.wid; + rsp_pc <= gpr_req_if.PC; + rs1_is_zero <= (0 == gpr_req_if.rs1); + rs2_is_zero <= (0 == gpr_req_if.rs2); end + end + +`ifdef EXT_F_ENABLE + + reg [`NUM_THREADS-1:0][31:0] rs3_data; + reg read_rs3, save_rs3; + + wire rs3_delay = gpr_req_if.valid && gpr_req_if.use_rs3 && !read_rs3; + wire read_fire = gpr_req_if.valid && gpr_rsp_if.ready; + + always @(posedge clk) begin + if (reset) begin + rs3_data <= 0; + read_rs3 <= 0; + end else begin + if (rs3_delay) begin + read_rs3 <= 1; + save_rs3 <= 1; + end else if (read_fire) begin + read_rs3 <= 0; + end + if (save_rs3) begin + rs3_data <= rs1_data; + save_rs3 <= 0; + 
end + assert(!read_rs3 || rsp_wid == gpr_req_if.wid); + end end + assign raddr1 = {gpr_req_if.wid, (rs3_delay ? gpr_req_if.rs3 : gpr_req_if.rs1)}; + assign gpr_req_if.ready = ~rs3_delay; + assign gpr_rsp_if.rs3_data = rs3_data; + +`else + assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1}; - assign gpr_req_if.ready = 1; - - assign gpr_rsp_if.valid = rsp_valid; - assign gpr_rsp_if.wid = rsp_wid; - assign gpr_rsp_if.PC = rsp_pc; - assign gpr_rsp_if.rs1_data = rsp_rs1_data; - assign gpr_rsp_if.rs2_data = rsp_rs2_data; assign gpr_rsp_if.rs3_data = 0; `UNUSED_VAR (gpr_req_if.valid); `UNUSED_VAR (gpr_req_if.rs3); `UNUSED_VAR (gpr_req_if.use_rs3); `UNUSED_VAR (gpr_rsp_if.ready); + `endif + + assign gpr_rsp_if.rs1_data = rs1_is_zero ? (`NUM_THREADS*32)'(0) : rs1_data; + assign gpr_rsp_if.rs2_data = rs2_is_zero ? (`NUM_THREADS*32)'(0) : rs2_data; + assign gpr_rsp_if.valid = rsp_valid; + assign gpr_rsp_if.wid = rsp_wid; + assign gpr_rsp_if.PC = rsp_pc; assign writeback_if.ready = 1'b1; -endmodule +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index ac6550a3..ffad1717 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -3,6 +3,8 @@ module VX_gpu_unit #( parameter CORE_ID = 0 ) ( + `SCOPE_IO_VX_gpu_unit + input wire clk, input wire reset, @@ -88,4 +90,18 @@ module VX_gpu_unit #( // can accept new request? 
assign gpu_req_if.ready = gpu_commit_if.ready; + `SCOPE_ASSIGN (scope_gpu_req_valid, gpu_req_if.valid); + `SCOPE_ASSIGN (scope_gpu_req_wid, gpu_req_if.wid); + `SCOPE_ASSIGN (scope_gpu_req_tmask, gpu_req_if.tmask); + `SCOPE_ASSIGN (scope_gpu_req_op_type, gpu_req_if.op_type); + `SCOPE_ASSIGN (scope_gpu_req_rs1, gpu_req_if.rs1_data[0]); + `SCOPE_ASSIGN (scope_gpu_req_rs2, gpu_req_if.rs2_data); + `SCOPE_ASSIGN (scope_gpu_req_ready, gpu_req_if.ready); + `SCOPE_ASSIGN (scope_gpu_rsp_valid, warp_ctl_if.valid); + `SCOPE_ASSIGN (scope_gpu_rsp_wid, warp_ctl_if.wid); + `SCOPE_ASSIGN (scope_gpu_rsp_tmc, warp_ctl_if.tmc); + `SCOPE_ASSIGN (scope_gpu_rsp_wspawn, warp_ctl_if.wspawn); + `SCOPE_ASSIGN (scope_gpu_rsp_split, warp_ctl_if.split); + `SCOPE_ASSIGN (scope_gpu_rsp_barrier, warp_ctl_if.barrier); + endmodule \ No newline at end of file diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 3f891a45..b3e6fd9d 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -20,15 +20,12 @@ module VX_ibuffer #( localparam ADDRW = $clog2(SIZE); localparam NWARPSW = $clog2(`NUM_WARPS+1); - `USE_FAST_BRAM reg [DATAW-1:0] entries [`NUM_WARPS-1:0][SIZE-1:0]; reg [SIZEW-1:0] size_r [`NUM_WARPS-1:0]; - reg [ADDRW:0] rd_ptr_r [`NUM_WARPS-1:0]; - reg [ADDRW:0] wr_ptr_r [`NUM_WARPS-1:0]; - + wire [`NUM_WARPS-1:0] q_full; wire [`NUM_WARPS-1:0][SIZEW-1:0] q_size; wire [DATAW-1:0] q_data_in; - wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev; + wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev; reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out; wire enq_fire = ibuf_enq_if.valid && ibuf_enq_if.ready; @@ -39,21 +36,33 @@ module VX_ibuffer #( wire writing = enq_fire && (i == ibuf_enq_if.wid); wire reading = deq_fire && (i == ibuf_deq_if.wid); - wire [ADDRW-1:0] rd_ptr_a = rd_ptr_r[i][ADDRW-1:0]; - wire [ADDRW-1:0] wr_ptr_a = wr_ptr_r[i][ADDRW-1:0]; - + wire is_slot0 = (0 == size_r[i]) || ((1 == size_r[i]) && reading); + + wire push = writing && !is_slot0; + wire pop = reading && (size_r[i] != 1); + + 
VX_generic_queue #( + .DATAW(DATAW), + .SIZE(SIZE) + ) queue ( + .clk (clk), + .reset (reset), + .push (push), + .pop (pop), + .data_in (q_data_in), + .data_out (q_data_prev[i]), + `UNUSED_PIN (empty), + `UNUSED_PIN (full), + `UNUSED_PIN (size) + ); + always @(posedge clk) begin if (reset) begin - rd_ptr_r[i] <= 0; - wr_ptr_r[i] <= 0; - size_r[i] <= 0; - end else begin - if (writing) begin - if ((0 == size_r[i]) || ((1 == size_r[i]) && reading)) begin + size_r[i] <= 0; + end else begin + if (writing) begin + if (is_slot0) begin q_data_out[i] <= q_data_in; - end else begin - entries[i][wr_ptr_a] <= q_data_in; - wr_ptr_r[i] <= wr_ptr_r[i] + ADDRW'(1); end if (!reading) begin size_r[i] <= size_r[i] + SIZEW'(1); @@ -62,18 +71,16 @@ module VX_ibuffer #( if (reading) begin if (size_r[i] != 1) begin q_data_out[i] <= q_data_prev[i]; - rd_ptr_r[i] <= rd_ptr_r[i] + ADDRW'(1); end if (!writing) begin size_r[i] <= size_r[i] - SIZEW'(1); end end end - end - - assign q_data_prev[i] = entries[i][rd_ptr_a]; - assign q_full[i] = (size_r[i] == SIZE); - assign q_size[i] = size_r[i]; + end + + assign q_full[i] = (size_r[i] == SIZE); + assign q_size[i] = size_r[i]; end /////////////////////////////////////////////////////////////////////////// @@ -144,9 +151,9 @@ module VX_ibuffer #( schedule_table[deq_wid_n] <= 0; end - deq_valid <= deq_valid_n; - deq_wid <= deq_wid_n; - deq_instr <= deq_instr_n; + deq_valid <= deq_valid_n; + deq_wid <= deq_wid_n; + deq_instr <= deq_instr_n; if (warp_added && !warp_removed) begin num_warps <= num_warps + NWARPSW'(1); diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 1bb61f92..6bb52123 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -3,7 +3,7 @@ module VX_icache_stage #( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_ISTAGE_IO + `SCOPE_IO_VX_icache_stage input wire clk, input wire reset, @@ -20,8 +20,8 @@ module VX_icache_stage #( ); `UNUSED_VAR (reset) - reg [31:0] rsp_PC_buf [`NUM_WARPS-1:0]; - reg 
[`NUM_THREADS-1:0] rsp_tmask_buf [`NUM_WARPS-1:0]; + `NO_RW_RAM_CHECK reg [31:0] rsp_PC_buf [`NUM_WARPS-1:0]; + `NO_RW_RAM_CHECK reg [`NUM_THREADS-1:0] rsp_tmask_buf [`NUM_WARPS-1:0]; wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; @@ -29,8 +29,8 @@ module VX_icache_stage #( wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[0][`NW_BITS-1:0]; always @(posedge clk) begin - if (icache_req_fire) begin - rsp_PC_buf[req_tag] <= ifetch_req_if.PC; + if (icache_req_fire) begin + rsp_PC_buf[req_tag] <= ifetch_req_if.PC; rsp_tmask_buf[req_tag] <= ifetch_req_if.tmask; end end diff --git a/hw/rtl/VX_ipdom_stack.v b/hw/rtl/VX_ipdom_stack.v index f388d3d0..4e7d42f9 100644 --- a/hw/rtl/VX_ipdom_stack.v +++ b/hw/rtl/VX_ipdom_stack.v @@ -1,4 +1,3 @@ - `include "VX_platform.vh" module VX_ipdom_stack #( @@ -17,33 +16,55 @@ module VX_ipdom_stack #( ); localparam STACK_SIZE = 2 ** DEPTH; - `USE_FAST_BRAM reg [WIDTH-1:0] stack_1 [0:STACK_SIZE-1]; - `USE_FAST_BRAM reg [WIDTH-1:0] stack_2 [0:STACK_SIZE-1]; - `USE_FAST_BRAM reg is_part [0:STACK_SIZE-1]; + `NO_RW_RAM_CHECK reg [WIDTH-1:0] stack_1 [0:STACK_SIZE-1]; + `NO_RW_RAM_CHECK reg [WIDTH-1:0] stack_2 [0:STACK_SIZE-1]; + reg is_part [0:STACK_SIZE-1]; reg [DEPTH-1:0] rd_ptr, wr_ptr; + reg [WIDTH - 1:0] d1, d2; + reg p; + always @(posedge clk) begin if (reset) begin + rd_ptr <= 0; wr_ptr <= 0; end else begin if (push) begin - stack_1[wr_ptr] <= q1; - stack_2[wr_ptr] <= q2; - is_part[wr_ptr] <= 0; rd_ptr <= wr_ptr; wr_ptr <= wr_ptr + DEPTH'(1); end else if (pop) begin wr_ptr <= wr_ptr - DEPTH'(is_part[rd_ptr]); rd_ptr <= rd_ptr - DEPTH'(is_part[rd_ptr]); - is_part[rd_ptr] <= 1; end end end - assign d = is_part[rd_ptr] ? 
stack_1[rd_ptr] : stack_2[rd_ptr]; + always @(posedge clk) begin + if (push) begin + stack_1[wr_ptr] <= q1; + end + end + assign d1 = stack_1[rd_ptr]; - assign empty = (0 == wr_ptr); + always @(posedge clk) begin + if (push) begin + stack_2[wr_ptr] <= q2; + end + end + assign d2 = stack_2[rd_ptr]; + + always @(posedge clk) begin + if (push) begin + is_part[wr_ptr] <= 0; + end else if (pop) begin + is_part[rd_ptr] <= 1; + end + end + assign p = is_part[rd_ptr]; + + assign d = p ? d1 : d2; + assign empty = ~(| wr_ptr); assign full = ((STACK_SIZE-1) == wr_ptr); endmodule \ No newline at end of file diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 1c1e4f8a..46e0388f 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -3,7 +3,7 @@ module VX_issue #( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_ISSUE_IO + `SCOPE_IO_VX_issue input wire clk, input wire reset, diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 797b30cd..52646138 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -3,7 +3,7 @@ module VX_lsu_unit #( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_LSU_IO + `SCOPE_IO_VX_lsu_unit input wire clk, input wire reset, diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 5d5c7928..d7a4ffd7 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -3,7 +3,7 @@ module VX_mem_unit # ( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_CACHE_IO + `SCOPE_IO_VX_mem_unit input wire clk, input wire reset, @@ -77,7 +77,7 @@ module VX_mem_unit # ( .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), .DRAM_TAG_WIDTH (`SDRAM_TAG_WIDTH) ) smem ( - `SCOPE_SIGNALS_CACHE_UNBIND + `SCOPE_BIND_VX_mem_unit_smem .clk (clk), .reset (reset), @@ -104,7 +104,7 @@ module VX_mem_unit # ( `UNUSED_PIN (dram_req_addr), `UNUSED_PIN (dram_req_data), `UNUSED_PIN (dram_req_tag), - .dram_req_ready (0), + .dram_req_ready (1'b0), // DRAM response .dram_rsp_valid (0), @@ -113,7 +113,7 @@ module VX_mem_unit # ( `UNUSED_PIN (dram_rsp_ready), // Snoop request - .snp_req_valid (0), 
+ .snp_req_valid (1'b0), .snp_req_addr (0), .snp_req_invalidate (0), .snp_req_tag (0), @@ -122,17 +122,17 @@ module VX_mem_unit # ( // Snoop response `UNUSED_PIN (snp_rsp_valid), `UNUSED_PIN (snp_rsp_tag), - .snp_rsp_ready (0), + .snp_rsp_ready (1'b0), // Snoop forward out `UNUSED_PIN (snp_fwdout_valid), `UNUSED_PIN (snp_fwdout_addr), `UNUSED_PIN (snp_fwdout_invalidate), `UNUSED_PIN (snp_fwdout_tag), - .snp_fwdout_ready (0), + .snp_fwdout_ready (1'b0), // Snoop forward in - .snp_fwdin_valid (0), + .snp_fwdin_valid (1'b0), .snp_fwdin_tag (0), `UNUSED_PIN (snp_fwdin_ready) ); @@ -159,7 +159,7 @@ module VX_mem_unit # ( .DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH), .SNP_REQ_TAG_WIDTH (`DSNP_TAG_WIDTH) ) dcache ( - `SCOPE_SIGNALS_CACHE_BIND + `SCOPE_BIND_VX_mem_unit_dcache .clk (clk), .reset (reset), @@ -211,10 +211,10 @@ module VX_mem_unit # ( `UNUSED_PIN (snp_fwdout_addr), `UNUSED_PIN (snp_fwdout_invalidate), `UNUSED_PIN (snp_fwdout_tag), - .snp_fwdout_ready (0), + .snp_fwdout_ready (1'b0), // Snoop forward in - .snp_fwdin_valid (0), + .snp_fwdin_valid (1'b0), .snp_fwdin_tag (0), `UNUSED_PIN (snp_fwdin_ready) ); @@ -240,7 +240,7 @@ module VX_mem_unit # ( .CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS), .DRAM_TAG_WIDTH (`IDRAM_TAG_WIDTH) ) icache ( - `SCOPE_SIGNALS_CACHE_UNBIND + `SCOPE_BIND_VX_mem_unit_icache .clk (clk), .reset (reset), @@ -276,26 +276,26 @@ module VX_mem_unit # ( .dram_rsp_ready (icache_dram_rsp_if.ready), // Snoop request - .snp_req_valid (0), + .snp_req_valid (1'b0), .snp_req_addr (0), - .snp_req_invalidate (0), + .snp_req_invalidate (1'b0), .snp_req_tag (0), `UNUSED_PIN (snp_req_ready), // Snoop response `UNUSED_PIN (snp_rsp_valid), `UNUSED_PIN (snp_rsp_tag), - .snp_rsp_ready (0), + .snp_rsp_ready (1'b0), // Snoop forward out `UNUSED_PIN (snp_fwdout_valid), `UNUSED_PIN (snp_fwdout_addr), `UNUSED_PIN (snp_fwdout_invalidate), `UNUSED_PIN (snp_fwdout_tag), - .snp_fwdout_ready (0), + .snp_fwdout_ready (1'b0), // Snoop forward in - .snp_fwdin_valid (0), + 
.snp_fwdin_valid (1'b0), .snp_fwdin_tag (0), `UNUSED_PIN (snp_fwdin_ready) ); diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index c2629eec..b6a5444c 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -3,10 +3,7 @@ module VX_pipeline #( parameter CORE_ID = 0 ) ( - `SCOPE_SIGNALS_ISTAGE_IO - `SCOPE_SIGNALS_LSU_IO - `SCOPE_SIGNALS_ISSUE_IO - `SCOPE_SIGNALS_EXECUTE_IO + `SCOPE_IO_VX_pipeline // Clock input wire clk, @@ -126,7 +123,7 @@ module VX_pipeline #( VX_fetch #( .CORE_ID(CORE_ID) ) fetch ( - `SCOPE_SIGNALS_ISTAGE_BIND + `SCOPE_BIND_VX_pipeline_fetch .clk (clk), .reset (reset), .icache_req_if (core_icache_req_if), @@ -153,7 +150,7 @@ module VX_pipeline #( VX_issue #( .CORE_ID(CORE_ID) ) issue ( - `SCOPE_SIGNALS_ISSUE_BIND + `SCOPE_BIND_VX_pipeline_issue .clk (clk), .reset (reset), @@ -173,8 +170,8 @@ module VX_pipeline #( VX_execute #( .CORE_ID(CORE_ID) ) execute ( - `SCOPE_SIGNALS_LSU_BIND - `SCOPE_SIGNALS_EXECUTE_BIND + `SCOPE_BIND_VX_pipeline_execute + .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 3f158408..783743ee 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -22,14 +22,16 @@ /* verilator lint_off WIDTH */ \ /* verilator lint_off UNOPTFLAT */ \ /* verilator lint_off UNDRIVEN */ \ - /* verilator lint_off DECLFILENAME */ + /* verilator lint_off DECLFILENAME */ \ + /* verilator lint_off IMPLICIT */ `define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \ /* verilator lint_on PINCONNECTEMPTY */ \ /* verilator lint_on WIDTH */ \ /* verilator lint_on UNOPTFLAT */ \ /* verilator lint_on UNDRIVEN */ \ - /* verilator lint_on DECLFILENAME */ + /* verilator lint_on DECLFILENAME */ \ + /* verilator lint_on IMPLICIT */ `define UNUSED_VAR(x) always @(x) begin end @@ -39,9 +41,9 @@ `define STRINGIFY(x) `"x`" -`define STATIC_ASSERT(cond, msg) \ - generate \ - if (!(cond)) $error(msg); \ +`define STATIC_ASSERT(cond, msg) \ + generate \ + if (!(cond)) $error msg; \ endgenerate 
`define ENABLE_TRACING /* verilator tracing_on */ @@ -49,8 +51,8 @@ /////////////////////////////////////////////////////////////////////////////// -`define USE_FAST_BRAM (* syn_ramstyle = "mlab" *) -`define RELAX_BRAM_RW (* syn_ramstyle = "no_rw_check" *) +`define USE_FAST_BRAM (* ramstyle="mlab" *) +`define NO_RW_RAM_CHECK (* ramstyle="mlab, no_rw_check" *) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_scope.vh b/hw/rtl/VX_scope.vh index fa595c56..c4552d21 100644 --- a/hw/rtl/VX_scope.vh +++ b/hw/rtl/VX_scope.vh @@ -3,398 +3,85 @@ `ifdef SCOPE -`define SCOPE_SIGNALS_DATA_LIST \ - scope_dram_req_addr, \ - scope_dram_req_rw, \ - scope_dram_req_byteen, \ - scope_dram_req_data, \ - scope_dram_req_tag, \ - scope_dram_rsp_data, \ - scope_dram_rsp_tag, \ - scope_snp_req_addr, \ - scope_snp_req_invalidate, \ - scope_snp_req_tag, \ - scope_snp_rsp_tag, \ - scope_icache_req_wid, \ - scope_icache_req_addr, \ - scope_icache_req_tag, \ - scope_icache_rsp_data, \ - scope_icache_rsp_tag, \ - scope_dcache_req_wid, \ - scope_dcache_req_pc, \ - scope_dcache_req_addr, \ - scope_dcache_req_rw, \ - scope_dcache_req_byteen, \ - scope_dcache_req_data, \ - scope_dcache_req_tag, \ - scope_dcache_rsp_data, \ - scope_dcache_rsp_tag, \ - scope_issue_wid, \ - scope_issue_tmask, \ - scope_issue_pc, \ - scope_issue_ex_type, \ - scope_issue_op_type, \ - scope_issue_op_mod, \ - scope_issue_wb, \ - scope_issue_rd, \ - scope_issue_rs1, \ - scope_issue_rs2, \ - scope_issue_rs3, \ - scope_issue_imm, \ - scope_issue_rs1_is_pc, \ - scope_issue_rs2_is_imm, \ - scope_gpr_rsp_wid, \ - scope_gpr_rsp_pc, \ - scope_gpr_rsp_a, \ - scope_gpr_rsp_b, \ - scope_gpr_rsp_c, \ - scope_writeback_wid, \ - scope_writeback_pc, \ - scope_writeback_rd, \ - scope_writeback_data, \ - scope_bank_addr_st0, \ - scope_bank_addr_st1, \ - scope_bank_addr_st2, \ - scope_bank_is_mrvq_st1, \ - scope_bank_miss_st1, \ - scope_bank_dirty_st1, \ - scope_bank_force_miss_st1, - - 
`define SCOPE_SIGNALS_UPD_LIST \ - scope_dram_req_valid, \ - scope_dram_req_ready, \ - scope_dram_rsp_valid, \ - scope_dram_rsp_ready, \ - scope_snp_req_valid, \ - scope_snp_req_ready, \ - scope_snp_rsp_valid, \ - scope_snp_rsp_ready, \ - scope_icache_req_valid, \ - scope_icache_req_ready, \ - scope_icache_rsp_valid, \ - scope_icache_rsp_ready, \ - scope_dcache_req_valid, \ - scope_dcache_req_ready, \ - scope_dcache_rsp_valid, \ - scope_dcache_rsp_ready, \ - scope_bank_valid_st0, \ - scope_bank_valid_st1, \ - scope_bank_valid_st2, \ - scope_bank_stall_pipe, \ - scope_issue_valid, \ - scope_issue_ready, \ - scope_gpr_rsp_valid, \ - scope_writeback_valid, \ - scope_scoreboard_delay, \ - scope_gpr_delay, \ - scope_execute_delay, \ - scope_busy +`include "scope-defs.vh" - `define SCOPE_SIGNALS_DECL \ - wire scope_dram_req_valid; \ - wire [31:0] scope_dram_req_addr; \ - wire scope_dram_req_rw; \ - wire [15:0] scope_dram_req_byteen; \ - wire [127:0] scope_dram_req_data; \ - wire [`VX_DRAM_TAG_WIDTH-1:0] scope_dram_req_tag; \ - wire scope_dram_req_ready; \ - wire scope_dram_rsp_valid; \ - wire [127:0] scope_dram_rsp_data; \ - wire [`VX_DRAM_TAG_WIDTH-1:0] scope_dram_rsp_tag; \ - wire scope_dram_rsp_ready; \ - wire scope_snp_req_valid; \ - wire [31:0] scope_snp_req_addr; \ - wire scope_snp_req_invalidate; \ - wire [`VX_SNP_TAG_WIDTH-1:0] scope_snp_req_tag; \ - wire scope_snp_req_ready; \ - wire scope_snp_rsp_valid; \ - wire [`VX_SNP_TAG_WIDTH-1:0] scope_snp_rsp_tag; \ - wire scope_icache_req_valid; \ - wire [`NW_BITS-1:0] scope_icache_req_wid; \ - wire [31:0] scope_icache_req_addr; \ - wire [`ICORE_TAG_ID_BITS-1:0] scope_icache_req_tag; \ - wire scope_icache_req_ready; \ - wire scope_icache_rsp_valid; \ - wire [31:0] scope_icache_rsp_data; \ - wire [`ICORE_TAG_ID_BITS-1:0] scope_icache_rsp_tag; \ - wire scope_icache_rsp_ready; \ - wire [`NUM_THREADS-1:0] scope_dcache_req_valid; \ - wire [`NW_BITS-1:0] scope_dcache_req_wid; \ - wire [31:0] scope_dcache_req_pc; \ - wire 
[`NUM_THREADS-1:0][31:0] scope_dcache_req_addr; \ - wire scope_dcache_req_rw; \ - wire [`NUM_THREADS-1:0][3:0] scope_dcache_req_byteen; \ - wire [`NUM_THREADS-1:0][31:0] scope_dcache_req_data; \ - wire [`DCORE_TAG_ID_BITS-1:0] scope_dcache_req_tag; \ - wire scope_dcache_req_ready; \ - wire [`NUM_THREADS-1:0] scope_dcache_rsp_valid; \ - wire [`NUM_THREADS-1:0][31:0] scope_dcache_rsp_data; \ - wire [`DCORE_TAG_ID_BITS-1:0] scope_dcache_rsp_tag; \ - wire scope_dcache_rsp_ready; \ - wire scope_snp_rsp_ready; \ - wire [`NW_BITS-1:0] scope_issue_wid; \ - wire [`NUM_THREADS-1:0] scope_issue_tmask; \ - wire [31:0] scope_issue_pc; \ - wire [`EX_BITS-1:0] scope_issue_ex_type; \ - wire [`OP_BITS-1:0] scope_issue_op_type; \ - wire [`MOD_BITS-1:0] scope_issue_op_mod; \ - wire scope_issue_wb; \ - wire [`NR_BITS-1:0] scope_issue_rd; \ - wire [`NR_BITS-1:0] scope_issue_rs1; \ - wire [`NR_BITS-1:0] scope_issue_rs2; \ - wire [`NR_BITS-1:0] scope_issue_rs3; \ - wire [31:0] scope_issue_imm; \ - wire scope_issue_rs1_is_pc; \ - wire scope_issue_rs2_is_imm; \ - wire scope_gpr_rsp_valid; \ - wire [`NW_BITS-1:0] scope_gpr_rsp_wid; \ - wire [31:0] scope_gpr_rsp_pc; \ - wire [`NUM_THREADS-1:0][31:0] scope_gpr_rsp_a; \ - wire [`NUM_THREADS-1:0][31:0] scope_gpr_rsp_b; \ - wire [`NUM_THREADS-1:0][31:0] scope_gpr_rsp_c; \ - wire scope_writeback_valid; \ - wire [`NW_BITS-1:0] scope_writeback_wid; \ - wire [31:0] scope_writeback_pc; \ - wire [`NR_BITS-1:0] scope_writeback_rd; \ - wire [`NUM_THREADS-1:0][31:0] scope_writeback_data; \ - wire scope_bank_valid_st0; \ - wire scope_bank_valid_st1; \ - wire scope_bank_valid_st2; \ - wire [31:0] scope_bank_addr_st0; \ - wire [31:0] scope_bank_addr_st1; \ - wire [31:0] scope_bank_addr_st2; \ - wire scope_bank_is_mrvq_st1; \ - wire scope_bank_miss_st1; \ - wire scope_bank_dirty_st1; \ - wire scope_bank_force_miss_st1; \ - wire scope_bank_stall_pipe; \ - wire scope_issue_valid; \ - wire scope_issue_ready; \ - wire scope_scoreboard_delay; \ - wire 
scope_gpr_delay; \ - wire scope_execute_delay; \ - wire scope_busy; +`define SCOPE_ASSIGN(d,s) assign d = s - `define SCOPE_SIGNALS_ISTAGE_IO \ - output wire scope_icache_req_valid, \ - output wire [`NW_BITS-1:0] scope_icache_req_wid, \ - output wire [31:0] scope_icache_req_addr, \ - output wire [`ICORE_TAG_ID_BITS-1:0] scope_icache_req_tag, \ - output wire scope_icache_req_ready, \ - output wire scope_icache_rsp_valid, \ - output wire [31:0] scope_icache_rsp_data, \ - output wire [`ICORE_TAG_ID_BITS-1:0] scope_icache_rsp_tag, \ - output wire scope_icache_rsp_ready, - - `define SCOPE_SIGNALS_LSU_IO \ - output wire [`NUM_THREADS-1:0] scope_dcache_req_valid, \ - output wire [`NW_BITS-1:0] scope_dcache_req_wid, \ - output wire [31:0] scope_dcache_req_pc, \ - output wire [`NUM_THREADS-1:0][31:0] scope_dcache_req_addr, \ - output wire scope_dcache_req_rw, \ - output wire [`NUM_THREADS-1:0][3:0] scope_dcache_req_byteen, \ - output wire [`NUM_THREADS-1:0][31:0] scope_dcache_req_data, \ - output wire [`DCORE_TAG_ID_BITS-1:0] scope_dcache_req_tag, \ - output wire scope_dcache_req_ready, \ - output wire [`NUM_THREADS-1:0] scope_dcache_rsp_valid, \ - output wire [`NUM_THREADS-1:0][31:0] scope_dcache_rsp_data, \ - output wire [`DCORE_TAG_ID_BITS-1:0] scope_dcache_rsp_tag, \ - output wire scope_dcache_rsp_ready, - - `define SCOPE_SIGNALS_CACHE_IO \ - output wire scope_bank_valid_st0, \ - output wire scope_bank_valid_st1, \ - output wire scope_bank_valid_st2, \ - output wire [31:0] scope_bank_addr_st0, \ - output wire [31:0] scope_bank_addr_st1, \ - output wire [31:0] scope_bank_addr_st2, \ - output wire scope_bank_is_mrvq_st1, \ - output wire scope_bank_miss_st1, \ - output wire scope_bank_dirty_st1, \ - output wire scope_bank_force_miss_st1, \ - output wire scope_bank_stall_pipe, - - `define SCOPE_SIGNALS_ISSUE_IO \ - output wire scope_issue_valid, \ - output wire [`NW_BITS-1:0] scope_issue_wid, \ - output wire [`NUM_THREADS-1:0] scope_issue_tmask, \ - output wire [31:0] 
scope_issue_pc, \ - output wire [`EX_BITS-1:0] scope_issue_ex_type, \ - output wire [`OP_BITS-1:0] scope_issue_op_type, \ - output wire [`MOD_BITS-1:0] scope_issue_op_mod, \ - output wire scope_issue_wb, \ - output wire [`NR_BITS-1:0] scope_issue_rd, \ - output wire [`NR_BITS-1:0] scope_issue_rs1, \ - output wire [`NR_BITS-1:0] scope_issue_rs2, \ - output wire [`NR_BITS-1:0] scope_issue_rs3, \ - output wire [31:0] scope_issue_imm, \ - output wire scope_issue_rs1_is_pc, \ - output wire scope_issue_rs2_is_imm, \ - output wire scope_writeback_valid, \ - output wire scope_gpr_rsp_valid, \ - output wire [`NW_BITS-1:0] scope_gpr_rsp_wid, \ - output wire [31:0] scope_gpr_rsp_pc, \ - output wire [`NUM_THREADS-1:0][31:0] scope_gpr_rsp_a, \ - output wire [`NUM_THREADS-1:0][31:0] scope_gpr_rsp_b, \ - output wire [`NUM_THREADS-1:0][31:0] scope_gpr_rsp_c, \ - output wire [`NW_BITS-1:0] scope_writeback_wid, \ - output wire [31:0] scope_writeback_pc, \ - output wire [`NR_BITS-1:0] scope_writeback_rd, \ - output wire [`NUM_THREADS-1:0][31:0] scope_writeback_data, \ - output wire scope_issue_ready, \ - output wire scope_scoreboard_delay, \ - output wire scope_gpr_delay, \ - output wire scope_execute_delay, - - `define SCOPE_SIGNALS_EXECUTE_IO - - `define SCOPE_SIGNALS_ISTAGE_BIND \ - .scope_icache_req_valid (scope_icache_req_valid), \ - .scope_icache_req_wid (scope_icache_req_wid), \ - .scope_icache_req_addr (scope_icache_req_addr), \ - .scope_icache_req_tag (scope_icache_req_tag), \ - .scope_icache_req_ready (scope_icache_req_ready), \ - .scope_icache_rsp_valid (scope_icache_rsp_valid), \ - .scope_icache_rsp_data (scope_icache_rsp_data), \ - .scope_icache_rsp_tag (scope_icache_rsp_tag), \ - .scope_icache_rsp_ready (scope_icache_rsp_ready), - - `define SCOPE_SIGNALS_LSU_BIND \ - .scope_dcache_req_valid (scope_dcache_req_valid), \ - .scope_dcache_req_wid (scope_dcache_req_wid), \ - .scope_dcache_req_pc (scope_dcache_req_pc), \ - .scope_dcache_req_addr (scope_dcache_req_addr), \ - 
.scope_dcache_req_rw (scope_dcache_req_rw), \ - .scope_dcache_req_byteen(scope_dcache_req_byteen), \ - .scope_dcache_req_data (scope_dcache_req_data), \ - .scope_dcache_req_tag (scope_dcache_req_tag), \ - .scope_dcache_req_ready (scope_dcache_req_ready), \ - .scope_dcache_rsp_valid (scope_dcache_rsp_valid), \ - .scope_dcache_rsp_data (scope_dcache_rsp_data), \ - .scope_dcache_rsp_tag (scope_dcache_rsp_tag), \ - .scope_dcache_rsp_ready (scope_dcache_rsp_ready), - - `define SCOPE_SIGNALS_CACHE_BIND \ - .scope_bank_valid_st0 (scope_bank_valid_st0), \ - .scope_bank_valid_st1 (scope_bank_valid_st1), \ - .scope_bank_valid_st2 (scope_bank_valid_st2), \ - .scope_bank_addr_st0 (scope_bank_addr_st0), \ - .scope_bank_addr_st1 (scope_bank_addr_st1), \ - .scope_bank_addr_st2 (scope_bank_addr_st2), \ - .scope_bank_is_mrvq_st1 (scope_bank_is_mrvq_st1), \ - .scope_bank_miss_st1 (scope_bank_miss_st1), \ - .scope_bank_dirty_st1 (scope_bank_dirty_st1), \ - .scope_bank_force_miss_st1(scope_bank_force_miss_st1), \ - .scope_bank_stall_pipe (scope_bank_stall_pipe), - - `define SCOPE_SIGNALS_CACHE_UNBIND \ - /* verilator lint_off PINCONNECTEMPTY */ \ - .scope_bank_valid_st0 (), \ - .scope_bank_valid_st1 (), \ - .scope_bank_valid_st2 (), \ - .scope_bank_addr_st0 (), \ - .scope_bank_addr_st1 (), \ - .scope_bank_addr_st2 (), \ - .scope_bank_is_mrvq_st1 (), \ - .scope_bank_miss_st1 (), \ - .scope_bank_dirty_st1 (), \ - .scope_bank_force_miss_st1 (), \ - .scope_bank_stall_pipe (), \ - /* verilator lint_on PINCONNECTEMPTY */ - - `define SCOPE_SIGNALS_CACHE_BANK_SELECT \ - /* verilator lint_off UNUSED */ \ - wire [NUM_BANKS-1:0] scope_per_bank_valid_st0; \ - wire [NUM_BANKS-1:0] scope_per_bank_valid_st1; \ - wire [NUM_BANKS-1:0] scope_per_bank_valid_st2; \ - wire [NUM_BANKS-1:0][31:0] scope_per_bank_addr_st0; \ - wire [NUM_BANKS-1:0][31:0] scope_per_bank_addr_st1; \ - wire [NUM_BANKS-1:0][31:0] scope_per_bank_addr_st2; \ - wire [NUM_BANKS-1:0] scope_per_bank_is_mrvq_st1; \ - wire [NUM_BANKS-1:0] 
scope_per_bank_miss_st1; \ - wire [NUM_BANKS-1:0] scope_per_bank_dirty_st1; \ - wire [NUM_BANKS-1:0] scope_per_bank_force_miss_st1; \ - wire [NUM_BANKS-1:0] scope_per_bank_stall_pipe; \ - /* verilator lint_on UNUSED */ \ - assign scope_bank_valid_st0 = scope_per_bank_valid_st0[0]; \ - assign scope_bank_valid_st1 = scope_per_bank_valid_st1[0]; \ - assign scope_bank_valid_st2 = scope_per_bank_valid_st2[0]; \ - assign scope_bank_addr_st0 = scope_per_bank_addr_st0[0]; \ - assign scope_bank_addr_st1 = scope_per_bank_addr_st1[0]; \ - assign scope_bank_addr_st2 = scope_per_bank_addr_st2[0]; \ - assign scope_bank_is_mrvq_st1 = scope_per_bank_is_mrvq_st1[0]; \ - assign scope_bank_miss_st1 = scope_per_bank_miss_st1[0]; \ - assign scope_bank_dirty_st1 = scope_per_bank_dirty_st1[0]; \ - assign scope_bank_force_miss_st1 = scope_per_bank_force_miss_st1[0]; \ - assign scope_bank_stall_pipe = scope_per_bank_stall_pipe[0]; - - `define SCOPE_SIGNALS_CACHE_BANK_BIND \ - .scope_bank_valid_st0 (scope_per_bank_valid_st0[i]), \ - .scope_bank_valid_st1 (scope_per_bank_valid_st1[i]), \ - .scope_bank_valid_st2 (scope_per_bank_valid_st2[i]), \ - .scope_bank_addr_st0 (scope_per_bank_addr_st0[i]), \ - .scope_bank_addr_st1 (scope_per_bank_addr_st1[i]), \ - .scope_bank_addr_st2 (scope_per_bank_addr_st2[i]), \ - .scope_bank_is_mrvq_st1 (scope_per_bank_is_mrvq_st1[i]), \ - .scope_bank_miss_st1 (scope_per_bank_miss_st1[i]), \ - .scope_bank_dirty_st1 (scope_per_bank_dirty_st1[i]), \ - .scope_bank_force_miss_st1 (scope_per_bank_force_miss_st1[i]), \ - .scope_bank_stall_pipe (scope_per_bank_stall_pipe[i]), - - `define SCOPE_SIGNALS_ISSUE_BIND \ - .scope_issue_valid (scope_issue_valid), \ - .scope_issue_wid (scope_issue_wid), \ - .scope_issue_tmask (scope_issue_tmask), \ - .scope_issue_pc (scope_issue_pc), \ - .scope_issue_ex_type (scope_issue_ex_type), \ - .scope_issue_op_type (scope_issue_op_type), \ - .scope_issue_op_mod (scope_issue_op_mod), \ - .scope_issue_wb (scope_issue_wb), \ - .scope_issue_rd 
(scope_issue_rd), \ - .scope_issue_rs1 (scope_issue_rs1), \ - .scope_issue_rs2 (scope_issue_rs2), \ - .scope_issue_rs3 (scope_issue_rs3), \ - .scope_issue_imm (scope_issue_imm), \ - .scope_issue_rs1_is_pc (scope_issue_rs1_is_pc), \ - .scope_issue_rs2_is_imm (scope_issue_rs2_is_imm), \ - .scope_writeback_valid (scope_writeback_valid), \ - .scope_writeback_wid (scope_writeback_wid), \ - .scope_writeback_pc (scope_writeback_pc), \ - .scope_writeback_rd (scope_writeback_rd), \ - .scope_writeback_data (scope_writeback_data), \ - .scope_issue_ready (scope_issue_ready), \ - .scope_gpr_rsp_valid (scope_gpr_rsp_valid), \ - .scope_gpr_rsp_wid (scope_gpr_rsp_wid), \ - .scope_gpr_rsp_pc (scope_gpr_rsp_pc), \ - .scope_gpr_rsp_a (scope_gpr_rsp_a), \ - .scope_gpr_rsp_b (scope_gpr_rsp_b), \ - .scope_gpr_rsp_c (scope_gpr_rsp_c), \ - .scope_scoreboard_delay (scope_scoreboard_delay), \ - .scope_gpr_delay (scope_gpr_delay), \ - .scope_execute_delay (scope_execute_delay), \ - - `define SCOPE_SIGNALS_EXECUTE_BIND - - `define SCOPE_ASSIGN(d,s) assign d = s `else - `define SCOPE_SIGNALS_ISTAGE_IO - `define SCOPE_SIGNALS_LSU_IO - `define SCOPE_SIGNALS_CACHE_IO - `define SCOPE_SIGNALS_ISSUE_IO - `define SCOPE_SIGNALS_EXECUTE_IO - `define SCOPE_SIGNALS_ISTAGE_BIND - `define SCOPE_SIGNALS_LSU_BIND - `define SCOPE_SIGNALS_CACHE_BIND - `define SCOPE_SIGNALS_ISSUE_BIND - `define SCOPE_SIGNALS_EXECUTE_BIND - - `define SCOPE_SIGNALS_CACHE_UNBIND - `define SCOPE_SIGNALS_CACHE_BANK_SELECT - `define SCOPE_SIGNALS_CACHE_BANK_BIND - - `define SCOPE_ASSIGN(d,s) +`define SCOPE_IO_VX_icache_stage + +`define SCOPE_IO_VX_fetch + +`define SCOPE_BIND_VX_fetch_icache_stage + +`define SCOPE_BIND_VX_fetch_warp_sched + +`define SCOPE_IO_VX_warp_sched + +`define SCOPE_IO_VX_pipeline + +`define SCOPE_BIND_VX_pipeline_fetch + +`define SCOPE_IO_VX_core + +`define SCOPE_BIND_VX_core_pipeline + +`define SCOPE_IO_VX_cluster + +`define SCOPE_BIND_VX_cluster_core(__i__) + +`define SCOPE_IO_Vortex + +`define 
SCOPE_BIND_Vortex_cluster(__i__) + +`define SCOPE_BIND_top_vortex + +`define SCOPE_IO_VX_lsu_unit + +`define SCOPE_IO_VX_gpu_unit + +`define SCOPE_IO_VX_execute + +`define SCOPE_BIND_VX_execute_lsu_unit + +`define SCOPE_BIND_VX_execute_gpu_unit + +`define SCOPE_BIND_VX_pipeline_execute + +`define SCOPE_IO_VX_issue + +`define SCOPE_BIND_VX_pipeline_issue + +`define SCOPE_IO_VX_bank + +`define SCOPE_IO_VX_cache + +`define SCOPE_BIND_VX_cache_bank(__i__) + +`define SCOPE_BIND_Vortex_l3cache + +`define SCOPE_BIND_VX_cluster_l2cache + +`define SCOPE_IO_VX_mem_unit + +`define SCOPE_BIND_VX_mem_unit_dcache + +`define SCOPE_BIND_VX_core_mem_unit + +`define SCOPE_BIND_VX_mem_unit_icache + +`define SCOPE_BIND_VX_mem_unit_smem + +`define SCOPE_DECL_SIGNALS + +`define SCOPE_DATA_LIST + +`define SCOPE_UPDATE_LIST + +`define SCOPE_TRIGGER + +`define SCOPE_ASSIGN(d,s) + `endif - -// VX_SCOPE `endif \ No newline at end of file diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index dd7d22b5..26d051ae 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -28,12 +28,16 @@ typedef struct packed { logic [`NUM_THREADS-1:0] tmask; } gpu_tmc_t; +`define GPU_TMC_SIZE (1+`NUM_THREADS) + typedef struct packed { logic valid; logic [`NUM_WARPS-1:0] wmask; logic [31:0] pc; } gpu_wspawn_t; +`define GPU_WSPAWN_SIZE (1+`NUM_WARPS+32) + typedef struct packed { logic valid; logic diverged; @@ -42,10 +46,14 @@ typedef struct packed { logic [31:0] pc; } gpu_split_t; +`define GPU_SPLIT_SIZE (1+1+`NUM_THREADS+`NUM_THREADS+32) + typedef struct packed { logic valid; logic [`NB_BITS-1:0] id; logic [`NW_BITS-1:0] size_m1; } gpu_barrier_t; +`define GPU_BARRIER_SIZE (1+`NB_BITS+`NW_BITS) + `endif \ No newline at end of file diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 2f60776c..2938c60b 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -3,6 +3,8 @@ module VX_warp_sched #( parameter CORE_ID = 0 ) ( + `SCOPE_IO_VX_warp_sched + input wire clk, input wire reset,
@@ -248,4 +250,11 @@ module VX_warp_sched #( assign busy = (active_warps != 0); + `SCOPE_ASSIGN (scope_wsched_scheduled_warp, scheduled_warp); + `SCOPE_ASSIGN (scope_wsched_active_warps, active_warps); + `SCOPE_ASSIGN (scope_wsched_schedule_table, schedule_table); + `SCOPE_ASSIGN (scope_wsched_schedule_ready, schedule_ready); + `SCOPE_ASSIGN (scope_wsched_warp_to_schedule, warp_to_schedule); + `SCOPE_ASSIGN (scope_wsched_warp_pc, warp_pc); + endmodule \ No newline at end of file diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 58e01f55..772ac3c0 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -25,6 +25,7 @@ module VX_writeback #( wire wb_valid; wire [`NW_BITS-1:0] wb_wid; + wire [31:0] wb_PC; wire [`NUM_THREADS-1:0] wb_tmask; wire [`NR_BITS-1:0] wb_rd; wire [`NUM_THREADS-1:0][31:0] wb_data; @@ -42,6 +43,13 @@ module VX_writeback #( mul_valid ? mul_commit_if.wid : fpu_valid ? fpu_commit_if.wid : 0; + + assign wb_PC = alu_valid ? alu_commit_if.PC : + lsu_valid ? lsu_commit_if.PC : + csr_valid ? csr_commit_if.PC : + mul_valid ? mul_commit_if.PC : + fpu_valid ? fpu_commit_if.PC : + 0; assign wb_tmask = alu_valid ? alu_commit_if.tmask : lsu_valid ? 
lsu_commit_if.tmask : @@ -68,16 +76,16 @@ module VX_writeback #( wire stall = 0/*~writeback_if.ready && writeback_if.valid*/; VX_generic_register #( - .N(1 + `NW_BITS + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32)) + .N(1 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32)) ) wb_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (1'b0), - .in ({wb_valid, wb_wid, wb_tmask, wb_rd, wb_data}), - .out ({writeback_if.valid, writeback_if.wid, writeback_if.tmask, writeback_if.rd, writeback_if.data}) + .in ({wb_valid, wb_wid, wb_PC, wb_tmask, wb_rd, wb_data}), + .out ({writeback_if.valid, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.data}) ); - + assign alu_commit_if.ready = !stall; assign lsu_commit_if.ready = !stall && !alu_valid; assign csr_commit_if.ready = !stall && !alu_valid && !lsu_valid; diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index aa328828..914a9232 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -1,11 +1,7 @@ `include "VX_define.vh" module Vortex ( - `SCOPE_SIGNALS_ISTAGE_IO - `SCOPE_SIGNALS_LSU_IO - `SCOPE_SIGNALS_CACHE_IO - `SCOPE_SIGNALS_ISSUE_IO - `SCOPE_SIGNALS_EXECUTE_IO + `SCOPE_IO_Vortex // Clock input wire clk, @@ -75,11 +71,7 @@ module Vortex ( VX_cluster #( .CLUSTER_ID(0) ) cluster ( - `SCOPE_SIGNALS_ISTAGE_BIND - `SCOPE_SIGNALS_LSU_BIND - `SCOPE_SIGNALS_CACHE_BIND - `SCOPE_SIGNALS_ISSUE_BIND - `SCOPE_SIGNALS_EXECUTE_BIND + `SCOPE_BIND_Vortex_cluster(0) .clk (clk), .reset (reset), @@ -193,11 +185,7 @@ module Vortex ( VX_cluster #( .CLUSTER_ID(i) ) cluster ( - `SCOPE_SIGNALS_ISTAGE_BIND - `SCOPE_SIGNALS_LSU_BIND - `SCOPE_SIGNALS_CACHE_BIND - `SCOPE_SIGNALS_ISSUE_BIND - `SCOPE_SIGNALS_EXECUTE_BIND + `SCOPE_BIND_Vortex_cluster(i) .clk (clk), .reset (reset), @@ -384,7 +372,7 @@ module Vortex ( assign l3_core_rsp_ready = (& per_cluster_dram_rsp_ready); VX_cache #( - .CACHE_ID (0), + .CACHE_ID (`L3CACHE_ID), .CACHE_SIZE (`L3CACHE_SIZE), .BANK_LINE_SIZE (`L3BANK_LINE_SIZE), .NUM_BANKS 
(`L3NUM_BANKS), @@ -407,7 +395,7 @@ module Vortex ( .SNP_REQ_TAG_WIDTH (`L3SNP_TAG_WIDTH), .SNP_FWD_TAG_WIDTH (`L2SNP_TAG_WIDTH) ) l3cache ( - `SCOPE_SIGNALS_CACHE_UNBIND + `SCOPE_BIND_Vortex_l3cache .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index ee6ab088..4ffc0e1c 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -50,7 +50,7 @@ module VX_bank #( // Snooping request tag width parameter SNP_REQ_TAG_WIDTH = 0 ) ( - `SCOPE_SIGNALS_CACHE_IO + `SCOPE_IO_VX_bank input wire clk, input wire reset, @@ -146,7 +146,7 @@ module VX_bank #( ) snp_req_queue ( .clk (clk), .reset (reset), - .push (snp_req_valid), + .push (snp_req_valid && snp_req_ready), .data_in ({snp_req_addr, snp_req_invalidate, snp_req_tag}), .pop (snrq_pop), .data_out({snrq_addr_st0, snrq_invalidate_st0, snrq_tag_st0}), @@ -169,7 +169,7 @@ module VX_bank #( ) dfp_queue ( .clk (clk), .reset (reset), - .push (dram_fill_rsp_valid), + .push (dram_fill_rsp_valid && dram_fill_rsp_ready), .data_in ({dram_fill_rsp_addr, dram_fill_rsp_data}), .pop (dfpq_pop), .data_out({dfpq_addr_st0, dfpq_filldata_st0}), @@ -266,7 +266,9 @@ module VX_bank #( `DEBUG_BEGIN wire going_to_write_st1; `DEBUG_END - + + //determines if it is time to pop a req from the queues + //unqual - the req does NOT qualify for execution in the bank.
wire mrvq_pop_unqual = mrvq_valid_st0; wire dfpq_pop_unqual = !mrvq_pop_unqual && !dfpq_empty; wire reqq_pop_unqual = !mrvq_stop && !mrvq_pop_unqual && !dfpq_pop_unqual && !reqq_empty && reqq_req_st0 && !is_fill_st1 && !is_fill_st1; @@ -276,7 +278,8 @@ module VX_bank #( assign dfpq_pop = dfpq_pop_unqual && !stall_bank_pipe; assign reqq_pop = reqq_pop_unqual && !stall_bank_pipe; assign snrq_pop = snrq_pop_unqual && !stall_bank_pipe; - + + //signals to progress to the next stage wire qual_is_fill_st0; wire qual_valid_st0; wire [`LINE_ADDR_WIDTH-1:0] qual_addr_st0; @@ -289,7 +292,8 @@ module VX_bank #( wire qual_going_to_write_st0; wire qual_is_snp_st0; wire qual_snp_invalidate_st0; - + + //signals to be *used* in the next stage wire valid_st1; wire [`LINE_ADDR_WIDTH-1:0] addr_st1; wire [`UP(`WORD_SELECT_WIDTH)-1:0] wsel_st1; @@ -300,15 +304,19 @@ module VX_bank #( wire snp_invalidate_st1; wire is_mrvq_st1; - assign qual_is_fill_st0 = dfpq_pop_unqual; + //Determine which req will progress to the next stage + assign qual_is_fill_st0 = dfpq_pop_unqual; //dram is filling a request - assign qual_valid_st0 = dfpq_pop || mrvq_pop || reqq_pop || snrq_pop; + assign qual_valid_st0 = dfpq_pop || mrvq_pop || reqq_pop || snrq_pop; //valid if something is being popped - assign qual_addr_st0 = dfpq_pop_unqual ? dfpq_addr_st0 : - mrvq_pop_unqual ? mrvq_addr_st0 : + //Decides which request to deal with. Priority: 1) Miss reserve 2) DRAM fill 3) Core req 4) Snp req + assign qual_addr_st0 = mrvq_pop_unqual ? mrvq_addr_st0 : + dfpq_pop_unqual ? dfpq_addr_st0 : reqq_pop_unqual ? reqq_req_addr_st0[`LINE_SELECT_ADDR_RNG] : snrq_pop_unqual ? snrq_addr_st0 : 0; + + //Word select does ? Does this just pick a specific word from the line instead of the whole line? if (`WORD_SELECT_WIDTH != 0) begin assign qual_wsel_st0 = reqq_pop_unqual ? reqq_req_addr_st0[`WORD_SELECT_WIDTH-1:0] : mrvq_pop_unqual ? 
mrvq_wsel_st0 : @@ -318,30 +326,35 @@ module VX_bank #( assign qual_wsel_st0 = 0; end + //if you are filling from dram then that is the write data? What about core? What is 57? assign qual_writedata_st0 = dfpq_pop_unqual ? dfpq_filldata_st0 : 57; + //note that this is stored even if a DRAM fill is processed assign qual_inst_meta_st0 = mrvq_pop_unqual ? {`REQ_TAG_WIDTH'(mrvq_tag_st0) , mrvq_rw_st0, mrvq_byteen_st0, mrvq_tid_st0} : reqq_pop_unqual ? {`REQ_TAG_WIDTH'(reqq_req_tag_st0), reqq_req_rw_st0, reqq_req_byteen_st0, reqq_req_tid_st0} : snrq_pop_unqual ? {`REQ_TAG_WIDTH'(snrq_tag_st0), 1'b0, WORD_SIZE'(0), `REQS_BITS'(0)} : 0; - + + assign qual_going_to_write_st0 = dfpq_pop_unqual ? 1 : (mrvq_pop_unqual && mrvq_rw_st0) ? 1 : (reqq_pop_unqual && reqq_req_rw_st0) ? 1 : 0; + //snp signals check to see if the miss reserve has a snp in it first. assign qual_is_snp_st0 = mrvq_pop_unqual ? mrvq_is_snp_st0 : snrq_pop_unqual ? 1 : 0; - + //if we are popping from the miss reserve then assign to the mrvq invalidate. If not and popping from the snoop queue use the snoop invalidate. Else this is 0 assign qual_snp_invalidate_st0 = mrvq_pop_unqual ? mrvq_snp_invalidate_st0 : snrq_pop_unqual ? snrq_invalidate_st0 : 0; - + //choose which word of the line is being written to assign qual_writeword_st0 = mrvq_pop_unqual ? mrvq_writeword_st0 : reqq_pop_unqual ?
reqq_req_writeword_st0 : 0; + assign qual_is_mrvq_st0 = mrvq_pop_unqual; `ifdef DBG_CORE_REQ_INFO @@ -356,7 +369,7 @@ module VX_bank #( .clk (clk), .reset (reset), .stall (stall_bank_pipe), - .flush (0), + .flush (1'b0), .in ({qual_is_mrvq_st0, qual_is_snp_st0, qual_snp_invalidate_st0, qual_going_to_write_st0, qual_valid_st0, qual_addr_st0, qual_wsel_st0, qual_writeword_st0, qual_inst_meta_st0, qual_is_fill_st0, qual_writedata_st0}), .out ({is_mrvq_st1 , is_snp_st1, snp_invalidate_st1, going_to_write_st1, valid_st1, addr_st1, wsel_st1, writeword_st1, inst_meta_st1, is_fill_st1, writedata_st1}) ); @@ -453,6 +466,8 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = inst_meta_st1; + end else begin + assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = 0; end `endif @@ -486,7 +501,7 @@ module VX_bank #( .clk (clk), .reset (reset), .stall (stall_bank_pipe), - .flush (0), + .flush (1'b0), .in ({mrvq_recover_ready_state_st1, is_mrvq_st1_st2, mrvq_init_ready_state_st1, snp_to_mrvq_st1, is_snp_st1, snp_invalidate_st1, fill_saw_dirty_st1, is_fill_st1, qual_valid_st1_2, addr_st1, wsel_st1, writeword_st1, readword_st1, readdata_st1, readtag_st1, miss_st1, dirty_st1, dirtyb_st1, inst_meta_st1}), .out ({mrvq_recover_ready_state_st2 , is_mrvq_st2 , mrvq_init_ready_state_unqual_st2, snp_to_mrvq_st2 , is_snp_st2 , snp_invalidate_st2, fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2, wsel_st2, writeword_st2, readword_st2, readdata_st2, readtag_st2, miss_st2, dirty_st2, dirtyb_st2, inst_meta_st2}) ); @@ -728,18 +743,18 @@ module VX_bank #( end `endif -`SCOPE_ASSIGN (scope_bank_valid_st0, qual_valid_st0); -`SCOPE_ASSIGN (scope_bank_valid_st1, valid_st1); -`SCOPE_ASSIGN (scope_bank_valid_st2, valid_st2); +`SCOPE_ASSIGN (scope_valid_st0, qual_valid_st0); 
+`SCOPE_ASSIGN (scope_valid_st1, valid_st1); +`SCOPE_ASSIGN (scope_valid_st2, valid_st2); -`SCOPE_ASSIGN (scope_bank_is_mrvq_st1, is_mrvq_st1); -`SCOPE_ASSIGN (scope_bank_miss_st1, miss_st1); -`SCOPE_ASSIGN (scope_bank_dirty_st1, dirty_st1); -`SCOPE_ASSIGN (scope_bank_force_miss_st1, force_request_miss_st1); -`SCOPE_ASSIGN (scope_bank_stall_pipe, stall_bank_pipe); +`SCOPE_ASSIGN (scope_is_mrvq_st1, is_mrvq_st1); +`SCOPE_ASSIGN (scope_miss_st1, miss_st1); +`SCOPE_ASSIGN (scope_dirty_st1, dirty_st1); +`SCOPE_ASSIGN (scope_force_miss_st1, force_request_miss_st1); +`SCOPE_ASSIGN (scope_stall_pipe, stall_bank_pipe); -`SCOPE_ASSIGN (scope_bank_addr_st0, `LINE_TO_BYTE_ADDR(qual_addr_st0, BANK_ID)); -`SCOPE_ASSIGN (scope_bank_addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); -`SCOPE_ASSIGN (scope_bank_addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); +`SCOPE_ASSIGN (scope_addr_st0, `LINE_TO_BYTE_ADDR(qual_addr_st0, BANK_ID)); +`SCOPE_ASSIGN (scope_addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); +`SCOPE_ASSIGN (scope_addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); endmodule diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index afcdf612..bf3f55df 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -51,15 +51,15 @@ module VX_cache #( parameter DRAM_TAG_WIDTH = 28, // Number of snoop forwarding requests - parameter NUM_SNP_REQUESTS = 2, + parameter NUM_SNP_REQUESTS = 1, // Snooping request tag width - parameter SNP_REQ_TAG_WIDTH = 28, + parameter SNP_REQ_TAG_WIDTH = 1, // Snooping forward tag width parameter SNP_FWD_TAG_WIDTH = 1 ) ( - `SCOPE_SIGNALS_CACHE_IO + `SCOPE_IO_VX_cache input wire clk, input wire reset, @@ -167,7 +167,7 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_miss; assign miss_vec = per_bank_miss; - `SCOPE_SIGNALS_CACHE_BANK_SELECT + wire snp_req_valid_qual; wire [`DRAM_ADDR_WIDTH-1:0] snp_req_addr_qual; @@ -376,7 +376,7 @@ module VX_cache #( .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS), .SNP_REQ_TAG_WIDTH (SNP_REQ_TAG_WIDTH) 
) bank ( - `SCOPE_SIGNALS_CACHE_BANK_BIND + `SCOPE_BIND_VX_cache_bank(i) .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_cache_config.vh b/hw/rtl/cache/VX_cache_config.vh index 1237c1c6..b8fe1d46 100644 --- a/hw/rtl/cache/VX_cache_config.vh +++ b/hw/rtl/cache/VX_cache_config.vh @@ -2,7 +2,6 @@ `define VX_CACHE_CONFIG `include "VX_platform.vh" -`include "VX_scope.vh" `ifdef DBG_CORE_REQ_INFO `include "VX_define.vh" diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 7cf8b1c8..7cd20e43 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -91,7 +91,7 @@ module VX_cache_core_rsp_merge #( .clk (clk), .reset (reset), .stall (stall), - .flush (0), + .flush (1'b0), .in ({core_rsp_valid_unqual, core_rsp_data_unqual, core_rsp_tag_unqual}), .out ({core_rsp_valid, core_rsp_data, core_rsp_tag}) ); diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index 61c4bcfb..f56d638e 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -56,8 +56,9 @@ module VX_cache_miss_resrv #( output wire miss_resrv_is_snp_st0, output wire miss_resrv_snp_invalidate_st0 ); - reg [`MRVQ_METADATA_WIDTH-1:0] metadata_table[MRVQ_SIZE-1:0]; + wire [`MRVQ_METADATA_WIDTH-1:0] metadata_table; reg [MRVQ_SIZE-1:0][`LINE_ADDR_WIDTH-1:0] addr_table; + reg [MRVQ_SIZE-1:0] valid_table; reg [MRVQ_SIZE-1:0] ready_table; reg [`LOG2UP(MRVQ_SIZE)-1:0] schedule_ptr; @@ -66,13 +67,13 @@ module VX_cache_miss_resrv #( reg [`LOG2UP(MRVQ_SIZE+1)-1:0] size; - `STATIC_ASSERT(MRVQ_SIZE > 5, "invalid size") + `STATIC_ASSERT(MRVQ_SIZE > 5, ("invalid size")) assign miss_resrv_full = (size == $bits(size)'(MRVQ_SIZE)); assign miss_resrv_stop = (size > $bits(size)'(MRVQ_SIZE-5)); // need to add 5 cycles to prevent pipeline lock - wire enqueue_possible = !miss_resrv_full; - wire [`LOG2UP(MRVQ_SIZE)-1:0] enqueue_index = tail_ptr; + wire enqueue_possible = 
!miss_resrv_full; + wire [`LOG2UP(MRVQ_SIZE)-1:0] enqueue_index = tail_ptr; reg [MRVQ_SIZE-1:0] make_ready; reg [MRVQ_SIZE-1:0] make_ready_push; @@ -85,11 +86,11 @@ module VX_cache_miss_resrv #( assign pending_hazard_st1 = |(valid_address_match); - wire dequeue_possible = valid_table[schedule_ptr] && ready_table[schedule_ptr]; + wire dequeue_possible = valid_table[schedule_ptr] && ready_table[schedule_ptr]; wire [`LOG2UP(MRVQ_SIZE)-1:0] dequeue_index = schedule_ptr; assign miss_resrv_valid_st0 = dequeue_possible; - assign miss_resrv_addr_st0 = addr_table[dequeue_index]; + assign miss_resrv_addr_st0 = addr_table[dequeue_index]; assign {miss_resrv_data_st0, miss_resrv_tid_st0, miss_resrv_tag_st0, @@ -97,7 +98,7 @@ module VX_cache_miss_resrv #( miss_resrv_byteen_st0, miss_resrv_wsel_st0, miss_resrv_is_snp_st0, - miss_resrv_snp_invalidate_st0} = metadata_table[dequeue_index]; + miss_resrv_snp_invalidate_st0} = metadata_table; wire mrvq_push = miss_add && enqueue_possible && !is_mrvq; wire mrvq_pop = miss_resrv_pop && dequeue_possible; @@ -124,13 +125,12 @@ module VX_cache_miss_resrv #( valid_table[enqueue_index] <= 1; ready_table[enqueue_index] <= mrvq_init_ready_state; addr_table[enqueue_index] <= miss_add_addr; - metadata_table[enqueue_index] <= {miss_add_data, miss_add_tid, miss_add_tag, miss_add_rw, miss_add_byteen, miss_add_wsel, miss_add_is_snp, miss_add_snp_invalidate}; - tail_ptr <= tail_ptr + 1; + tail_ptr <= tail_ptr + $bits(tail_ptr)'(1); end else if (increment_head) begin valid_table[head_ptr] <= 0; - head_ptr <= head_ptr + 1; + head_ptr <= head_ptr + $bits(head_ptr)'(1); end else if (recover_state) begin - schedule_ptr <= schedule_ptr - 1; + schedule_ptr <= schedule_ptr - $bits(schedule_ptr)'(1); end // update entry as 'ready' during DRAM fill response @@ -140,20 +140,36 @@ module VX_cache_miss_resrv #( if (mrvq_pop) begin ready_table[dequeue_index] <= 0; - schedule_ptr <= schedule_ptr + 1; + schedule_ptr <= schedule_ptr + $bits(schedule_ptr)'(1); end if 
(!(mrvq_push && increment_head)) begin if (mrvq_push) begin - size <= size + 1; + size <= size + $bits(size)'(1); end if (increment_head) begin - size <= size - 1; + size <= size - $bits(size)'(1); end end end end + VX_dp_ram #( + .DATAW(`MRVQ_METADATA_WIDTH), + .SIZE(MRVQ_SIZE), + .BYTEENW(1), + .BUFFERED(0), + .RWCHECK(1) + ) metadata_ram ( + .clk(clk), + .waddr(enqueue_index), + .raddr(dequeue_index), + .wren(mrvq_push), + .rden(1'b1), + .din({miss_add_data, miss_add_tid, miss_add_tag, miss_add_rw, miss_add_byteen, miss_add_wsel, miss_add_is_snp, miss_add_snp_invalidate}), + .dout(metadata_table) + ); + `ifdef DBG_PRINT_CACHE_MSRQ always @(posedge clk) begin if (mrvq_push || mrvq_pop || increment_head || recover_state) begin diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index d1a7433a..bffc4679 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -37,7 +37,7 @@ module VX_snp_forwarder #( input wire [NUM_REQUESTS-1:0][`LOG2UP(SNRQ_SIZE)-1:0] snp_fwdin_tag, output wire [NUM_REQUESTS-1:0] snp_fwdin_ready ); - `STATIC_ASSERT(NUM_REQUESTS > 1, "invalid value") + `STATIC_ASSERT(NUM_REQUESTS > 1, ("invalid value")) reg [`REQS_BITS:0] pending_cntrs [SNRQ_SIZE-1:0]; diff --git a/hw/rtl/cache/VX_tag_data_access.v b/hw/rtl/cache/VX_tag_data_access.v index a1b80838..6d6d8572 100644 --- a/hw/rtl/cache/VX_tag_data_access.v +++ b/hw/rtl/cache/VX_tag_data_access.v @@ -183,15 +183,15 @@ module VX_tag_data_access #( if (valid_req_st1) begin if ((| use_write_enable)) begin if (writefill_st1) begin - $display("%t: cache%0d:%0d store-fill: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1, writeladdr_st1, writetag_st1, use_write_data); + $display("%t: cache%0d:%0d data-fill: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, 
debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1, writeladdr_st1, writetag_st1, use_write_data); end else begin - $display("%t: cache%0d:%0d store-write: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1, writeladdr_st1, writetag_st1, wordsel_st1, writeword_st1); + $display("%t: cache%0d:%0d data-write: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1, writeladdr_st1, writetag_st1, wordsel_st1, writeword_st1); end end else if (miss_st1) begin - $display("%t: cache%0d:%0d store-miss: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1); + $display("%t: cache%0d:%0d data-miss: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1); end else begin - $display("%t: cache%0d:%0d store-read: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1, readaddr_st1, qual_read_tag_st1, wordsel_st1, qual_read_data_st1); + $display("%t: cache%0d:%0d data-read: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1, debug_pc_st1, debug_tagid_st1, debug_rd_st1, dirty_st1, readaddr_st1, qual_read_tag_st1, wordsel_st1, qual_read_data_st1); end end end diff --git a/hw/rtl/cache/VX_tag_data_store.v b/hw/rtl/cache/VX_tag_data_store.v index e0f356cc..d3a022b2 100644 --- a/hw/rtl/cache/VX_tag_data_store.v +++ b/hw/rtl/cache/VX_tag_data_store.v @@ -6,7 +6,7 @@ module VX_tag_data_store #( // Size of line inside a bank in 
bytes parameter BANK_LINE_SIZE = 0, // Number of banks {1, 2, 4, 8,...} - parameter NUM_BANKS = 0, + parameter NUM_BANKS = 0, //unused parameter? // Size of a word in bytes parameter WORD_SIZE = 0 ) ( @@ -30,7 +30,6 @@ module VX_tag_data_store #( input wire fill_sent ); - reg [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0][7:0] data [`BANK_LINE_COUNT-1:0]; reg [`TAG_SELECT_BITS-1:0] tag [`BANK_LINE_COUNT-1:0]; reg [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0] dirtyb[`BANK_LINE_COUNT-1:0]; reg [`BANK_LINE_COUNT-1:0] dirty; @@ -40,8 +39,7 @@ module VX_tag_data_store #( assign read_dirty = dirty [read_addr]; assign read_dirtyb = dirtyb [read_addr]; assign read_tag = tag [read_addr]; - assign read_data = data [read_addr]; - + wire do_write = (| write_enable); always @(posedge clk) begin @@ -69,15 +67,26 @@ module VX_tag_data_store #( if (invalidate) begin valid[write_addr] <= 0; end - - for (integer j = 0; j < `BANK_LINE_WORDS; j++) begin - for (integer i = 0; i < WORD_SIZE; i++) begin - if (write_enable[j][i]) begin - data[write_addr][j][i] <= write_data[j * `WORD_WIDTH + i * 8 +: 8]; - end - end - end end end -endmodule \ No newline at end of file + wire [(`BANK_LINE_WORDS * WORD_SIZE)-1:0] ram_wren; + assign ram_wren = write_enable & {(`BANK_LINE_WORDS * WORD_SIZE){!stall_bank_pipe}}; + + VX_dp_ram #( + .DATAW(`BANK_LINE_WORDS * WORD_SIZE * 8), + .SIZE(`BANK_LINE_COUNT), + .BYTEENW(`BANK_LINE_WORDS * WORD_SIZE), + .BUFFERED(0), + .RWCHECK(1) + ) dp_ram ( + .clk(clk), + .waddr(write_addr), + .raddr(read_addr), + .wren(ram_wren), + .rden(1'b1), + .din(write_data), + .dout(read_data) + ); + +endmodule diff --git a/hw/rtl/libs/VX_dp_ram.v b/hw/rtl/libs/VX_dp_ram.v new file mode 100644 index 00000000..01a0a167 --- /dev/null +++ b/hw/rtl/libs/VX_dp_ram.v @@ -0,0 +1,145 @@ +`include "VX_platform.vh" + +module VX_dp_ram #( + parameter DATAW = 1, + parameter SIZE = 1, + parameter BYTEENW = 1, + parameter BUFFERED = 1, + parameter RWCHECK = 1, + parameter RWBYPASS = 0, + parameter ADDRW = 
$clog2(SIZE), + parameter SIZEW = $clog2(SIZE+1) +) ( + input wire clk, + input wire [ADDRW-1:0] waddr, + input wire [ADDRW-1:0] raddr, + input wire [BYTEENW-1:0] wren, + input wire rden, + input wire [DATAW-1:0] din, + output wire [DATAW-1:0] dout +); + + if (BUFFERED) begin + + reg [DATAW-1:0] mem [SIZE-1:0]; + reg [DATAW-1:0] dout_r; + + if (BYTEENW > 1) begin + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8]; + end + end + end else begin + always @(posedge clk) begin + if (wren) + mem[waddr] <= din; + end + end + + always @(posedge clk) begin + if (rden) + dout_r <= mem[raddr]; + end + + if (RWBYPASS) begin + reg [DATAW-1:0] din_r; + wire writing; + + if (BYTEENW > 1) begin + assign writing = (| wren); + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + din_r[i * 8 +: 8] <= wren[i] ? din[i * 8 +: 8] : mem[waddr][i * 8 +: 8]; + end + end + end else begin + assign writing = wren; + always @(posedge clk) begin + din_r <= din; + end + end + + reg bypass_r; + always @(posedge clk) begin + bypass_r <= writing && (raddr == waddr); + end + + assign dout = bypass_r ? din_r : dout_r; + end else begin + assign dout = dout_r; + end + + end else begin + + `UNUSED_VAR(rden) + + if (RWCHECK) begin + + reg [DATAW-1:0] mem [SIZE-1:0]; + + if (BYTEENW > 1) begin + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8]; + end + end + end else begin + always @(posedge clk) begin + if (wren) + mem[waddr] <= din; + end + end + + if (RWBYPASS) begin + reg [DATAW-1:0] din_r; + wire writing; + + if (BYTEENW > 1) begin + assign writing = (| wren); + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + din_r[i * 8 +: 8] <= wren[i] ? 
din[i * 8 +: 8] : mem[waddr][i * 8 +: 8]; + end + end + end else begin + assign writing = wren; + always @(posedge clk) begin + din_r <= din; + end + end + + reg bypass_r; + always @(posedge clk) begin + bypass_r <= writing && (raddr == waddr); + end + + assign dout = bypass_r ? din_r : mem[raddr]; + end else begin + assign dout = mem[raddr]; + end + + end else begin + + `NO_RW_RAM_CHECK reg [DATAW-1:0] mem [SIZE-1:0]; + + if (BYTEENW > 1) begin + always @(posedge clk) begin + for (integer i = 0; i < BYTEENW; i++) begin + if (wren[i]) + mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8]; + end + end + end else begin + always @(posedge clk) begin + if (wren) + mem[waddr] <= din; + end + end + assign dout = mem[raddr]; + end + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 4164fdad..bb5010b7 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -3,7 +3,7 @@ module VX_generic_queue #( parameter DATAW = 1, parameter SIZE = 2, - parameter BUFFERED = 0, + parameter BUFFERED = 1, parameter ADDRW = $clog2(SIZE), parameter SIZEW = $clog2(SIZE+1) ) ( @@ -17,30 +17,26 @@ module VX_generic_queue #( output wire full, output wire [SIZEW-1:0] size ); - `STATIC_ASSERT(`ISPOW2(SIZE), "must be 0 or power of 2!") - - reg [SIZEW-1:0] size_r; - wire reading; - wire writing; - - assign reading = pop && !empty; - assign writing = push && !full; - - if (SIZE == 1) begin // (SIZE == 1) + `STATIC_ASSERT(`ISPOW2(SIZE), ("must be 0 or power of 2!")) + + if (SIZE == 1) begin reg [DATAW-1:0] head_r; + reg size_r; always @(posedge clk) begin if (reset) begin head_r <= 0; size_r <= 0; end else begin - if (writing && !reading) begin + if (push && !pop) begin + assert(!full); size_r <= 1; - end else if (reading && !writing) begin + end else if (pop && !push) begin + assert(!empty); size_r <= 0; end - if (writing) begin + if (push) begin head_r <= data_in; end end @@ -51,15 +47,14 @@ module 
VX_generic_queue #( assign full = (size_r != 0); assign size = size_r; - end else begin // (SIZE > 1) + end else begin - `USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0]; - - if (0 == BUFFERED) begin + if (0 == BUFFERED) begin reg [ADDRW:0] rd_ptr_r; reg [ADDRW:0] wr_ptr_r; - + reg [ADDRW-1:0] used_r; + wire [ADDRW-1:0] rd_ptr_a = rd_ptr_r[ADDRW-1:0]; wire [ADDRW-1:0] wr_ptr_a = wr_ptr_r[ADDRW-1:0]; @@ -67,96 +62,126 @@ module VX_generic_queue #( if (reset) begin rd_ptr_r <= 0; wr_ptr_r <= 0; - size_r <= 0; + used_r <= 0; end else begin - if (writing) begin - data[wr_ptr_a] <= data_in; - wr_ptr_r <= wr_ptr_r + 1; - if (!reading) begin - size_r <= size_r + 1; + if (push) begin + assert(!full); + wr_ptr_r <= wr_ptr_r + (ADDRW+1)'(1); + if (!pop) begin + used_r <= used_r + ADDRW'(1); end end - - if (reading) begin - rd_ptr_r <= rd_ptr_r + 1; - if (!writing) begin - size_r <= size_r - 1; + if (pop) begin + assert(!empty); + rd_ptr_r <= rd_ptr_r + (ADDRW+1)'(1); + if (!push) begin + used_r <= used_r - ADDRW'(1); end end end - end + end - assign data_out = data[rd_ptr_a]; - assign empty = (wr_ptr_r == rd_ptr_r); - assign full = (wr_ptr_a == rd_ptr_a) && (wr_ptr_r[ADDRW] != rd_ptr_r[ADDRW]); - assign size = size_r; + VX_dp_ram #( + .DATAW(DATAW), + .SIZE(SIZE), + .BUFFERED(0), + .RWCHECK(1) + ) dp_ram ( + .clk(clk), + .waddr(wr_ptr_a), + .raddr(rd_ptr_a), + .wren(push), + .rden(pop), + .din(data_in), + .dout(data_out) + ); + + assign empty = (wr_ptr_r == rd_ptr_r); + assign full = (wr_ptr_a == rd_ptr_a) && (wr_ptr_r[ADDRW] != rd_ptr_r[ADDRW]); + assign size = {full, used_r}; end else begin - reg [DATAW-1:0] head_r; - reg [DATAW-1:0] curr_r; + wire [DATAW-1:0] dout; + + reg [DATAW-1:0] din_r; reg [ADDRW-1:0] wr_ptr_r; reg [ADDRW-1:0] rd_ptr_r; - reg [ADDRW-1:0] rd_ptr_next_r; + reg [ADDRW-1:0] rd_ptr_n_r; + reg [ADDRW-1:0] used_r; reg empty_r; reg full_r; reg bypass_r; always @(posedge clk) begin - if (reset) begin - size_r <= 0; - head_r <= 0; - curr_r <= 0; - wr_ptr_r <= 
0; - rd_ptr_r <= 0; - rd_ptr_next_r <= 1; - empty_r <= 1; - full_r <= 0; + if (reset) begin + wr_ptr_r <= 0; + rd_ptr_r <= 0; + rd_ptr_n_r <= 1; + empty_r <= 1; + full_r <= 0; + used_r <= 0; end else begin - if (writing) begin - data[wr_ptr_r] <= data_in; - wr_ptr_r <= wr_ptr_r + 1; + if (push) begin + wr_ptr_r <= wr_ptr_r + ADDRW'(1); - if (!reading) begin + if (!pop) begin empty_r <= 0; - if (size_r == ($bits(size_r)'(SIZE-1))) begin + if (used_r == ADDRW'(SIZE-1)) begin full_r <= 1; end - size_r <= size_r + 1; + used_r <= used_r + ADDRW'(1); end end - if (reading) begin - rd_ptr_r <= rd_ptr_next_r; + if (pop) begin + rd_ptr_r <= rd_ptr_n_r; if (SIZE > 2) begin - rd_ptr_next_r <= rd_ptr_r + $bits(rd_ptr_r)'(2); + rd_ptr_n_r <= rd_ptr_r + ADDRW'(2); end else begin // (SIZE == 2); - rd_ptr_next_r <= ~rd_ptr_next_r; + rd_ptr_n_r <= ~rd_ptr_n_r; end - if (!writing) begin - if (size_r == 1) begin - assert(rd_ptr_next_r == wr_ptr_r); + if (!push) begin + full_r <= 0; + if (used_r == ADDRW'(1)) begin + assert(rd_ptr_n_r == wr_ptr_r); empty_r <= 1; - end; - full_r <= 0; - size_r <= size_r - 1; + end; + used_r <= used_r - ADDRW'(1); end end - - bypass_r <= writing - && (empty_r || ((1 == size_r) && reading)); // empty or about to go empty - - curr_r <= data_in; - head_r <= data[reading ? rd_ptr_next_r : rd_ptr_r]; end - end + end - assign data_out = bypass_r ? curr_r : head_r; + always @(posedge clk) begin + if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin + bypass_r <= 1; + din_r <= data_in; + end else if (pop) + bypass_r <= 0; + end + + VX_dp_ram #( + .DATAW(DATAW), + .SIZE(SIZE), + .BUFFERED(1), + .RWCHECK(0) + ) dp_ram ( + .clk(clk), + .waddr(wr_ptr_r), + .raddr(rd_ptr_n_r), + .wren(push), + .rden(pop), + .din(data_in), + .dout(dout) + ); + + assign data_out = bypass_r ? 
din_r : dout; assign empty = empty_r; assign full = full_r; - assign size = size_r; + assign size = {full_r, used_r}; end end diff --git a/hw/rtl/libs/VX_index_queue.v b/hw/rtl/libs/VX_index_queue.v index bee8ccb9..b40aa2a0 100644 --- a/hw/rtl/libs/VX_index_queue.v +++ b/hw/rtl/libs/VX_index_queue.v @@ -28,9 +28,13 @@ module VX_index_queue #( assign empty = (wr_ptr == rd_ptr); assign full = (wr_a == rd_a) && (wr_ptr[`LOG2UP(SIZE)] != rd_ptr[`LOG2UP(SIZE)]); - assign enqueue = push && !full; + assign enqueue = push; assign dequeue = !empty && !valid[rd_a]; // auto-remove when head is invalid + always @(*) begin + assert(!push || !full); + end + always @(posedge clk) begin if (reset) begin rd_ptr <= 0; diff --git a/hw/rtl/libs/VX_scope.v b/hw/rtl/libs/VX_scope.v index 2b924776..8b089259 100644 --- a/hw/rtl/libs/VX_scope.v +++ b/hw/rtl/libs/VX_scope.v @@ -18,7 +18,7 @@ module VX_scope #( input wire bus_write, input wire bus_read ); - localparam DELTA_ENABLE = (UPDW != 0); + localparam UPDW_ENABLE = (UPDW != 0); localparam MAX_DELTA = (2 ** DELTAW) - 1; localparam CMD_GET_VALID = 3'd0; @@ -27,19 +27,22 @@ module VX_scope #( localparam CMD_GET_COUNT = 3'd3; localparam CMD_SET_DELAY = 3'd4; localparam CMD_SET_STOP = 3'd5; - localparam CMD_RESERVED1 = 3'd6; + localparam CMD_GET_OFFSET= 3'd6; localparam CMD_RESERVED2 = 3'd7; - localparam GET_VALID = 2'd0; - localparam GET_DATA = 2'd1; - localparam GET_WIDTH = 2'd2; - localparam GET_COUNT = 2'd3; + localparam GET_VALID = 3'd0; + localparam GET_DATA = 3'd1; + localparam GET_WIDTH = 3'd2; + localparam GET_COUNT = 3'd3; + localparam GET_OFFSET = 3'd6; + + `NO_RW_RAM_CHECK reg [DATAW-1:0] data_store [SIZE-1:0]; + `NO_RW_RAM_CHECK reg [DELTAW-1:0] delta_store [SIZE-1:0]; - reg [DATAW-1:0] data_store [SIZE-1:0]; - reg [DELTAW-1:0] delta_store [SIZE-1:0]; reg [UPDW-1:0] prev_trigger_id; reg [DELTAW-1:0] delta; reg [BUSW-1:0] bus_out_r; + reg [63:0] timestamp, start_time; reg [`CLOG2(SIZE)-1:0] raddr, waddr, waddr_end; @@ -49,8 
+52,7 @@ module VX_scope #( reg [BUSW-3:0] delay_val, delay_cntr; - reg [1:0] out_cmd; - + reg [2:0] get_cmd; wire [2:0] cmd_type; wire [BUSW-4:0] cmd_data; assign {cmd_data, cmd_type} = bus_in; @@ -59,7 +61,7 @@ module VX_scope #( always @(posedge clk) begin if (reset) begin - out_cmd <= $bits(out_cmd)'(CMD_GET_VALID); + get_cmd <= $bits(get_cmd)'(CMD_GET_VALID); raddr <= 0; waddr <= 0; waddr_end <= $bits(waddr)'(SIZE-1); @@ -74,13 +76,18 @@ module VX_scope #( read_offset <= 0; read_delta <= 0; data_valid <= 0; + timestamp <= 0; end else begin + + timestamp <= timestamp + 1; + if (bus_write) begin case (cmd_type) CMD_GET_VALID, CMD_GET_DATA, CMD_GET_WIDTH, - CMD_GET_COUNT: out_cmd <= $bits(out_cmd)'(cmd_type); + CMD_GET_OFFSET, + CMD_GET_COUNT: get_cmd <= $bits(get_cmd)'(cmd_type); CMD_SET_DELAY: delay_val <= $bits(delay_val)'(cmd_data); CMD_SET_STOP: waddr_end <= $bits(waddr)'(cmd_data); default:; @@ -92,8 +99,10 @@ module VX_scope #( delta_flush <= 1; if (0 == delay_val) begin start_wait <= 0; - recording <= 1; - delay_cntr <= 0; + recording <= 1; + delta <= 0; + delay_cntr <= 0; + start_time <= timestamp; end else begin start_wait <= 1; recording <= 0; @@ -106,26 +115,29 @@ module VX_scope #( if (1 == delay_cntr) begin start_wait <= 0; recording <= 1; + delta <= 0; + start_time <= timestamp; end end if (recording) begin - if (DELTA_ENABLE) begin + if (UPDW_ENABLE) begin if (delta_flush || changed || (trigger_id != prev_trigger_id)) begin - data_store[waddr] <= data_in; delta_store[waddr] <= delta; - waddr <= waddr + 1; + data_store[waddr] <= data_in; + waddr <= waddr + $bits(waddr)'(1); delta <= 0; delta_flush <= 0; end else begin - delta <= delta + 1; + delta <= delta + DELTAW'(1); delta_flush <= (delta == (MAX_DELTA-1)); end prev_trigger_id <= trigger_id; end else begin - data_store[waddr] <= data_in; + delta_store[waddr] <= 0; + data_store[waddr] <= data_in; waddr <= waddr + 1; end @@ -134,12 +146,12 @@ module VX_scope #( waddr <= waddr; // keep last address 
recording <= 0; data_valid <= 1; - read_delta <= DELTA_ENABLE; + read_delta <= 1; end end if (bus_read - && (out_cmd == GET_DATA) + && (get_cmd == GET_DATA) && data_valid) begin if (read_delta) begin read_delta <= 0; @@ -148,16 +160,16 @@ module VX_scope #( if (read_offset < $bits(read_offset)'(DATAW-BUSW)) begin read_offset <= read_offset + $bits(read_offset)'(BUSW); end else begin - raddr <= raddr + 1; + raddr <= raddr + $bits(raddr)'(1); read_offset <= 0; - read_delta <= DELTA_ENABLE; + read_delta <= 1; if (raddr == waddr) begin data_valid <= 0; end end end else begin raddr <= raddr + 1; - read_delta <= DELTA_ENABLE; + read_delta <= 1; if (raddr == waddr) begin data_valid <= 0; end @@ -168,11 +180,14 @@ module VX_scope #( end always @(*) begin - case (out_cmd) + case (get_cmd) GET_VALID : bus_out_r = BUSW'(data_valid); GET_WIDTH : bus_out_r = BUSW'(DATAW); GET_COUNT : bus_out_r = BUSW'(waddr) + BUSW'(1); + GET_OFFSET: bus_out_r = BUSW'(start_time); + /* verilator lint_off WIDTH */ GET_DATA : bus_out_r = read_delta ? 
BUSW'(delta_store[raddr]) : BUSW'(data_store[raddr] >> read_offset); + /* verilator lint_on WIDTH */ default : bus_out_r = 0; endcase end @@ -182,7 +197,7 @@ module VX_scope #( `ifdef DBG_PRINT_SCOPE always @(posedge clk) begin if (bus_read) begin - $display("%t: scope-read: cmd=%0d, out=%0h, addr=%0d", $time, out_cmd, bus_out, raddr); + $display("%t: scope-read: cmd=%0d, addr=%0d, value=%0h", $time, get_cmd, raddr, bus_out); end if (bus_write) begin $display("%t: scope-write: cmd=%0d, value=%0d", $time, cmd_type, cmd_data); diff --git a/hw/scripts/gen_synth_configs.py b/hw/scripts/gen_synth_configs.py deleted file mode 100755 index 0b42d88a..00000000 --- a/hw/scripts/gen_synth_configs.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -import os -import glob - -config_location = 'configs' - -name_template = '{clusters}cl-{cores}c-{warps}w-{threads}t-{l2}Kl2-{dcache}Kd-{icache}Ki{name_suffix}.sh' - -template = """ - -export V_NT={threads} -export V_NW={warps} -export V_NUM_CORES_PER_CLUSTER={cores} -export V_NUM_CLUSTERS={clusters} -export V_DCACHE_SIZE_BYTES={dcachek} -export V_ICACHE_SIZE_BYTES={icachek} - -# L2 Cache size -export V_L2CACHE_SIZE_BYTES={l2k} - -{codegen} - -""" - -# cluster, cores, warps, threads, l2, dcache, icache -configs = [ - (1, 2, 8, 4, 8, 4, 1), - (1, 2, 8, 8, 8, 4, 1), - (1, 2, 8, 8, 16, 8, 1), - - (1, 4, 8, 8, 16, 4, 1), - (1, 4, 8, 8, 16, 8, 1), - (1, 4, 16, 8, 16, 8, 1), - - (2, 4, 8, 4, 8, 4, 1), - (2, 4, 8, 8, 16, 8, 1), -] - -files = glob.glob(config_location + '/*.sh') -for f in files: - os.remove(f) - -for clusters, cores, warps, threads, l2, dcache, icache in configs: - l2k, dcachek, icachek = 1024 * l2, 1024 * dcache, 1024 * icache - name_suffix = '' - with open(config_location + '/' + name_template.format(**locals()), 'w') as f: - codegen = '' - f.write(template.format(**locals())) diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json new file mode 100644 index 00000000..da7cfe73 --- /dev/null +++ 
b/hw/scripts/scope.json @@ -0,0 +1,195 @@ +{ + "version": 1, + "includes":[ + "../rtl/VX_config.vh", + "../rtl/VX_platform.vh", + "../rtl/VX_define.vh", + "../rtl/cache/VX_cache_config.vh" + ], + "modules": { + "top": { + "submodules": { + "vortex": {"type":"Vortex"} + } + }, + "Vortex": { + "submodules": { + "cluster": {"type":"VX_cluster", "count":"`NUM_CLUSTERS"}, + "l3cache": {"type":"VX_cache", "enabled":"`L3_ENABLE", "params":{"NUM_BANKS":"`L3NUM_BANKS"}} + } + }, + "VX_cluster": { + "submodules": { + "core": {"type":"VX_core", "count":"`NUM_CORES", "enabled":true}, + "l2cache": {"type":"VX_cache", "enabled":"`L2_ENABLE", "params":{"NUM_BANKS":"`L2NUM_BANKS"}} + } + }, + "VX_core": { + "submodules": { + "pipeline": {"type":"VX_pipeline", "enabled":true}, + "mem_unit": {"type":"VX_mem_unit", "enabled":true} + } + }, + "VX_pipeline": { + "submodules": { + "fetch": {"type":"VX_fetch", "enabled":true}, + "decode": {"type":"VX_decode", "enabled":true}, + "issue": {"type":"VX_issue", "enabled":true}, + "execute": {"type":"VX_execute", "enabled":true}, + "commit": {"type":"VX_commit", "enabled":true} + } + }, + "VX_fetch": { + "submodules": { + "warp_sched": {"type":"VX_warp_sched"}, + "icache_stage": {"type":"VX_icache_stage"} + } + }, + "VX_warp_sched": {}, + "VX_icache_stage": {}, + "VX_decode": {}, + "VX_issue": {}, + "VX_execute": { + "submodules": { + "lsu_unit": {"type":"VX_lsu_unit"}, + "gpu_unit": {"type":"VX_gpu_unit"} + } + }, + "VX_commit": {}, + "VX_lsu_unit": {}, + "VX_gpu_unit": {}, + "VX_mem_unit": { + "submodules": { + "smem": {"type":"VX_cache", "params":{"NUM_BANKS":"`SNUM_BANKS"}}, + "dcache": {"type":"VX_cache", "params":{"NUM_BANKS":"`DNUM_BANKS"}}, + "icache": {"type":"VX_cache", "params":{"NUM_BANKS":"`INUM_BANKS"}} + } + }, + "VX_cache": { + "submodules": { + "bank": {"type":"VX_bank", "count":"NUM_BANKS"} + } + }, + "VX_bank": {} + }, + "taps": { + "top": { + "!reset": 1, + "?dram_req_valid": 1, + "dram_req_addr": 32, + "dram_req_rw": 1, + 
"dram_req_byteen":"`VX_DRAM_BYTEEN_WIDTH", + "dram_req_data":"`VX_DRAM_LINE_WIDTH", + "dram_req_tag":"`VX_DRAM_TAG_WIDTH", + "?dram_req_ready": 1, + "?dram_rsp_valid": 1, + "dram_rsp_data":"`VX_DRAM_LINE_WIDTH", + "dram_rsp_tag":"`VX_DRAM_TAG_WIDTH", + "?dram_rsp_ready": 1, + "?snp_req_valid": 1, + "snp_req_addr": 32, + "snp_req_invalidate": 1, + "snp_req_tag":"`VX_SNP_TAG_WIDTH", + "?snp_req_ready": 1, + "?snp_rsp_valid": 1, + "snp_rsp_tag":"`VX_SNP_TAG_WIDTH", + "?snp_rsp_ready": 1, + "busy": 1 + }, + "top/vortex/cluster/core/pipeline/fetch/icache_stage": { + "?icache_req_valid": 1, + "icache_req_wid":"`NW_BITS", + "icache_req_addr": 32, + "icache_req_tag":"`ICORE_TAG_ID_BITS", + "?icache_req_ready": 1, + "?icache_rsp_valid": 1, + "icache_rsp_data": 32, + "icache_rsp_tag":"`ICORE_TAG_ID_BITS", + "?icache_rsp_ready": 1 + }, + "top/vortex/cluster/core/pipeline/fetch/warp_sched": { + "?wsched_scheduled_warp": 1, + "wsched_active_warps": "`NUM_WARPS", + "wsched_schedule_table": "`NUM_WARPS", + "wsched_schedule_ready": "`NUM_WARPS", + "wsched_warp_to_schedule": "`NW_BITS", + "wsched_warp_pc": "32" + }, + "top/vortex/cluster/core/pipeline/execute/gpu_unit": { + "?gpu_req_valid": 1, + "gpu_req_wid": "`NW_BITS", + "gpu_req_tmask": "`NUM_THREADS", + "gpu_req_op_type": "`GPU_BITS", + "gpu_req_rs1": "32", + "gpu_req_rs2": "32", + "?gpu_req_ready": 1, + "?gpu_rsp_valid": 1, + "gpu_rsp_wid": "`NW_BITS", + "gpu_rsp_tmc": "`GPU_TMC_SIZE", + "gpu_rsp_wspawn": "`GPU_WSPAWN_SIZE", + "gpu_rsp_split": "`GPU_SPLIT_SIZE", + "gpu_rsp_barrier": "`GPU_BARRIER_SIZE" + }, + "top/vortex/cluster/core/pipeline/execute/lsu_unit": { + "?dcache_req_valid":"`NUM_THREADS", + "dcache_req_wid":"`NW_BITS", + "dcache_req_pc": 32, + "dcache_req_addr":"`NUM_THREADS * 32", + "dcache_req_rw": 1, + "dcache_req_byteen":"`NUM_THREADS * 4", + "dcache_req_data": "`NUM_THREADS * 32", + "dcache_req_tag":"`DCORE_TAG_ID_BITS", + "?dcache_req_ready": 1, + "?dcache_rsp_valid":"`NUM_THREADS", + 
"dcache_rsp_data":"`NUM_THREADS * 32", + "dcache_rsp_tag":"`DCORE_TAG_ID_BITS", + "?dcache_rsp_ready": 1 + }, + "top/vortex/cluster/core/pipeline/issue": { + "?issue_valid": 1, + "issue_wid":"`NW_BITS", + "issue_tmask":"`NUM_THREADS", + "issue_pc": 32, + "issue_ex_type":"`EX_BITS", + "issue_op_type":"`OP_BITS", + "issue_op_mod":"`MOD_BITS", + "issue_wb": 1, + "issue_rd":"`NR_BITS", + "issue_rs1":"`NR_BITS", + "issue_rs2":"`NR_BITS", + "issue_rs3":"`NR_BITS", + "issue_imm": 32, + "issue_rs1_is_pc": 1, + "issue_rs2_is_imm": 1, + "?issue_ready": 1, + "?gpr_rsp_valid": 1, + "gpr_rsp_wid":"`NW_BITS", + "gpr_rsp_pc": 32, + "gpr_rsp_a":"`NUM_THREADS * 32", + "gpr_rsp_b":"`NUM_THREADS * 32", + "gpr_rsp_c":"`NUM_THREADS * 32", + "!gpr_delay": 1, + "?writeback_valid": 1, + "writeback_wid":"`NW_BITS", + "writeback_pc": 32, + "writeback_rd":"`NR_BITS", + "writeback_data":"`NUM_THREADS * 32", + "!scoreboard_delay": 1, + "!execute_delay": 1 + }, + "top/vortex/l3cache/bank, top/vortex/cluster/l2cache/bank, top/vortex/cluster/core/mem_unit/dcache/bank, top/vortex/cluster/core/mem_unit/icache/bank, top/vortex/cluster/core/mem_unit/smem/bank": { + "?valid_st0": 1, + "?valid_st1": 1, + "?valid_st2": 1, + "addr_st0": 32, + "addr_st1": 32, + "addr_st2": 32, + "is_mrvq_st1": 1, + "miss_st1": 1, + "dirty_st1": 1, + "!force_miss_st1": 1, + "!stall_pipe": 1 + } + } + } + \ No newline at end of file diff --git a/hw/scripts/scope.py b/hw/scripts/scope.py new file mode 100755 index 00000000..8be8381f --- /dev/null +++ b/hw/scripts/scope.py @@ -0,0 +1,830 @@ +#!/usr/bin/env python3 +import os +import re +import json +import argparse +import math + +vl_include_re = re.compile(r"^\s*`include\s+\"(.+)\"") +vl_define_re = re.compile(r"^\s*`define\s+(\w+)(\([\w\s,]*\))?(.*)") +vl_ifdef_re = re.compile(r"^\s*`(ifdef|ifndef|elsif)\s+(\w+)\s*$") +vl_endif_re = re.compile(r"^\s*`(endif|else)\s*$") +vl_expand_re = re.compile(r"`([0-9a-zA-Z_]+)") + +exclude_files = [] +include_dirs = [] +macros = [] 
+br_stack = [] + +def translate_ternary(text): + + def skip_space(text, i, ln, step): + while (i >= 0) and (i < ln): + c = text[i] + if not c.isspace(): + break + i += step + return i + + def skip_expr(text, i, ln, step): + paren = 0 + checkparen = True + while (i >= 0) and (i < ln): + c = text[i] + if checkparen and (((step < 0) and (c == ')')) or ((step > 0) and (c == '('))): + paren += 1 + elif checkparen and (((step < 0) and (c == '(')) or ((step > 0) and (c == ')'))): + if (0 == paren): + break + paren -= 1 + if (0 == paren): + i = skip_space(text, i + step, ln, step) + checkparen = False + continue + elif (0 == paren) and not (c.isalnum() or (c == '_')): + break + i += step + return (i - step) + + def parse_ternary(text): + ternary = None + ln = len(text) + for i in range(1, ln): + c = text[i] + if not (c == '?'): + continue + # parse condition expression + i0 = skip_space(text, i - 1, ln, -1) + if (i < 0): + raise Exception("invalid condition expression") + i1 = skip_expr(text, i0, ln, -1) + if (i1 > i0): + raise Exception("invalid condition expression") + # parse true expression + i2 = skip_space(text, i + 1, ln, 1) + if (i2 >= ln): + raise Exception("invalid true expression") + i3 = skip_expr(text, i2, ln, 1) + if (i3 < i2): + raise Exception("invalid true expression") + # parse colon + i4 = skip_space(text, i3 + 1, ln, 1) + if (i4 >= ln): + raise Exception("invalid colon") + if not (text[i4] == ':'): + raise Exception("missing colon") + # parse false expression + i5 = skip_space(text, i4 + 1, ln, 1) + if (i5 >= ln): + raise Exception("invalid false expression") + i6 = skip_expr(text, i5, ln, 1) + if (i6 < i5): + raise Exception("invalid false expression") + ternary = (i0, i1, i2, i3, i5, i6) + break + return ternary + + while True: + pos = parse_ternary(text) + if pos is None: + break + # convert to python ternary + newText = text[:pos[1]] + text[pos[2]:pos[3]+1] + " if " + text[pos[1]:pos[0]+1] + " else " + text[pos[4]:pos[5]+1] + text[pos[5]+1:] + text 
= newText + + return text + +def parse_func_args(text): + args = [] + arg = '' + l = len(text) + if text[0] != '(': + raise Exception("missing leading parenthesis: " + text) + paren = 1 + for i in range(1, l): + c = text[i] + if c == '(': + paren += 1 + elif c == ')': + if paren == 0: + raise Exception("mismatched parenthesis: (" + i + ") " + text) + paren -= 1 + if paren == 0: + l = i + break + if c == ',' and paren == 1: + if arg.strip(): + args.append(arg) + arg = '' + else: + arg += c + if paren != 0: + raise Exception("missing closing parenthesis: " + text) + if arg.strip(): + args.append(arg) + + return (args, l) + +def resolve_include_path(filename, parent_dir): + if os.path.basename(filename) in exclude_files: + return None + if os.path.isfile(filename): + return os.path.abspath(filename) + search_dirs = include_dirs + if parent_dir: + search_dirs.append(parent_dir) + for dir in search_dirs: + filepath = os.path.join(dir, filename) + if os.path.isfile(filepath): + return os.path.abspath(filepath) + raise Exception("couldn't find include file: " + filename) + +def remove_comments(text): + text = re.sub(re.compile("/\*.*?\*/",re.DOTALL ), "", text) # multiline + text = re.sub(re.compile("//.*?\n" ), "\n", text) # singleline + return text + +def add_macro(name, args, value): + macro = (name, args, value) + macros.append(macro) + if not args is None: + print("*** token: " + name + "(", end='') + for i in range(len(args)): + if i > 0: + print(', ', end='') + print(args[i], end='') + print(")=" + value) + else: + print("*** token: " + name + "=" + value) + +def find_macro(name): + for macro in macros: + if macro[0] == name: + return macro + return None + +def expand_text(text, params): + + def re_pattern_args(args): + p = "(? 
0: + p += "|" + p += arg + i += 1 + p += ")(?![0-9a-zA-Z_])" + return p + + class DoReplParam(object): + def __init__(self, params): + self.params = params + self.expanded = False + def __call__(self, match): + name = match.group(1) + self.expanded = True + return self.params[name] + + class DoReplMacro(object): + def __init__(self): + self.expanded = False + self.has_func = False + def __call__(self, match): + name = match.group(1) + macro = find_macro(name) + if macro: + if not macro[1] is None: + self.has_func = True + else: + self.expanded = True + return macro[2] + return "`" + name + + def repl_func_macro(text): + expanded = False + match = re.search(vl_expand_re, text) + if match: + name = match.group(1) + macro = find_macro(name) + if macro: + args = macro[1] + value = macro[2] + if not args is None: + str_args = text[match.end():].strip() + f_args = parse_func_args(str_args) + if len(args) == 0: + if len(f_args[0]) != 0: + raise Exception("invalid argments for macro '" + name + "': value=" + text) + else: + if len(args) != len(f_args[0]): + raise Exception("mismatch number of argments for macro '" + name + "': actual=" + len(f_args[0]) + ", expected=" + len(args)) + + pattern = re_pattern_args(args) + params = {} + for i in range(len(args)): + params[args[i]] = f_args[0][i] + dorepl = DoReplParam(params) + value = re.sub(pattern, dorepl, value) + + str_head = text[0:match.start()] + str_tail = text[match.end() + f_args[1]+1:] + text = str_head + value + str_tail + expanded = True + if expanded: + return text + return None + + changed = False + iter = 0 + + while True: + if iter > 99: + raise Exception("Macro recursion!") + has_func = False + while True: + params_updated = False + if not params is None: + do_repl = DoReplParam(params) + pattern = re_pattern_args(params) + new_text = re.sub(pattern, do_repl, text) + if do_repl.expanded: + text = new_text + params_updated = True + do_repl = DoReplMacro() + new_text = re.sub(vl_expand_re, do_repl, text) + 
has_func = do_repl.has_func + if not (params_updated or do_repl.expanded): + break + text = new_text + changed = True + if not has_func: + break + expanded = repl_func_macro(text) + if not expanded: + break + text = expanded + changed = True + iter += 1 + + if changed: + return text + return None + +def parse_include(filename, nesting): + if nesting > 99: + raise Exception("include recursion!") + print("*** parsing '" + filename + "'...") + content = None + with open(filename, "r") as f: + content = f.read() + # remove comments + content = remove_comments(content) + # parse content + prev_line = None + for line in content.splitlines(False): + # skip empty lines + if re.match(re.compile(r'^\s*$'), line): + continue + # merge multi-line lines + if line.endswith('\\'): + if prev_line: + prev_line += line[:len(line) - 1] + else: + prev_line = line[:len(line) - 1] + continue + if prev_line: + line = prev_line + line + prev_line = None + # parse ifdef + m = re.match(vl_ifdef_re, line) + if m: + key = m.group(1) + cond = m.group(2) + taken = find_macro(cond) is not None + if key == 'ifndef': + taken = not taken + elif key == '"elsif': + br_stack.pop() + br_stack.append(taken) + print("*** " + key + "(" + cond + ") => " + str(taken)) + continue + # parse endif + m = re.match(vl_endif_re, line) + if m: + key = m.group(1) + top = br_stack.pop() + if key == 'else': + br_stack.append(not top) + print("*** " + key) + continue + # skip disabled blocks + if not all(br_stack): + continue + + # parse include + m = re.match(vl_include_re, line) + if m: + include = m.group(1) + include = resolve_include_path(include, os.path.dirname(filename)) + if include: + parse_include(include, nesting + 1) + continue + # parse define + m = re.match(vl_define_re, line) + if m: + name = m.group(1) + args = m.group(2) + if args: + args = args[1:len(args)-1].strip() + if args != '': + args = args.split(',') + for i in range(len(args)): + args[i] = args[i].strip() + else: + args = [] + value = 
m.group(3) + add_macro(name, args, value.strip()) + continue + +def parse_includes(includes): + # change current directory to include directory + old_dir = os.getcwd() + script_dir = os.path.dirname(os.path.realpath(__file__)) + os.chdir(script_dir) + + for include in includes: + parse_include(include, 0) + + # restore current directory + os.chdir(old_dir) + +def load_include_dirs(dirs): + for dir in dirs: + print("*** include dir: " + dir) + include_dirs.append(dir) + +def load_defines(defines): + for define in defines: + key_value = define.split('=', 2) + name = key_value[0] + value = '' + if len(key_value) == 2: + value = key_value[1] + add_macro(name, None, value) + +def load_config(filename): + with open(filename, "r") as f: + config = json.load(f) + print("condfig=", config) + return config + +def eval_node(text, params): + def clog2(x): + l2 = math.log2(x) + cl = math.ceil(l2) + return int(cl) + + if not type(text) == str: + return text + + expanded = expand_text(text, params) + if expanded: + text = expanded + + try: + __text = text.replace('$clog2', '__clog2') + __text = translate_ternary(__text) + e = eval(__text, {'__clog2': clog2}) + return e + except (NameError, SyntaxError): + return text + +def gen_vl_header(file, modules, taps): + + header = ''' +`ifndef VX_SCOPE_DEFS +`define VX_SCOPE_DEFS +''' + footer = '`endif' + + def signal_size(size, mn): + if type(size) == int: + if (size != mn): + return "[" + str(size-1) + ":0]" + else: + return "" + else: + return "[" + size + "-1:0]" + + def create_signal(key, ports): + if not key in ports: + ports[key] = [] + return ports[key] + + def dic_insert(gdic, ldic, key, value, enabled): + if enabled: + ldic[key] = value + if key in gdic: + return False + if enabled: + gdic[key] = None + return True + + def trigger_name(name, size): + if type(size) == int: + if size != 1: + return "(| " + name + ")" + else: + return name + else: + return "(| " + name + ")" + + def trigger_subscripts(asize): + def Q(arr, ss, 
asize, idx, N): + a = asize[idx] + if (a != 0): + for i in range(a): + tmp = ss + '[' + str(i) + ']' + if (idx + 1) < N: + Q(arr, tmp, asize, idx + 1, N) + else: + arr.append(tmp) + else: + if (idx + 1) < N: + Q(arr, ss, asize, idx + 1, N) + else: + arr.append(ss) + + if asize is None: + return [""] + ln = len(asize) + if (0 == ln): + return [""] + arr = [] + Q(arr, "", asize, 0, ln) + return arr + + + def visit_path(alltaps, ports, ntype, paths, modules, taps): + curtaps = {} + + if (len(paths) != 0): + spath = paths.pop(0) + snodes = modules[ntype]["submodules"] + if not spath in snodes: + raise Exception("invalid path: " + spath + " in " + ntype) + + snode = snodes[spath] + + stype = snode["type"] + + enabled = True + if "enabled" in snode: + enabled = eval_node(snode["enabled"], None) + + subtaps = visit_path(alltaps, ports, stype, paths, modules, taps) + + scount = 0 + if "count" in snode: + scount = eval_node(snode["count"], None) + + params = None + if "params" in snode: + params = snode["params"] + + new_staps = [] + + nn = "SCOPE_IO_" + ntype + pp = create_signal(nn, ports) + + for key in subtaps: + subtap = subtaps[key] + s = subtap[0] + a = subtap[1] + t = subtap[2] + + aa = [scount] + sa = signal_size(scount, 0) + if a: + for i in a: + x = eval_node(i, params) + aa.append(x) + sa += signal_size(x, 0) + + if dic_insert(alltaps, curtaps, spath + '/' + key, (s, aa, t), enabled): + skey = key.replace('/', '_') + if enabled: + pp.append("\toutput wire" + sa + signal_size(s, 1) + " scope_" + spath + '_' + skey + ',') + new_staps.append(skey) + + ports[nn] = pp + + if (0 == scount): + nn = "SCOPE_BIND_" + ntype + '_' + spath + pp = create_signal(nn, ports) + + for st in new_staps: + if enabled: + pp.append("\t.scope_" + st + "(scope_" + spath + '_' + st + "),") + else: + pp.append("\t`UNUSED_PIN (scope_" + st + "),") + + ports[nn] = pp + else: + nn = "SCOPE_BIND_" + ntype + '_' + spath + "(__i__)" + pp = create_signal(nn, ports) + + for st in new_staps: + if 
enabled: + pp.append("\t.scope_" + st + "(scope_" + spath + '_' + st + "[__i__]),") + else: + pp.append("\t`UNUSED_PIN (scope_" + st + "),") + + ports[nn] = pp + else: + nn = "SCOPE_IO_" + ntype + pp = create_signal(nn, ports) + + for tk in taps: + trigger = 0 + name = tk + size = eval_node(taps[tk], None) + if name[0] == '!': + name = name[1:] + trigger = 1 + elif name[0] == '?': + name = name[1:] + trigger = 2 + if dic_insert(alltaps, curtaps, name, (size, None, trigger), True): + pp.append("\toutput wire" + signal_size(size, 1) + " scope_" + name + ',') + + ports[nn] = pp + + return curtaps + + toptaps = {} + + with open(file, 'w') as f: + + ports = {} + alltaps = {} + + for key in taps: + skey_list = key.split(',') + _taps = taps[key] + for skey in skey_list: + print('processing node: ' + skey + ' ...') + paths = skey.strip().split('/') + ntype = paths.pop(0) + curtaps = visit_path(alltaps, ports, ntype, paths, modules, _taps) + for tk in curtaps: + toptaps[tk] = curtaps[tk] + + print(header, file=f) + + for key in ports: + print("`define " + key + ' \\', file=f) + for port in ports[key]: + print(port + ' \\', file=f) + print("", file=f) + + print("`define SCOPE_DECL_SIGNALS \\", file=f) + i = 0 + for key in toptaps: + tap = toptaps[key] + name = key.replace('/', '_') + size = tap[0] + asize = tap[1] + sa = "" + if asize: + for a in asize: + sa += signal_size(a, 0) + if i > 0: + print(" \\", file=f) + print('\t wire' + sa + signal_size(size, 1) + " scope_" + name + ';', file=f, end='') + i += 1 + print("", file=f) + print("", file=f) + + print("`define SCOPE_DATA_LIST \\", file=f) + i = 0 + for key in toptaps: + tap = toptaps[key] + trigger = tap[2] + if trigger != 0: + continue + name = key.replace('/', '_') + if i > 0: + print(", \\", file=f) + print("\t scope_" + name, file=f, end='') + i += 1 + print("", file=f) + print("", file=f) + + print("`define SCOPE_UPDATE_LIST \\", file=f) + i = 0 + for key in toptaps: + tap = toptaps[key] + trigger = tap[2] + if 
trigger == 0: + continue + name = key.replace('/', '_') + if i > 0: + print(", \\", file=f) + print("\t scope_" + name, file=f, end='') + i += 1 + print("", file=f) + print("", file=f) + + print("`define SCOPE_TRIGGER \\", file=f) + i = 0 + excluded_list = [] + for key in toptaps: + if key in excluded_list: + continue + tap = toptaps[key] + if tap[2] != 2: + continue + size = tap[0] + asize = tap[1] + sus = trigger_subscripts(asize) + for su in sus: + if i > 0: + print(" | \\", file=f) + print("\t(", file=f, end='') + name = trigger_name("scope_" + key.replace('/', '_') + su, size) + if key.endswith("_valid"): + ready_signal = key[:-6] + "_ready" + if ready_signal in toptaps: + rname = trigger_name("scope_" + ready_signal.replace('/', '_') + su, size) + print(name + " && " + rname, file=f, end='') + excluded_list.append(ready_signal) + else: + print(name, file=f, end='') + else: + print(name, file=f, end='') + print(")", file=f, end='') + i += 1 + print("", file=f) + print("", file=f) + + print(footer, file=f) + + return toptaps + +def gen_cc_header(file, taps): + + header = ''' +#pragma once + +struct scope_module_t { + const char* name; + int index; + int parent; +}; + +struct scope_tap_t { + int width; + const char* name; + int module; +}; +''' + def flatten_path(paths, sizes): + def Q(arr, ss, idx, N, paths, sizes): + size = sizes[idx] + if size != 0: + for i in range(sizes[idx]): + tmp = ss + ('/' if (ss != '') else '') + tmp += paths[idx] + '_' + str(i) + if (idx + 1) < N: + Q(arr, tmp, idx + 1, N, paths, sizes) + else: + arr.append(tmp) + else: + tmp = ss + ('/' if (ss != '') else '') + tmp += paths[idx] + if (idx + 1) < N: + Q(arr, tmp, idx + 1, N, paths, sizes) + else: + arr.append(tmp) + + arr = [] + Q(arr, "", 0, len(asize), paths, asize) + return arr + + # flatten the taps + fdic = {} + for key in taps: + tap = taps[key] + size = str(tap[0]) + trigger = tap[2] + if (trigger != 0): + continue + paths = key.split('/') + if (len(paths) > 1): + name = 
paths.pop(-1) + asize = tap[1] + for ss in flatten_path(paths, asize): + fdic[ss + '/' + name ] = [size, 0] + else: + fdic[key] = [size, 0] + for key in taps: + tap = taps[key] + size = str(tap[0]) + trigger = tap[2] + if (trigger == 0): + continue + paths = key.split('/') + if (len(paths) > 1): + name = paths.pop(-1) + asize = tap[1] + for ss in flatten_path(paths, asize): + fdic[ss + '/' + name ] = [size, 0] + else: + fdic[key] = [size, 0] + + # generate module dic + mdic = {} + mdic["*"] = ("*", 0, -1) + for key in fdic: + paths = key.split('/') + if len(paths) == 1: + continue + paths.pop(-1) + parent = 0 + mk = "" + for path in paths: + mk += '/' + path + if not mk in mdic: + index = len(mdic) + mdic[mk] = (path, index, parent) + parent = index + else: + parent = mdic[mk][1] + fdic[key][1] = parent + + with open(file, 'w') as f: + print(header, file=f) + + print("static constexpr scope_module_t scope_modules[] = {", file=f) + i = 0 + for key in mdic: + m = mdic[key] + if i > 0: + print(',', file=f) + print("\t{\"" + m[0] + "\", " + str(m[1]) + ", " + str(m[2]) + "}", file=f, end='') + i += 1 + print("", file=f) + print("};", file=f) + + print("", file=f) + print("static constexpr scope_tap_t scope_taps[] = {", file=f) + i = 0 + for key in fdic: + size = fdic[key][0] + parent = fdic[key][1] + paths = key.split('/') + if len(paths) > 1: + name = paths.pop(-1) + else: + name = key + if i > 0: + print(',', file=f) + print("\t{" + size + ", \"" + name + "\", " + str(parent) + "}", file=f, end='') + i += 1 + print("", file=f) + print("};", file=f) + +def main(): + parser = argparse.ArgumentParser(description='Scope headers generator.') + parser.add_argument('-vl', nargs='?', default='scope-defs.vh', metavar='file', help='Output Verilog header') + parser.add_argument('-cc', nargs='?', default='scope-defs.h', metavar='file', help='Output C++ header') + parser.add_argument('-D', nargs='?', action='append', metavar='macro[=value]', help='define macro') + 
parser.add_argument('-I', nargs='?', action='append', metavar='', help='include directory') + parser.add_argument('config', help='Json config file') + args = parser.parse_args() + print("args=", args) + + global exclude_files + global include_dirs + global macros + global br_stack + + if args.I: + load_include_dirs(args.I) + + if args.D: + load_defines(args.D) + + config = load_config(args.config) + + exclude_files.append(os.path.basename(args.vl)) + + if "includes" in config: + parse_includes(config["includes"]) + + taps = gen_vl_header(args.vl, config["modules"], config["taps"]) + gen_cc_header(args.cc, taps) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index 2b78f017..88ac722c 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -13,6 +13,8 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSRQ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE +DBG_PRINT_FLAGS += -DDBG_CORE_REQ_INFO +DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO @@ -42,7 +44,7 @@ gen-s: verilator $(VF) -DNDEBUG $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG $(SINGLECORE)' gen-sd: - verilator $(VF) -O0 $(SINGLECORE) -CFLAGS '$(CF) -O0 -g $(DBG) $(SINGLECORE)' --trace $(DBG) + verilator $(VF) -O0 $(SINGLECORE) -CFLAGS '$(CF) -O0 -g $(DBG) $(SINGLECORE)' --trace-fst --trace-threads 1 $(DBG) gen-st: verilator $(VF) -DNDEBUG $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(SINGLECORE)' --threads $(THREADS) @@ -51,7 +53,7 @@ gen-m: verilator $(VF) -DNDEBUG $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG $(MULTICORE)' gen-md: - verilator $(VF) $(MULTICORE) -CFLAGS '$(CF) -O0 -g $(DBG) $(MULTICORE)' --trace $(DBG) + verilator $(VF) $(MULTICORE) -CFLAGS '$(CF) -O0 -g $(DBG) $(MULTICORE)' --trace-fst --trace-threads 1 $(DBG) gen-mt: verilator $(VF) -DNDEBUG $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(MULTICORE)' --threads $(THREADS) @@ 
-75,11 +77,12 @@ build-mt: gen-mt (cd obj_dir && make -j -f VVortex.mk) run: run-s + run-s: build-s (cd obj_dir && ./VVortex) run-sd: build-sd - (cd obj_dir && valgrind ./VVortex) + (cd obj_dir && ./VVortex) run-st: build-st (cd obj_dir && ./VVortex) diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 60fde196..4f6403e7 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -28,15 +28,11 @@ Simulator::Simulator() { ram_ = nullptr; vortex_ = new VVortex(); - dram_rsp_active_ = false; - snp_req_active_ = false; - csr_req_active_ = false; - #ifdef VCD_OUTPUT Verilated::traceEverOn(true); - trace_ = new VerilatedVcdC(); + trace_ = new VerilatedFstC(); vortex_->trace(trace_, 99); - trace_->open("trace.vcd"); + trace_->open("trace.fst"); #endif // reset the device @@ -66,12 +62,35 @@ void Simulator::reset() { std::cout << timestamp << ": [sim] reset()" << std::endl; #endif - vortex_->reset = 1; - this->step(); - vortex_->reset = 0; - + print_bufs_.clear(); dram_rsp_vec_.clear(); + dram_rsp_active_ = false; + snp_req_active_ = false; + csr_req_active_ = false; + + snp_req_size_ = 0; + pending_snp_reqs_ = 0; + csr_rsp_value_ = nullptr; + + vortex_->dram_rsp_valid = 0; + vortex_->dram_req_ready = 0; + vortex_->io_req_ready = 0; + vortex_->io_rsp_valid = 0; + vortex_->snp_req_valid = 0; + vortex_->snp_rsp_ready = 0; + vortex_->csr_io_req_valid = 0; + vortex_->csr_io_rsp_ready = 0; + + vortex_->reset = 1; + + vortex_->clk = 0; + this->eval(); + vortex_->clk = 1; + this->eval(); + + vortex_->reset = 0; + // Turn on assertion after reset Verilated::assertOn(true); } @@ -79,10 +98,9 @@ void Simulator::reset() { void Simulator::step() { vortex_->clk = 0; this->eval(); - vortex_->clk = 1; this->eval(); - + this->eval_dram_bus(); this->eval_io_bus(); this->eval_csr_bus(); @@ -104,14 +122,13 @@ void Simulator::eval_dram_bus() { } // schedule DRAM responses - int dequeue_index = -1; - for (int i = 0; i < dram_rsp_vec_.size(); i++) { - if 
(dram_rsp_vec_[i].cycles_left > 0) { - dram_rsp_vec_[i].cycles_left -= 1; + std::list::iterator dram_rsp_it(dram_rsp_vec_.end()); + for (auto it = dram_rsp_vec_.begin(), ie = dram_rsp_vec_.end(); it != ie; ++it) { + if (it->cycles_left > 0) { + it->cycles_left -= 1; } - if ((dequeue_index == -1) - && (dram_rsp_vec_[i].cycles_left == 0)) { - dequeue_index = i; + if ((dram_rsp_it == ie) && (it->cycles_left == 0)) { + dram_rsp_it = it; } } @@ -122,11 +139,11 @@ void Simulator::eval_dram_bus() { dram_rsp_active_ = false; } if (!dram_rsp_active_) { - if (dequeue_index != -1) { + if (dram_rsp_it != dram_rsp_vec_.end()) { vortex_->dram_rsp_valid = 1; - memcpy((uint8_t*)vortex_->dram_rsp_data, dram_rsp_vec_[dequeue_index].block.data(), GLOBAL_BLOCK_SIZE); - vortex_->dram_rsp_tag = dram_rsp_vec_[dequeue_index].tag; - dram_rsp_vec_.erase(dram_rsp_vec_.begin() + dequeue_index); + memcpy((uint8_t*)vortex_->dram_rsp_data, dram_rsp_it->block.data(), GLOBAL_BLOCK_SIZE); + vortex_->dram_rsp_tag = dram_rsp_it->tag; + dram_rsp_vec_.erase(dram_rsp_it); dram_rsp_active_ = true; } else { vortex_->dram_rsp_valid = 0; @@ -161,7 +178,7 @@ void Simulator::eval_dram_bus() { dram_req.cycles_left = DRAM_LATENCY; dram_req.tag = vortex_->dram_req_tag; ram_->read(vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE, GLOBAL_BLOCK_SIZE, dram_req.block.data()); - dram_rsp_vec_.push_back(dram_req); + dram_rsp_vec_.emplace_back(dram_req); } } } @@ -199,7 +216,7 @@ void Simulator::eval_snp_bus() { #endif } if (vortex_->snp_req_valid && vortex_->snp_req_ready) { - if (snp_req_size_) { + if (snp_req_size_ != 0) { vortex_->snp_req_addr += 1; vortex_->snp_req_tag += 1; --snp_req_size_; @@ -272,7 +289,7 @@ void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) { vortex_->snp_req_valid = 1; vortex_->snp_rsp_ready = 1; - snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE; + snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE; --snp_req_size_; pending_snp_reqs_ = 1; diff --git 
a/hw/simulate/simulator.h b/hw/simulate/simulator.h index cfea9bec..0dcf8a3b 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -5,13 +5,14 @@ #include "verilated.h" #ifdef VCD_OUTPUT -#include +#include #endif #include #include "ram.h" #include +#include #include #include #include @@ -62,7 +63,7 @@ private: void eval_csr_bus(); void eval_snp_bus(); - std::vector dram_rsp_vec_; + std::list dram_rsp_vec_; bool dram_rsp_active_; bool snp_req_active_; @@ -75,6 +76,6 @@ private: RAM *ram_; VVortex *vortex_; #ifdef VCD_OUTPUT - VerilatedVcdC *trace_; + VerilatedFstC *trace_; #endif }; \ No newline at end of file diff --git a/hw/syn/quartus/project.tcl b/hw/syn/quartus/project.tcl index 93102ec5..0e85bf48 100644 --- a/hw/syn/quartus/project.tcl +++ b/hw/syn/quartus/project.tcl @@ -33,7 +33,7 @@ set_global_assignment -name TOP_LEVEL_ENTITY $opts(top) set_global_assignment -name PROJECT_OUTPUT_DIRECTORY bin set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009 -set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS OFF +set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON set_global_assignment -name VERILOG_MACRO QUARTUS set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG @@ -43,17 +43,19 @@ set_global_assignment -name VERILOG_MACRO FPU_FAST set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)" -set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" -set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON -set_global_assignment -name FITTER_EFFORT "STANDARD FIT" set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON 
-set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON set_global_assignment -name POWER_USE_TA_VALUE 65 set_global_assignment -name SEED 1 +set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON +set_global_assignment -name FITTER_EFFORT "STANDARD FIT" +set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" +set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED +set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM +set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" set idx 0 foreach arg $q_args_orig { diff --git a/hw/syn/quartus/top/Makefile b/hw/syn/quartus/top/Makefile index 6258682f..544cea65 100644 --- a/hw/syn/quartus/top/Makefile +++ b/hw/syn/quartus/top/Makefile @@ -51,7 +51,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" + quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/yosys/synth.ys b/hw/syn/yosys/synth.ys index 958f0353..f3ac0b0e 100644 --- a/hw/syn/yosys/synth.ys +++ b/hw/syn/yosys/synth.ys @@ -1,17 +1,22 @@ +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_bypass_buffer.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_cam_buffer.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_countones.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_divide.v -read_verilog -sv 
-I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_encoder_onehot.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_elastic_buffer.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_fair_arbiter.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_fixed_arbiter.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_generic_queue.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_generic_register.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_generic_stack.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_indexable_queue.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_index_queue.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_matrix_arbiter.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_mult.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_multiplier.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_onehot_encooder.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_priority_encoder.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_rr_arbiter.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_scope.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces 
-I../../rtl ../../rtl/libs/VX_serial_div.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_shift_register.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_skid_buffer.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_bank.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_bank_core_req_arb.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache.v @@ -20,114 +25,72 @@ read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I.. read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_dram_fill_arb.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_dram_req_arb.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_miss_resrv.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_prefetcher.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_snp_forwarder.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_snp_rsp_arb.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_tag_data_access.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_tag_data_structure.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_backend_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_branch_rsp_if.v 
+read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_tag_data_store.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_alu_req_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_branch_ctl_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_core_req_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_core_rsp_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_dram_req_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_dram_rsp_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_snp_req_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_snp_rsp_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cmt_to_csr_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_csr_io_req_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_csr_io_rsp_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_csr_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_exec_unit_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_gpr_read_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces 
-I../../rtl ../../rtl/interfaces/VX_gpu_inst_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_inst_meta_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_jal_rsp_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_csr_to_issue_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_decode_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_exu_to_cmt_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_fpu_req_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_fpu_to_cmt_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_fpu_to_csr_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_gpr_req_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_gpr_rsp_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_gpu_req_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_ifetch_req_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_ifetch_rsp_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_join_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_lsu_req_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache 
-I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_mul_req_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_warp_ctl_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_wb_if.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_writeback_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_wstall_if.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_alu_unit.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_back_end.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_cluster.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_commit.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_core.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_csr_arb.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_csr_data.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_csr_io_arb.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_csr_pipe.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_d_e_reg.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_csr_unit.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_dcache_arb.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl 
../../rtl/VX_decode.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_exec_unit.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_f_d_reg.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_execute.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_fetch.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_front_end.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_fpu_unit.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_gpr_bypass.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_gpr_fp_ctrl.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_gpr_ram.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_gpr_stage.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_gpr_wrapper.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_gpu_inst.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_i_d_reg.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_gpu_unit.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_ibuffer.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_icache_stage.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_inst_multiplex.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache 
-I../../rtl/interfaces -I../../rtl ../../rtl/VX_instr_demux.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_io_arb.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_ipdom_stack.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_issue.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_lsu_unit.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_mem_arb.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_mem_unit.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_mul_unit.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_pipeline.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_scheduler.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_user_config.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_warp.v +read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_scoreboard.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_warp_sched.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/VX_writeback.v read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/Vortex.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_bank.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_bank_core_req_arb.v -read_verilog -sv 
-I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_core_req_bank_sel.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_core_rsp_merge.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_dram_fill_arb.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_dram_req_arb.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_cache_miss_resrv.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_prefetcher.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_snp_forwarder.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_snp_rsp_arb.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_tag_data_access.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/cache/VX_tag_data_structure.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_backend_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_branch_rsp_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_core_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_core_rsp_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl 
../../rtl/interfaces/VX_cache_dram_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_dram_rsp_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_snp_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_cache_snp_rsp_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_csr_io_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_csr_io_rsp_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_csr_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_exec_unit_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_gpr_read_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_gpu_inst_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_inst_meta_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_jal_rsp_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_join_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_lsu_req_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_warp_ctl_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_wb_if.v -read_verilog -sv -I../../rtl/libs 
-I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/interfaces/VX_wstall_if.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_countones.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_divide.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_encoder_onehot.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_fair_arbiter.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_fixed_arbiter.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_generic_queue.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_generic_register.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_generic_stack.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_indexable_queue.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_matrix_arbiter.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_mult.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_priority_encoder.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_rr_arbiter.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/libs/VX_scope.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl ../../rtl/tex_unit/VX_tex_mgr.v -read_verilog -sv -I../../rtl/libs -I../../rtl/cache -I../../rtl/interfaces -I../../rtl 
../../rtl/tex_unit/VX_tex_unit.v hierarchy -check -top Vortex add -global_input reset 1 proc -global_arst reset