#include "opae_sim.h"

// NOTE(review): the <...> header names in this file were missing from the copy
// under review (only bare `#include` tokens remained). They are restored here
// from the symbols the file uses (Verilated, VerilatedVcdC, RAM, ramulator::*,
// RESET_DELAY, PLATFORM_PARAM_*, std::future/list/queue/unordered_map) and
// from the number of includes in each group -- confirm against the build.
#include <verilated.h>
#include "Vvortex_afu_shim.h"
#include "Vvortex_afu_shim__Syms.h"

#ifdef VCD_OUTPUT
#include <verilated_vcd_c.h>
#endif

#include <iostream>
#include <fstream>
#include <iomanip>
#include <mem.h>

#define RAMULATOR
#include <ramulator/src/Gem5Wrapper.h>
#include <ramulator/src/Request.h>
#include <ramulator/src/Statistics.h>

#include <VX_config.h>
#include <vortex_afu.h>

#include <future>
#include <list>
#include <queue>
#include <unordered_map>

// Number of local memory banks (one Avalon port per bank).
#ifndef MEMORY_BANKS
  #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
    #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS
  #else
    #define MEMORY_BANKS 2
  #endif
#endif

// > 0 : the DRAM model is ticked once every MEM_CYCLE_RATIO device cycles.
// <= 0: the DRAM model is ticked several times per device cycle (see tick()).
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif

// Size in bytes of one local-memory word.
#undef MEM_BLOCK_SIZE
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)

// Size in bytes of one CCI transfer.
#define CACHE_BLOCK_SIZE  64

// CCI response delay: CCI_LATENCY + (timestamp % CCI_RAND_MOD) cycles.
#define CCI_LATENCY  8
#define CCI_RAND_MOD 8

// Capacity of the CCI read/write request queues (almost-full is raised at SIZE-1).
#define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16

#ifndef TRACE_START_TIME
#define TRACE_START_TIME 0ull
#endif

#ifndef TRACE_STOP_TIME
#define TRACE_STOP_TIME -1ull
#endif

#ifndef VERILATOR_RESET_VALUE
#define VERILATOR_RESET_VALUE 2
#endif

#define RAM_PAGE_SIZE 4096

using namespace vortex;

// Global simulation time; advanced by one on every model evaluation.
static uint64_t timestamp = 0;

// Returns the current simulation timestamp.
double sc_time_stamp() {
  return timestamp;
}

// Allocates `size` bytes aligned to `alignment` (which must be a power of two,
// as the mask below requires). The pointer returned by malloc() is stored in
// the slot immediately before the aligned address so that __aligned_free()
// can recover it.
// Returns nullptr if the request overflows size_t or malloc() fails.
static void *__aligned_malloc(size_t alignment, size_t size) {
  // reserve margin for alignment and storing of unaligned address
  size_t margin = (alignment-1) + sizeof(void*);
  if (size > (size_t)-1 - margin)
    return nullptr; // size + margin would wrap around
  void *unaligned_addr = malloc(size + margin);
  if (unaligned_addr == nullptr)
    return nullptr; // do not write the bookkeeping slot through a null base
  void **aligned_addr = (void**)((uintptr_t)(((uint8_t*)unaligned_addr) + margin) & ~(alignment-1));
  aligned_addr[-1] = unaligned_addr;
  return aligned_addr;
}

// Releases a block obtained from __aligned_malloc(). Like free(), a null
// pointer is accepted and ignored.
static void __aligned_free(void *ptr) {
  if (ptr == nullptr)
    return;
  // retrieve the stored unaligned address and use it to free the allocation
  void* unaligned_addr = ((void**)ptr)[-1];
  free(unaligned_addr);
}

///////////////////////////////////////////////////////////////////////////////

static bool trace_enabled = false;
static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;

// Tracing is active while the timestamp lies in [trace_start_time,
// trace_stop_time); outside that window it follows the flag set through
// sim_trace_enable().
bool sim_trace_enabled() {
  if (timestamp >= trace_start_time
   && timestamp < trace_stop_time)
    return true;
  return trace_enabled;
}

// Sets the tracing flag consulted outside the configured time window.
void sim_trace_enable(bool enable) {
  trace_enabled = enable;
}
/////////////////////////////////////////////////////////////////////////////// class opae_sim::Impl { public: Impl() : stop_(false) , host_buffer_ids_(0) { // force random values for unitialized signals Verilated::randReset(VERILATOR_RESET_VALUE); Verilated::randSeed(50); // turn off assertion before reset Verilated::assertOn(false); // create RTL module instance device_ = new Vvortex_afu_shim(); #ifdef VCD_OUTPUT Verilated::traceEverOn(true); trace_ = new VerilatedVcdC(); device_->trace(this->trace, 99); trace_->open("trace.vcd"); #endif ram_ = new RAM(RAM_PAGE_SIZE); // initialize dram simulator ramulator::Config ram_config; ram_config.add("standard", "DDR4"); ram_config.add("channels", std::to_string(MEMORY_BANKS)); ram_config.add("ranks", "1"); ram_config.add("speed", "DDR4_2400R"); ram_config.add("org", "DDR4_4Gb_x8"); ram_config.add("mapping", "defaultmapping"); ram_config.set_core_num(1); dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE); Stats::statlist.output("ramulator.ddr4.log"); // reset the device this->reset(); // launch execution thread future_ = std::async(std::launch::async, [&]{ while (!stop_) { std::lock_guard guard(mutex_); this->tick(); } }); } ~Impl() { stop_ = true; if (future_.valid()) { future_.wait(); } for (auto& buffer : host_buffers_) { __aligned_free(buffer.second.data); } #ifdef VCD_OUTPUT trace_->close(); delete trace_; #endif delete device_; delete ram_; if (dram_) { dram_->finish(); Stats::statlist.printall(); delete dram_; } } int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len); if (alloc == NULL) return -1; host_buffer_t buffer; buffer.data = (uint64_t*)alloc; buffer.size = len; buffer.ioaddr = uintptr_t(alloc); auto buffer_id = host_buffer_ids_++; host_buffers_.emplace(buffer_id, buffer); *buf_addr = alloc; *wsid = buffer_id; return 0; } void release_buffer(uint64_t wsid) { auto it = host_buffers_.find(wsid); if (it != 
host_buffers_.end()) { __aligned_free(it->second.data); host_buffers_.erase(it); } } void get_io_address(uint64_t wsid, uint64_t *ioaddr) { *ioaddr = host_buffers_[wsid].ioaddr; } void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { std::lock_guard guard(mutex_); device_->vcp2af_sRxPort_c0_mmioRdValid = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; this->tick(); device_->vcp2af_sRxPort_c0_mmioRdValid = 0; assert(device_->af2cp_sTxPort_c2_mmioRdValid); *value = device_->af2cp_sTxPort_c2_data; } void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { std::lock_guard guard(mutex_); device_->vcp2af_sRxPort_c0_mmioWrValid = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8); this->tick(); device_->vcp2af_sRxPort_c0_mmioWrValid = 0; } private: void reset() { cci_reads_.clear(); cci_writes_.clear(); device_->vcp2af_sRxPort_c0_mmioRdValid = 0; device_->vcp2af_sRxPort_c0_mmioWrValid = 0; device_->vcp2af_sRxPort_c0_rspValid = 0; device_->vcp2af_sRxPort_c1_rspValid = 0; device_->vcp2af_sRxPort_c0_TxAlmFull = 0; device_->vcp2af_sRxPort_c1_TxAlmFull = 0; for (int b = 0; b < MEMORY_BANKS; ++b) { pending_mem_reqs_[b].clear(); device_->avs_readdatavalid[b] = 0; device_->avs_waitrequest[b] = 0; } device_->reset = 1; for (int i = 0; i < RESET_DELAY; ++i) { device_->clk = 0; this->eval(); device_->clk = 1; this->eval(); } device_->reset = 0; // Turn on assertion after reset Verilated::assertOn(true); } void tick() { this->sRxPort_bus(); this->sTxPort_bus(); this->avs_bus(); if (!dram_queue_.empty()) { if (dram_->send(dram_queue_.front())) dram_queue_.pop(); } device_->clk = 0; this->eval(); device_->clk = 1; this->eval(); if (MEM_CYCLE_RATIO > 0) { auto cycle = timestamp / 2; if 
((cycle % MEM_CYCLE_RATIO) == 0) dram_->tick(); } else { for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) dram_->tick(); } #ifndef NDEBUG fflush(stdout); #endif } void eval() { device_->eval(); #ifdef VCD_OUTPUT if (sim_trace_enabled()) { trace_->dump(timestamp); } #endif ++timestamp; } void sRxPort_bus() { // check mmio request bool mmio_req_enabled = device_->vcp2af_sRxPort_c0_mmioRdValid || device_->vcp2af_sRxPort_c0_mmioWrValid; // schedule CCI read responses std::list::iterator cci_rd_it(cci_reads_.end()); for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) { if (it->cycles_left > 0) it->cycles_left -= 1; if ((cci_rd_it == ie) && (it->cycles_left == 0)) { cci_rd_it = it; } } // schedule CCI write responses std::list::iterator cci_wr_it(cci_writes_.end()); for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) { if (it->cycles_left > 0) it->cycles_left -= 1; if ((cci_wr_it == ie) && (it->cycles_left == 0)) { cci_wr_it = it; } } // send CCI write response device_->vcp2af_sRxPort_c1_rspValid = 0; if (cci_wr_it != cci_writes_.end()) { device_->vcp2af_sRxPort_c1_rspValid = 1; device_->vcp2af_sRxPort_c1_hdr_resp_type = 0; device_->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata; cci_writes_.erase(cci_wr_it); } // send CCI read response (ensure mmio disabled) device_->vcp2af_sRxPort_c0_rspValid = 0; if (!mmio_req_enabled && (cci_rd_it != cci_reads_.end())) { device_->vcp2af_sRxPort_c0_rspValid = 1; device_->vcp2af_sRxPort_c0_hdr_resp_type = 0; memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); device_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]); printf("\n");*/ cci_reads_.erase(cci_rd_it); } } void sTxPort_bus() { // process read requests if (device_->af2cp_sTxPort_c0_valid) { 
assert(!device_->vcp2af_sRxPort_c0_TxAlmFull); cci_rd_req_t cci_req; cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); cci_req.addr = device_->af2cp_sTxPort_c0_hdr_address; cci_req.mdata = device_->af2cp_sTxPort_c0_hdr_mdata; auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE); //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); cci_reads_.emplace_back(cci_req); } // process write requests if (device_->af2cp_sTxPort_c1_valid) { assert(!device_->vcp2af_sRxPort_c1_TxAlmFull); cci_wr_req_t cci_req; cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); cci_req.mdata = device_->af2cp_sTxPort_c1_hdr_mdata; auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE); memcpy(host_ptr, device_->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE); cci_writes_.emplace_back(cci_req); } // check queues overflow device_->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1)); device_->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1)); } void avs_bus() { for (int b = 0; b < MEMORY_BANKS; ++b) { // process memory responses device_->avs_readdatavalid[b] = 0; if (!pending_mem_reqs_[b].empty() && (*pending_mem_reqs_[b].begin())->ready) { auto mem_rd_it = pending_mem_reqs_[b].begin(); auto mem_req = *mem_rd_it; device_->avs_readdatavalid[b] = 1; memcpy(device_->avs_readdata[b], mem_req->data.data(), MEM_BLOCK_SIZE); uint32_t addr = mem_req->addr; pending_mem_reqs_[b].erase(mem_rd_it); delete mem_req; } // process memory requests assert(!device_->avs_read[b] || !device_->avs_write[b]); unsigned byte_addr = device_->avs_address[b] * MEM_BLOCK_SIZE; if (device_->avs_write[b]) { uint64_t byteen = device_->avs_byteenable[b]; uint8_t* data = (uint8_t*)(device_->avs_writedata[b]); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { 
(*ram_)[byte_addr + i] = data[i]; } } /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); } printf("\n");*/ // send dram request ramulator::Request dram_req( byte_addr, ramulator::Request::Type::WRITE, 0 ); dram_queue_.push(dram_req); } if (device_->avs_read[b]) { auto mem_req = new mem_rd_req_t(); mem_req->addr = device_->avs_address[b]; ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE); mem_req->ready = false; pending_mem_reqs_[b].emplace_back(mem_req); /*printf("%0ld: [sim] MEM Rd Req: bank=%d, addr=%x, pending={", timestamp, b, mem_req.addr * MEM_BLOCK_SIZE); for (auto& req : pending_mem_reqs_[b]) { if (req.cycles_left != 0) printf(" !%0x", req.addr * MEM_BLOCK_SIZE); else printf(" %0x", req.addr * MEM_BLOCK_SIZE); } printf("}\n");*/ // send dram request ramulator::Request dram_req( byte_addr, ramulator::Request::Type::READ, std::bind([](ramulator::Request& dram_req, mem_rd_req_t* mem_req) { mem_req->ready = true; }, placeholders::_1, mem_req), 0 ); dram_queue_.push(dram_req); } device_->avs_waitrequest[b] = false; } } typedef struct { bool ready; std::array data; uint32_t addr; } mem_rd_req_t; typedef struct { int cycles_left; std::array data; uint64_t addr; uint32_t mdata; } cci_rd_req_t; typedef struct { int cycles_left; uint32_t mdata; } cci_wr_req_t; typedef struct { uint64_t* data; size_t size; uint64_t ioaddr; } host_buffer_t; std::future future_; bool stop_; std::unordered_map host_buffers_; int64_t host_buffer_ids_; std::list pending_mem_reqs_[MEMORY_BANKS]; std::list cci_reads_; std::list cci_writes_; std::mutex mutex_; RAM *ram_; ramulator::Gem5Wrapper* dram_; std::queue dram_queue_; Vvortex_afu_shim *device_; #ifdef VCD_OUTPUT VerilatedVcdC *trace_; #endif }; /////////////////////////////////////////////////////////////////////////////// opae_sim::opae_sim() : impl_(new Impl()) {} opae_sim::~opae_sim() { delete impl_; } 
// Forwards to Impl::prepare_buffer and returns its status code.
int opae_sim::prepare_buffer(uint64_t len,
                             void **buf_addr,
                             uint64_t *wsid,
                             int flags) {
  const int status = impl_->prepare_buffer(len, buf_addr, wsid, flags);
  return status;
}

// Forwards to Impl::release_buffer.
void opae_sim::release_buffer(uint64_t wsid) {
  impl_->release_buffer(wsid);
}

// Forwards to Impl::get_io_address; the result is written to *ioaddr.
void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) {
  impl_->get_io_address(wsid, ioaddr);
}

// Forwards to Impl::write_mmio64.
void opae_sim::write_mmio64(uint32_t mmio_num,
                            uint64_t offset,
                            uint64_t value) {
  impl_->write_mmio64(mmio_num, offset, value);
}

// Forwards to Impl::read_mmio64; the result is written to *value.
void opae_sim::read_mmio64(uint32_t mmio_num,
                           uint64_t offset,
                           uint64_t *value) {
  impl_->read_mmio64(mmio_num, offset, value);
}