From 5825b7c15a3221c611ec395680705c06ecf5217a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 7 Dec 2021 22:44:06 -0500 Subject: [PATCH] dram simulator fix --- driver/rtlsim/vortex.cpp | 3 +- driver/simx/vortex.cpp | 49 ++++-------- sim/common/simobject.h | 50 +++++++----- sim/rtlsim/main.cpp | 10 +-- sim/rtlsim/processor.cpp | 106 +++++++++++++++---------- sim/rtlsim/processor.h | 4 +- sim/simx/cache.cpp | 109 +++++++++++++++++++------- sim/simx/cache.h | 5 +- sim/simx/constants.h | 12 ++- sim/simx/core.cpp | 132 ++++++++++++++++++------------- sim/simx/core.h | 20 ++--- sim/simx/debug.h | 12 +-- sim/simx/execute.cpp | 162 +++++++++++++++++++-------------------- sim/simx/exeunit.cpp | 91 ++++++++++++---------- sim/simx/exeunit.h | 26 ++++--- sim/simx/ibuffer.h | 5 ++ sim/simx/main.cpp | 37 ++++----- sim/simx/memsim.cpp | 62 +++++++++------ sim/simx/memsim.h | 6 +- sim/simx/pipeline.h | 28 ++++++- sim/simx/processor.cpp | 75 +++++++++--------- sim/simx/processor.h | 16 ++-- sim/simx/scoreboard.h | 13 +++- sim/simx/sharedmem.h | 6 +- sim/simx/tex_unit.cpp | 6 ++ sim/simx/tex_unit.h | 2 + sim/simx/types.h | 75 ++++++++---------- sim/simx/warp.cpp | 30 ++++++-- sim/simx/warp.h | 12 +-- sim/vlsim/opae_sim.cpp | 37 ++++++--- 30 files changed, 702 insertions(+), 499 deletions(-) diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 52c290cd..85f7054c 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -124,8 +124,7 @@ public: future_.wait(); } // start new run - future_ = std::async(std::launch::async, [&]{ - processor_.reset(); + future_ = std::async(std::launch::async, [&]{ processor_.run(); }); return 0; diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 4b086d7e..e1897139 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -8,11 +8,17 @@ #include #include -#include -#include + #include + #include +#include +#include +#include +#include + + using namespace vortex; /////////////////////////////////////////////////////////////////////////////// @@ -59,13 +65,11 @@ public: vx_device() : arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS) , ram_(RAM_PAGE_SIZE) + , processor_(arch_) , mem_allocation_(ALLOC_BASE_ADDR) { - // setup memory simulator - memsim_ = MemSim::Create(MemSim::Config{ - DRAM_CHANNELS, - arch_.num_cores() - }); + // attach memory module + processor_.attach_ram(&ram_); } ~vx_device() { @@ -122,28 +126,7 @@ public: // start new run future_ = std::async(std::launch::async, [&]{ - if (processor_) { - // release current processor instance - processor_->MemReqPort.unbind(); - memsim_->MemRspPort.unbind(); - SimPlatform::instance().release_object(processor_); - } - - // create new processor instance - processor_ = Processor::Create(arch_); - processor_->MemReqPort.bind(&memsim_->MemReqPort); - memsim_->MemRspPort.bind(&processor_->MemRspPort); - - // attach memory object - processor_->attach_ram(&ram_); - - // run simulation - int exitcode; - for (;;) { - SimPlatform::instance().step(); - if (processor_->check_exit(&exitcode)) - break; - }; + processor_.run(); }); return 0; @@ -167,8 +150,7 @@ public: private: ArchDef arch_; RAM ram_; - MemSim::Ptr memsim_; - Processor::Ptr processor_; + Processor processor_; uint64_t mem_allocation_; std::future future_; }; @@ -207,9 +189,6 @@ extern int vx_dev_open(vx_device_h* hdevice) { if (nullptr == hdevice) return -1; - if (!SimPlatform::instance().initialize()) - return -1; - *hdevice = new vx_device(); #ifdef DUMP_PERF_STATS @@ -232,8 +211,6 @@ extern int vx_dev_close(vx_device_h hdevice) { delete device; - SimPlatform::instance().finalize(); - return 0; } diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 2830ea06..eb32302d 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -127,7 +127,7 @@ public: virtual ~SimEventBase() {} - virtual void fire() const = 0; + virtual void fire() const = 0; uint64_t time() const { return time_; @@ -219,15 +219,21 @@ public: const std::string& name() const { return name_; - } - - virtual void step(uint64_t cycle) = 0; + } protected: SimObjectBase(const SimContext& ctx, const char* name); +private: + + virtual void do_reset() = 0; + + virtual void do_tick() = 0; + std::string name_; + + friend class SimPlatform; }; /////////////////////////////////////////////////////////////////////////////// @@ -246,18 +252,22 @@ protected: : SimObjectBase(ctx, name) {} - void step(uint64_t cycle) override { - this->impl().step(cycle); - } - private: - const Impl& impl() const { - return static_cast(*this); + const Impl* impl() const { + return static_cast(this); } - Impl& impl() { - return static_cast(*this); + Impl* impl() { + return static_cast(this); + } + + void do_reset() override { + this->impl()->reset(); + } + + void do_tick() override { + this->impl()->tick(); } }; @@ -282,10 +292,6 @@ public: return true; } - void flush() { - instance().clear(); - } - void finalize() { instance().clear(); } @@ -310,7 +316,15 @@ public: events_.emplace_back(evt); } - void step() { + void reset() { + events_.clear(); + for (auto& object : objects_) { + object->do_reset(); + } + cycles_ = 0; + } + + void tick() { // evaluate events auto evt_it = events_.begin(); auto evt_it_end = events_.end(); @@ -325,7 +339,7 @@ public: } // evaluate components for (auto& object : objects_) { - object->step(cycles_); + object->do_tick(); } // advance clock ++cycles_; diff --git a/sim/rtlsim/main.cpp b/sim/rtlsim/main.cpp index c61fbec8..a3766604 100644 --- a/sim/rtlsim/main.cpp +++ b/sim/rtlsim/main.cpp @@ -49,12 +49,12 @@ int main(int argc, char **argv) { parse_args(argc, argv); - for (auto program : programs) { - std::cout << "Running " << program << "..." << std::endl; + vortex::RAM ram(RAM_PAGE_SIZE); + vortex::Processor processor; + processor.attach_ram(&ram); - vortex::RAM ram(RAM_PAGE_SIZE); - vortex::Processor processor; - processor.attach_ram(&ram); + for (auto program : programs) { + std::cout << "Running " << program << "..." << std::endl; std::string program_ext(fileExtension(program)); if (program_ext == "bin") { diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp index 7c20a442..284d599f 100644 --- a/sim/rtlsim/processor.cpp +++ b/sim/rtlsim/processor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,9 @@ #endif #endif -#define ENABLE_MEM_STALLS +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif #ifndef TRACE_START_TIME #define TRACE_START_TIME 0ull @@ -126,12 +129,7 @@ public: } ~Impl() { - for (auto& buf : print_bufs_) { - auto str = buf.second.str(); - if (!str.empty()) { - std::cout << "#" << buf.first << ": " << str << std::endl; - } - } + this->cout_flush(); #ifdef VCD_OUTPUT trace_->close(); @@ -147,10 +145,46 @@ public: } } + void cout_flush() { + for (auto& buf : print_bufs_) { + auto str = buf.second.str(); + if (!str.empty()) { + std::cout << "#" << buf.first << ": " << str << std::endl; + } + } + } + void attach_ram(RAM* ram) { ram_ = ram; } + int run() { + int exitcode = 0; + + #ifndef NDEBUG + std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; + #endif + + // reset device + this->reset(); + + // execute program + while (device_->busy) { + if (get_ebreak()) { + exitcode = get_last_wb_value(3); + break; + } + this->tick(); + } + + // wait 5 cycles to flush the pipeline + this->wait(5); + + return exitcode; + } + +private: + void reset() { print_bufs_.clear(); @@ -178,33 +212,11 @@ public: // Turn on assertion after reset Verilated::assertOn(true); + + this->cout_flush(); } - int run() { - int exitcode = 0; - - #ifndef NDEBUG - std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; - #endif - - // execute program - while (device_->busy) { - if (get_ebreak()) { - exitcode = get_last_wb_value(3); - break; - } - this->step(); - } - - // wait 5 cycles to flush the pipeline - this->wait(5); - - return exitcode; - } - -private: - - void step() { + void tick() { device_->clk = 0; this->eval(); @@ -224,7 +236,19 @@ private: this->eval_avs_bus(1); #endif - dram_->tick(); + if (MEM_CYCLE_RATIO > 0) { + auto cycle = timestamp / 2; + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } + + if (!dram_queue_.empty()) { + if (dram_->send(dram_queue_.front())) + dram_queue_.pop(); + } #ifndef NDEBUG fflush(stdout); @@ -372,7 +396,7 @@ private: ramulator::Request::Type::WRITE, 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } else { // process reads @@ -393,7 +417,7 @@ private: }, placeholders::_1, mem_req), 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } @@ -490,7 +514,7 @@ private: ramulator::Request::Type::WRITE, 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } else { // process reads @@ -511,7 +535,7 @@ private: }, placeholders::_1, mem_req), 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } @@ -522,7 +546,7 @@ private: void wait(uint32_t cycles) { for (int i = 0; i < cycles; ++i) { - this->step(); + this->tick(); } } @@ -574,6 +598,8 @@ private: RAM *ram_; ramulator::Gem5Wrapper* dram_; + + std::queue dram_queue_; }; /////////////////////////////////////////////////////////////////////////////// @@ -590,10 +616,6 @@ void Processor::attach_ram(RAM* mem) { impl_->attach_ram(mem); } -void Processor::reset() { - impl_->reset(); -} - int Processor::run() { return impl_->run(); } \ No newline at end of file diff --git a/sim/rtlsim/processor.h b/sim/rtlsim/processor.h index a877044f..5518990b 100644 --- a/sim/rtlsim/processor.h +++ b/sim/rtlsim/processor.h @@ -8,12 +8,10 @@ class Processor { public: Processor(); - virtual ~Processor(); + ~Processor(); void attach_ram(RAM* ram); - void reset(); - int run(); private: diff --git a/sim/simx/cache.cpp b/sim/simx/cache.cpp index 36c03eb9..34c8903c 100644 --- a/sim/simx/cache.cpp +++ b/sim/simx/cache.cpp @@ -102,6 +102,12 @@ struct block_t { struct set_t { std::vector blocks; set_t(uint32_t size) : blocks(size) {} + + void clear() { + for (auto& block : blocks) { + block.valid = false; + } + } }; struct bank_req_info_t { @@ -117,6 +123,7 @@ struct bank_req_t { uint64_t tag; uint32_t set_id; uint32_t core_id; + uint64_t uuid; std::vector infos; bank_req_t(uint32_t size) @@ -126,6 +133,7 @@ struct bank_req_t { , tag(0) , set_id(0) , core_id(0) + , uuid(0) , infos(size) {} }; @@ -142,20 +150,20 @@ struct mshr_entry_t : public bank_req_t { class MSHR { private: std::vector entries_; - uint32_t capacity_; + uint32_t size_; public: MSHR(uint32_t size) : entries_(size) - , capacity_(0) + , size_(0) {} bool empty() const { - return (0 == capacity_); + return (0 == size_); } bool full() const { - return (capacity_ == entries_.size()); + return (size_ == entries_.size()); } int lookup(const bank_req_t& bank_req) { @@ -178,7 +186,7 @@ public: entry.valid = true; entry.mshr_replay = false; entry.block_id = block_id; - ++capacity_; + ++size_; return i; } } @@ -204,12 +212,21 @@ public: if (entry.valid && entry.mshr_replay) { *out = entry; entry.valid = false; - --capacity_; + --size_; return true; } } return false; } + + void clear() { + for (auto& entry : entries_) { + if (entry.valid && entry.mshr_replay) { + entry.valid = false; + } + } + size_ = 0; + } }; struct bank_t { @@ -221,6 +238,13 @@ struct bank_t { : sets(params.sets_per_bank, params.blocks_per_set) , mshr(config.mshr_size) {} + + void clear() { + mshr.clear(); + for (auto& set : sets) { + set.clear(); + } + } }; /////////////////////////////////////////////////////////////////////////////// @@ -235,11 +259,11 @@ private: Switch::Ptr bypass_switch_; std::vector> mem_req_ports_; std::vector> mem_rsp_ports_; + uint32_t flush_cycles_; PerfStats perf_stats_; uint64_t pending_read_reqs_; uint64_t pending_write_reqs_; - uint64_t pending_fill_reqs_; - uint32_t flush_cycles_; + uint64_t pending_fill_reqs_; public: Impl(Cache* simobject, const Config& config) @@ -249,9 +273,6 @@ public: , banks_(config.num_banks, {config, params_}) , mem_req_ports_(config.num_banks, simobject) , mem_rsp_ports_(config.num_banks, simobject) - , pending_read_reqs_(0) - , pending_write_reqs_(0) - , pending_fill_reqs_(0) { bypass_switch_ = Switch::Create("bypass_arb", ArbiterType::Priority, 2); bypass_switch_->ReqOut.bind(&simobject->MemReqPort); @@ -272,19 +293,28 @@ public: // calculate tag flush cycles flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set; - } - - const PerfStats& perf_stats() const { - return perf_stats_; } - void step(uint64_t cycle) { + void reset() { + for (auto& bank : banks_) { + bank.clear(); + } + perf_stats_ = PerfStats(); + pending_read_reqs_ = 0; + pending_write_reqs_ = 0; + pending_fill_reqs_ = 0; + } + + void tick() { // wait on flush cycles if (flush_cycles_ != 0) { --flush_cycles_; return; } + // per-bank pipeline request + std::vector pipeline_reqs(config_.num_banks, config_.ports_per_bank); + // calculate memory latency perf_stats_.mem_latency += pending_fill_reqs_; @@ -294,12 +324,11 @@ public: auto& mem_rsp = bypass_port.front(); uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; - MemRsp core_rsp{tag, mem_rsp.core_id}; + MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid}; simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); bypass_port.pop(); - } - - std::vector pipeline_reqs(config_.num_banks, config_.ports_per_bank); + } // handle MSHR replay for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { @@ -351,6 +380,7 @@ public: bank_req.tag = tag; bank_req.set_id = set_id; bank_req.core_id = core_req.core_id; + bank_req.uuid = core_req.uuid; bank_req.infos.at(port_id) = {true, req_id, core_req.tag}; auto& bank = banks_.at(bank_id); @@ -400,22 +430,31 @@ public: // remove request auto time = core_req_port.pop(); - perf_stats_.pipeline_stalls += (cycle - time); + perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time); } // process active request this->processBankRequest(pipeline_reqs); + } + + const PerfStats& perf_stats() const { + return perf_stats_; } + +private: void processIORequest(const MemReq& core_req, uint32_t req_id) { { MemReq mem_req(core_req); mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id; bypass_switch_->ReqIn.at(1).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); } if (core_req.write && config_.write_reponse) { - simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1); + MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid}; + simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1); + DT(3, simobject_->name() << "-" << core_rsp); } } @@ -442,8 +481,9 @@ public: if (pipeline_req.mshr_replay) { // send core response for (auto& info : pipeline_req.infos) { - MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; - simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); } } else { bool hit = false; @@ -485,7 +525,9 @@ public: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag); mem_req.write = true; mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); } else { // mark block as dirty hit_block.dirty = true; @@ -494,8 +536,9 @@ public: // send core response if (!pipeline_req.write || config_.write_reponse) { for (auto& info : pipeline_req.infos) { - MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); } } } else { @@ -516,6 +559,7 @@ public: mem_req.write = true; mem_req.core_id = pipeline_req.core_id; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); ++perf_stats_.evictions; } } @@ -527,13 +571,16 @@ public: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = true; mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); } // send core response if (config_.write_reponse) { for (auto& info : pipeline_req.infos) { - MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); } } } else { @@ -550,7 +597,9 @@ public: mem_req.write = false; mem_req.tag = mshr_id; mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); ++pending_fill_reqs_; } } @@ -575,8 +624,12 @@ Cache::~Cache() { delete impl_; } -void Cache::step(uint64_t cycle) { - impl_->step(cycle); +void Cache::reset() { + impl_->reset(); +} + +void Cache::tick() { + impl_->tick(); } const Cache::PerfStats& Cache::perf_stats() const { diff --git a/sim/simx/cache.h b/sim/simx/cache.h index 8f4b3932..a335b483 100644 --- a/sim/simx/cache.h +++ b/sim/simx/cache.h @@ -22,6 +22,7 @@ public: uint16_t mshr_size; // MSHR buffer size uint8_t latency; // pipeline latency }; + struct PerfStats { uint64_t reads; uint64_t writes; @@ -54,7 +55,9 @@ public: Cache(const SimContext& ctx, const char* name, const Config& config); ~Cache(); - void step(uint64_t cycle); + void reset(); + + void tick(); const PerfStats& perf_stats() const; diff --git a/sim/simx/constants.h b/sim/simx/constants.h index a28bd806..109f29f4 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -1,10 +1,16 @@ #pragma once -#include "types.h" - +#ifndef RAM_PAGE_SIZE #define RAM_PAGE_SIZE 4096 +#endif -#define DRAM_CHANNELS 2 +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif + +#ifndef MEMORY_BANKS +#define MEMORY_BANKS 2 +#endif namespace vortex { diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 0540151c..fd11befd 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , ibuffers_(arch.num_warps(), IBUF_SIZE) , scoreboard_(arch_) , exe_units_((int)ExeType::MAX) - , icache_(Cache::Create("Icache", Cache::Config{ + , icache_(Cache::Create("icache", Cache::Config{ log2ceil(ICACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -45,7 +45,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) NUM_WARPS, // mshr 2, // pipeline latency })) - , dcache_(Cache::Create("Dcache", Cache::Config{ + , dcache_(Cache::Create("dcache", Cache::Config{ log2ceil(DCACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -72,15 +72,6 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , fetch_latch_("fetch") , decode_latch_("decode") , pending_icache_(arch_.num_warps()) - , active_warps_(1) - , stalled_warps_(0) - , last_schedule_wid_(0) - , issued_instrs_(0) - , committed_instrs_(0) - , csr_tex_unit_(0) - , ecall_(false) - , ebreak_(false) - , perf_mem_pending_reads_(0) { for (int i = 0; i < arch_.num_warps(); ++i) { warps_.at(i) = std::make_shared(this, i); @@ -112,10 +103,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) #endif sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i)); dcache_->CoreRspPorts.at(i).bind(&sw->RspIn); - } - - // activate warp0 - warps_.at(0)->setTmask(0, true); + } // memory perf callbacks MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ @@ -128,9 +116,62 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) __unused (cycle); --perf_mem_pending_reads_; }); + + this->reset(); } Core::~Core() { + this->cout_flush(); +} + +void Core::reset() { + for (auto& warp : warps_) { + warp->clear(); + } + warps_.at(0)->setTmask(0, true); + active_warps_ = 1; + + for (auto& tex_unit : tex_units_) { + tex_unit.clear(); + } + + for ( auto& barrier : barriers_) { + barrier.reset(); + } + + for (auto& csr : csrs_) { + csr = 0; + } + + for (auto& fcsr : fcsrs_) { + fcsr = 0; + } + + for (auto& ibuf : ibuffers_) { + ibuf.clear(); + } + + scoreboard_.clear(); + fetch_latch_.clear(); + decode_latch_.clear(); + pending_icache_.clear(); + stalled_warps_.reset(); + last_schedule_wid_ = 0; + issued_instrs_ = 0; + committed_instrs_ = 0; + csr_tex_unit_ = 0; + ecall_ = false; + ebreak_ = false; + perf_mem_pending_reads_ = 0; + perf_stats_ = PerfStats(); +} + +void Core::attach_ram(RAM* ram) { + // bind RAM to memory unit + mmu_.attach(*ram, 0, 0xFFFFFFFF); +} + +void Core::cout_flush() { for (auto& buf : print_bufs_) { auto str = buf.second.str(); if (!str.empty()) { @@ -139,17 +180,12 @@ Core::~Core() { } } -void Core::attach_ram(RAM* ram) { - // bind RAM to memory unit - mmu_.attach(*ram, 0, 0xFFFFFFFF); -} - -void Core::step(uint64_t cycle) { - this->commit(cycle); - this->execute(cycle); - this->decode(cycle); - this->fetch(cycle); - this->schedule(cycle); +void Core::tick() { + this->commit(); + this->execute(); + this->decode(); + this->fetch(); + this->schedule(); // update perf counter perf_stats_.mem_latency += perf_mem_pending_reads_; @@ -157,9 +193,7 @@ void Core::step(uint64_t cycle) { DPN(2, std::flush); } -void Core::schedule(uint64_t cycle) { - __unused (cycle); - +void Core::schedule() { bool foundSchedule = false; int scheduled_warp = last_schedule_wid_; @@ -181,30 +215,27 @@ void Core::schedule(uint64_t cycle) { // suspend warp until decode stalled_warps_.set(scheduled_warp); - auto& warp = warps_.at(scheduled_warp); - uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_; auto trace = new pipeline_trace_t(uuid, arch_); + auto& warp = warps_.at(scheduled_warp); warp->eval(trace); - DT(3, cycle, "pipeline-schedule: " << *trace); + DT(3, "pipeline-schedule: " << *trace); // advance to fetch stage fetch_latch_.push(trace); } -void Core::fetch(uint64_t cycle) { - __unused (cycle); - +void Core::fetch() { // handle icache reponse auto& icache_rsp_port = icache_->CoreRspPorts.at(0); if (!icache_rsp_port.empty()){ auto& mem_rsp = icache_rsp_port.front(); auto trace = pending_icache_.at(mem_rsp.tag); decode_latch_.push(trace); - DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); + DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); pending_icache_.release(mem_rsp.tag); icache_rsp_port.pop(); } @@ -216,16 +247,15 @@ void Core::fetch(uint64_t cycle) { mem_req.addr = trace->PC; mem_req.write = false; mem_req.tag = pending_icache_.allocate(trace); - mem_req.core_id = id_; - icache_->CoreReqPorts.at(0).send(mem_req, 1); - DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); + mem_req.core_id = trace->cid; + mem_req.uuid = trace->uuid; + icache_->CoreReqPorts.at(0).send(mem_req, 1); + DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); fetch_latch_.pop(); } } -void Core::decode(uint64_t cycle) { - __unused (cycle); - +void Core::decode() { if (decode_latch_.empty()) return; @@ -235,7 +265,7 @@ void Core::decode(uint64_t cycle) { auto& ibuffer = ibuffers_.at(trace->wid); if (ibuffer.full()) { if (!trace->suspend()) { - DT(3, cycle, "*** ibuffer-stall: " << *trace); + DT(3, "*** ibuffer-stall: " << *trace); } ++perf_stats_.ibuf_stalls; return; @@ -257,7 +287,7 @@ void Core::decode(uint64_t cycle) { if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH) perf_stats_.branches += active_threads; - DT(3, cycle, "pipeline-decode: " << *trace); + DT(3, "pipeline-decode: " << *trace); // insert to ibuffer ibuffer.push(trace); @@ -265,9 +295,7 @@ void Core::decode(uint64_t cycle) { decode_latch_.pop(); } -void Core::execute(uint64_t cycle) { - __unused (cycle); - +void Core::execute() { // issue ibuffer instructions for (auto& ibuffer : ibuffers_) { if (ibuffer.empty()) @@ -278,7 +306,7 @@ void Core::execute(uint64_t cycle) { // check scoreboard if (scoreboard_.in_use(trace)) { if (!trace->suspend()) { - DTH(3, cycle, "*** scoreboard-stall: dependents={"); + DTH(3, "*** scoreboard-stall: dependents={"); auto uses = scoreboard_.get_uses(trace); for (uint32_t i = 0, n = uses.size(); i < n; ++i) { auto& use = uses.at(i); @@ -297,7 +325,7 @@ void Core::execute(uint64_t cycle) { // update scoreboard scoreboard_.reserve(trace); - DT(3, cycle, "pipeline-issue: " << *trace); + DT(3, "pipeline-issue: " << *trace); // push to execute units auto& exe_unit = exe_units_.at((int)trace->exe_type); @@ -308,9 +336,7 @@ void Core::execute(uint64_t cycle) { } } -void Core::commit(uint64_t cycle) { - __unused (cycle); - +void Core::commit() { // commit completed instructions bool wb = false; for (auto& exe_unit : exe_units_) { @@ -323,7 +349,7 @@ void Core::commit(uint64_t cycle) { wb |= trace->wb; // advance to commit stage - DT(3, cycle, "pipeline-commit: " << *trace); + DT(3, "pipeline-commit: " << *trace); // update scoreboard scoreboard_.release(trace); diff --git a/sim/simx/core.h b/sim/simx/core.h index b9c01383..18c9beb3 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -75,16 +75,14 @@ public: bool running() const; - void step(uint64_t cycle); + void reset(); + + void tick(); Word id() const { return id_; } - Warp& warp(int i) { - return *warps_.at(i); - } - const Decoder& decoder() { return decoder_; } @@ -125,14 +123,16 @@ public: private: - void schedule(uint64_t cycle); - void fetch(uint64_t cycle); - void decode(uint64_t cycle); - void execute(uint64_t cycle); - void commit(uint64_t cycle); + void schedule(); + void fetch(); + void decode(); + void execute(); + void commit(); void writeToStdOut(Addr addr, Word data); + void cout_flush(); + Word id_; const ArchDef arch_; const Decoder decoder_; diff --git a/sim/simx/debug.h b/sim/simx/debug.h index 53d2d62a..688eded4 100644 --- a/sim/simx/debug.h +++ b/sim/simx/debug.h @@ -33,15 +33,15 @@ } \ } while(0) -#define DT(lvl, t, x) do { \ +#define DT(lvl, x) do { \ if ((lvl) <= DEBUG_LEVEL) { \ - std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x << std::endl; \ } \ } while(0) -#define DTH(lvl, t, x) do { \ +#define DTH(lvl, x) do { \ if ((lvl) <= DEBUG_LEVEL) { \ - std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x; \ } \ } while(0) @@ -58,8 +58,8 @@ #define DPH(lvl, x) do {} while(0) #define DPN(lvl, x) do {} while(0) -#define DT(lvl, t, x) do {} while(0) -#define DTH(lvl, t, x) do {} while(0) +#define DT(lvl, x) do {} while(0) +#define DTH(lvl, x) do {} while(0) #define DTN(lvl, x) do {} while(0) #endif \ No newline at end of file diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index d1df2637..efc199d2 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -87,7 +87,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - rsdata[t][i] = iRegFile_.at(t)[reg]; + rsdata[t][i] = ireg_file_.at(t)[reg]; DPN(2, std::hex << rsdata[t][i]); } DPN(2, "}" << std::endl); @@ -100,7 +100,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - rsdata[t][i] = fRegFile_.at(t)[reg]; + rsdata[t][i] = freg_file_.at(t)[reg]; DPN(2, std::hex << rsdata[t][i]); } DPN(2, "}" << std::endl); @@ -460,7 +460,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); DP(4, "dest: v" << rdest); DP(4, "width" << instr.getVlsWidth()); - auto &vd = vRegFile_.at(rdest); + auto &vd = vreg_file_.at(rdest); switch (instr.getVlsWidth()) { case 6: { // load word and unit strided (not checking for unit stride) @@ -517,7 +517,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { switch (instr.getVlsWidth()) { case 6: { // store word and unit strided (not checking for unit stride) - uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); + uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i); core_->dcache_write(memAddr, value, 4); DP(4, "store: " << memAddr << " value:" << value); } break; @@ -784,7 +784,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { // predicate mode ThreadMask pred; for (int i = 0; i < num_threads; ++i) { - pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0; + pred[i] = tmask_.test(i) ? (ireg_file_.at(i).at(rsrc0) != 0) : 0; } if (pred.any()) { tmask_ &= pred; @@ -819,15 +819,15 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->gpu.type = GpuType::SPLIT; trace->used_iregs.set(rsrc0); trace->fetch_stall = true; - if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { + if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) { ThreadMask tmask; for (int i = 0; i < num_threads; ++i) { - tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); + tmask[i] = tmask_.test(i) && !ireg_file_.at(i).at(rsrc0); } DomStackEntry e(tmask, nextPC); - domStack_.push(tmask_); - domStack_.push(e); + dom_stack_.push(tmask_); + dom_stack_.push(e); for (size_t i = 0; i < e.tmask.size(); ++i) { tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); } @@ -842,7 +842,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DP(3, "*** Unanimous pred"); DomStackEntry e(tmask_); e.unanimous = true; - domStack_.push(e); + dom_stack_.push(e); } } break; case 3: { @@ -850,25 +850,25 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->exe_type = ExeType::GPU; trace->gpu.type = GpuType::JOIN; trace->fetch_stall = true; - if (!domStack_.empty() && domStack_.top().unanimous) { + if (!dom_stack_.empty() && dom_stack_.top().unanimous) { DP(3, "*** Uninimous branch at join"); - tmask_ = domStack_.top().tmask; + tmask_ = dom_stack_.top().tmask; active_ = tmask_.any(); - domStack_.pop(); + dom_stack_.pop(); } else { - if (!domStack_.top().fallThrough) { - nextPC = domStack_.top().PC; + if (!dom_stack_.top().fallThrough) { + nextPC = dom_stack_.top().PC; DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); } - tmask_ = domStack_.top().tmask; + tmask_ = dom_stack_.top().tmask; active_ = tmask_.any(); DPH(3, "*** Join: New TM="); for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); DPN(3, "\n"); - domStack_.pop(); + dom_stack_.pop(); } } break; case 4: { @@ -946,10 +946,10 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { case 0: // vector-vector switch (func6) { case 0: { - auto& vr1 = vRegFile_.at(rsrc0); - auto& vr2 = vRegFile_.at(rsrc1); - auto& vd = vRegFile_.at(rdest); - auto& mask = vRegFile_.at(0); + auto& vr1 = vreg_file_.at(rsrc0); + auto& vr2 = vreg_file_.at(rsrc1); + auto& vd = vreg_file_.at(rdest); + auto& mask = vreg_file_.at(0); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t emask = *(uint8_t *)(mask.data() + i); @@ -990,9 +990,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 24: { // vmseq - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1021,9 +1021,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 25: { // vmsne - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1052,9 +1052,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 26: { // vmsltu - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1083,9 +1083,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 27: { // vmslt - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { int8_t first = *(int8_t *)(vr1.data() + i); @@ -1114,9 +1114,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 28: { // vmsleu - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1145,9 +1145,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 29: { // vmsle - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { int8_t first = *(int8_t *)(vr1.data() + i); @@ -1176,9 +1176,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 30: { // vmsgtu - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1207,9 +1207,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 31: { // vmsgt - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { int8_t first = *(int8_t *)(vr1.data() + i); @@ -1242,9 +1242,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { switch (func6) { case 24: { // vmandnot - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1288,9 +1288,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 25: { // vmand - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1334,9 +1334,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 26: { // vmor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1380,9 +1380,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 27: { // vmxor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1426,9 +1426,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 28: { // vmornot - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1472,9 +1472,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 29: { // vmnand - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1518,9 +1518,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 30: { // vmnor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1564,9 +1564,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 31: { // vmxnor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1610,9 +1610,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 37: { // vmul - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1650,9 +1650,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 45: { // vmacc - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1693,8 +1693,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { case 6: { switch (func6) { case 0: { - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); @@ -1729,8 +1729,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 37: { // vmul.vx - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); @@ -1805,7 +1805,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - iRegFile_.at(t)[rdest] = rddata[t]; + ireg_file_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); @@ -1820,7 +1820,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - fRegFile_.at(t)[rdest] = rddata[t]; + freg_file_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); diff --git a/sim/simx/exeunit.cpp b/sim/simx/exeunit.cpp index 3b84ee8a..5a47dc06 100644 --- a/sim/simx/exeunit.cpp +++ b/sim/simx/exeunit.cpp @@ -12,7 +12,7 @@ using namespace vortex; NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {} -void NopUnit::step(uint64_t /*cycle*/) { +void NopUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); @@ -25,26 +25,31 @@ void NopUnit::step(uint64_t /*cycle*/) { LsuUnit::LsuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "LSU") , num_threads_(core->arch().num_threads()) - , pending_dcache_(LSUQ_SIZE) + , pending_rd_reqs_(LSUQ_SIZE) , fence_lock_(false) {} -void LsuUnit::step(uint64_t cycle) { +void LsuUnit::reset() { + pending_rd_reqs_.clear(); + fence_lock_ = false; +} + +void LsuUnit::tick() { // handle dcache response for (uint32_t t = 0; t < num_threads_; ++t) { auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0); if (dcache_rsp_port.empty()) continue; auto& mem_rsp = dcache_rsp_port.front(); - auto& entry = pending_dcache_.at(mem_rsp.tag); + auto& entry = pending_rd_reqs_.at(mem_rsp.tag); auto trace = entry.first; - DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks if (0 == entry.second) { Output.send(trace, 1); - pending_dcache_.release(mem_rsp.tag); + pending_rd_reqs_.release(mem_rsp.tag); } dcache_rsp_port.pop(); } @@ -55,26 +60,26 @@ void LsuUnit::step(uint64_t cycle) { if (smem_rsp_port.empty()) continue; auto& mem_rsp = smem_rsp_port.front(); - auto& entry = pending_dcache_.at(mem_rsp.tag); + auto& entry = pending_rd_reqs_.at(mem_rsp.tag); auto trace = entry.first; - DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks if (0 == entry.second) { Output.send(trace, 1); - pending_dcache_.release(mem_rsp.tag); + pending_rd_reqs_.release(mem_rsp.tag); } smem_rsp_port.pop(); } if (fence_lock_) { // wait for all pending memory operations to complete - if (!pending_dcache_.empty()) + if (!pending_rd_reqs_.empty()) return; Output.send(fence_state_, 1); fence_lock_ = false; - DT(3, cycle, "fence-unlock: " << fence_state_); + DT(3, "fence-unlock: " << fence_state_); } // check input queue @@ -87,17 +92,17 @@ void LsuUnit::step(uint64_t cycle) { // schedule fence lock fence_state_ = trace; fence_lock_ = true; - DT(3, cycle, "fence-lock: " << *trace); + DT(3, "fence-lock: " << *trace); // remove input auto time = Input.pop(); - core_->perf_stats_.lsu_stalls += (cycle - time); + core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); return; } // check pending queue capacity - if (pending_dcache_.full()) { + if (pending_rd_reqs_.full()) { if (!trace->suspend()) { - DT(3, cycle, "*** lsu-queue-stall: " << *trace); + DT(3, "*** lsu-queue-stall: " << *trace); } return; } else { @@ -130,7 +135,7 @@ void LsuUnit::step(uint64_t cycle) { } } - auto tag = pending_dcache_.allocate({trace, valid_addrs}); + auto tag = pending_rd_reqs_.allocate({trace, valid_addrs}); for (uint32_t t = 0; t < num_threads_; ++t) { if (!trace->tmask.test(t)) @@ -145,15 +150,16 @@ void LsuUnit::step(uint64_t cycle) { mem_req.write = is_write; mem_req.non_cacheable = (type == AddrType::IO); mem_req.tag = tag; - mem_req.core_id = core_->id(); + mem_req.core_id = trace->cid; + mem_req.uuid = trace->uuid; if (type == AddrType::Shared) { core_->shared_mem_->Inputs.at(t).send(mem_req, 2); - DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); } else { dcache_req_port.send(mem_req, 2); - DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace); } @@ -163,20 +169,20 @@ void LsuUnit::step(uint64_t cycle) { // do not wait on writes if (is_write) { - pending_dcache_.release(tag); + pending_rd_reqs_.release(tag); Output.send(trace, 1); } // remove input auto time = Input.pop(); - core_->perf_stats_.lsu_stalls += (cycle - time); + core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); } /////////////////////////////////////////////////////////////////////////////// AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {} -void AluUnit::step(uint64_t cycle) { +void AluUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); @@ -196,33 +202,33 @@ void AluUnit::step(uint64_t cycle) { default: std::abort(); } - DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace); + DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace); if (trace->fetch_stall) { core_->stalled_warps_.reset(trace->wid); } auto time = Input.pop(); - core_->perf_stats_.alu_stalls += (cycle - time); + core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time); } /////////////////////////////////////////////////////////////////////////////// CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {} -void CsrUnit::step(uint64_t cycle) { +void CsrUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); Output.send(trace, 1); auto time = Input.pop(); - core_->perf_stats_.csr_stalls += (cycle - time); - DT(3, cycle, "pipeline-execute: op=CSR, " << *trace); + core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time); + DT(3, "pipeline-execute: op=CSR, " << *trace); } /////////////////////////////////////////////////////////////////////////////// FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {} -void FpuUnit::step(uint64_t cycle) { +void FpuUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); @@ -245,9 +251,9 @@ void FpuUnit::step(uint64_t cycle) { default: std::abort(); } - DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace); + DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace); auto time = Input.pop(); - core_->perf_stats_.fpu_stalls += (cycle - time); + core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); } /////////////////////////////////////////////////////////////////////////////// @@ -257,8 +263,12 @@ GpuUnit::GpuUnit(const SimContext& ctx, Core* core) , num_threads_(core->arch().num_threads()) , pending_tex_reqs_(TEXQ_SIZE) {} + +void GpuUnit::reset() { + pending_tex_reqs_.clear(); +} -void GpuUnit::step(uint64_t cycle) { +void GpuUnit::tick() { #ifdef EXT_TEX_ENABLE // handle memory response for (uint32_t t = 0; t < num_threads_; ++t) { @@ -268,7 +278,7 @@ void GpuUnit::step(uint64_t cycle) { auto& mem_rsp = dcache_rsp_port.front(); auto& entry = pending_tex_reqs_.at(mem_rsp.tag); auto trace = entry.first; - DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); + DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks if (0 == entry.second) { @@ -312,7 +322,7 @@ void GpuUnit::step(uint64_t cycle) { issued = true; break; case GpuType::TEX: - if (this->processTexRequest(cycle, trace)) + if (this->processTexRequest(trace)) issued = true; break; default: @@ -320,22 +330,20 @@ void GpuUnit::step(uint64_t cycle) { } if (issued) { - DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace); + DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace); if (trace->fetch_stall) { core_->stalled_warps_.reset(trace->wid); } auto time = Input.pop(); - core_->perf_stats_.fpu_stalls += (cycle - time); + core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); } } -bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { - __unused (cycle); - +bool GpuUnit::processTexRequest(pipeline_trace_t* trace) { // check pending queue capacity if (pending_tex_reqs_.full()) { if (!trace->suspend()) { - DT(3, cycle, "*** tex-queue-stall: " << *trace); + DT(3, "*** tex-queue-stall: " << *trace); } return false; } else { @@ -356,14 +364,15 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { continue; auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1); - for (auto mem_addr : trace->mem_addrs.at(t)) { + for (auto& mem_addr : trace->mem_addrs.at(t)) { MemReq mem_req; mem_req.addr = mem_addr.addr; mem_req.write = (trace->lsu.type == LsuType::STORE); mem_req.tag = tag; mem_req.core_id = core_->id(); + mem_req.uuid = trace->uuid; dcache_req_port.send(mem_req, 3); - DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", tid=" << t << ", "<< trace); ++ core_->perf_stats_.tex_reads; ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size(); diff --git a/sim/simx/exeunit.h b/sim/simx/exeunit.h index bea714ea..78990369 100644 --- a/sim/simx/exeunit.h +++ b/sim/simx/exeunit.h @@ -18,10 +18,14 @@ public: , Input(this) , Output(this) , core_(core) - {} + {} virtual ~ExeUnit() {} + virtual void reset() {} + + virtual void tick() = 0; + protected: Core* core_; }; @@ -32,7 +36,7 @@ class NopUnit : public ExeUnit { public: NopUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -40,14 +44,16 @@ public: class LsuUnit : public ExeUnit { private: uint32_t num_threads_; - HashTable> pending_dcache_; + HashTable> pending_rd_reqs_; pipeline_trace_t* fence_state_; bool fence_lock_; public: LsuUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void reset(); + + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -56,7 +62,7 @@ class AluUnit : public ExeUnit { public: AluUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -65,7 +71,7 @@ class CsrUnit : public ExeUnit { public: CsrUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -74,7 +80,7 @@ class FpuUnit : public ExeUnit { public: FpuUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -84,12 +90,14 @@ private: uint32_t num_threads_; HashTable> pending_tex_reqs_; - bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace); + bool processTexRequest(pipeline_trace_t* trace); public: GpuUnit(const SimContext& ctx, Core*); + + void reset(); - void step(uint64_t cycle); + void tick(); }; } \ No newline at end of file diff --git a/sim/simx/ibuffer.h b/sim/simx/ibuffer.h index b4c6f51e..7362195f 100644 --- a/sim/simx/ibuffer.h +++ b/sim/simx/ibuffer.h @@ -34,6 +34,11 @@ public: void pop() { return entries_.pop(); } + + void clear() { + std::queue empty; + std::swap(entries_, empty ); + } }; } \ No newline at end of file diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 159fdab6..89999c8f 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -6,6 +6,8 @@ #include #include #include "processor.h" +#include "archdef.h" +#include "mem.h" #include "constants.h" #include #include "args.h" @@ -50,11 +52,14 @@ int main(int argc, char **argv) { std::cout << "Running " << imgFileName << "..." << std::endl; - if (!SimPlatform::instance().initialize()) - return -1; - { + // create processor configuation + ArchDef arch(archStr, num_cores, num_warps, num_threads); + + // create memory module RAM ram(RAM_PAGE_SIZE); + + // load program { std::string program_ext(fileExtension(imgFileName.c_str())); if (program_ext == "bin") { @@ -67,27 +72,15 @@ int main(int argc, char **argv) { } } - ArchDef arch(archStr, num_cores, num_warps, num_threads); - auto processor = Processor::Create(arch); - processor->attach_ram(&ram); - - // setup memory simulator - auto memsim = MemSim::Create(MemSim::Config{ - DRAM_CHANNELS, - arch.num_cores() - }); - processor->MemReqPort.bind(&memsim->MemReqPort); - memsim->MemRspPort.bind(&processor->MemRspPort); + // create processor + Processor processor(arch); + + // attach memory module + processor.attach_ram(&ram); // run simulation - for (;;) { - SimPlatform::instance().step(); - if (processor->check_exit(&exitcode)) - break; - }; - } - - SimPlatform::instance().finalize(); + processor.run(); + } if (riscv_test) { if (1 == exitcode) { diff --git a/sim/simx/memsim.cpp b/sim/simx/memsim.cpp index 74979bc8..a69df4b9 100644 --- a/sim/simx/memsim.cpp +++ b/sim/simx/memsim.cpp @@ -13,6 +13,7 @@ DISABLE_WARNING_POP #include "constants.h" #include "types.h" +#include "debug.h" using namespace vortex; @@ -51,37 +52,50 @@ public: return perf_stats_; } - void dram_callback(ramulator::Request& req, uint32_t tag) { - MemRsp mem_rsp{tag, (uint32_t)req.coreid}; + void dram_callback(ramulator::Request& req, uint32_t tag, uint64_t uuid) { + if (req.type == ramulator::Request::Type::WRITE) + return; + MemRsp mem_rsp{tag, (uint32_t)req.coreid, uuid}; simobject_->MemRspPort.send(mem_rsp, 1); + DT(3, simobject_->name() << "-" << mem_rsp); } - void step(uint64_t /*cycle*/) { - dram_->tick(); + void reset() { + perf_stats_ = PerfStats(); + } + + void tick() { + if (MEM_CYCLE_RATIO > 0) { + auto cycle = SimPlatform::instance().cycles(); + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } if (simobject_->MemReqPort.empty()) return; auto& mem_req = simobject_->MemReqPort.front(); - if (mem_req.write) { - ramulator::Request dram_req( - mem_req.addr, - ramulator::Request::Type::WRITE, - mem_req.core_id - ); - dram_->send(dram_req); + ramulator::Request dram_req( + mem_req.addr, + mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ, + std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid), + mem_req.core_id + ); + + if (!dram_->send(dram_req)) + return; + + if (mem_req.write) { ++perf_stats_.writes; } else { - ramulator::Request dram_req( - mem_req.addr, - ramulator::Request::Type::READ, - std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag), - mem_req.core_id - ); - dram_->send(dram_req); ++perf_stats_.reads; } + + DT(3, simobject_->name() << "-" << mem_req); simobject_->MemReqPort.pop(); } @@ -89,8 +103,8 @@ public: /////////////////////////////////////////////////////////////////////////////// -MemSim::MemSim(const SimContext& ctx, const Config& config) - : SimObject(ctx, "MemSim") +MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) + : SimObject(ctx, name) , MemReqPort(this) , MemRspPort(this) , impl_(new Impl(this, config)) @@ -100,6 +114,10 @@ MemSim::~MemSim() { delete impl_; } -void MemSim::step(uint64_t cycle) { - impl_->step(cycle); +void MemSim::reset() { + impl_->reset(); +} + +void MemSim::tick() { + impl_->tick(); } \ No newline at end of file diff --git a/sim/simx/memsim.h b/sim/simx/memsim.h index 24918a2e..26e21a34 100644 --- a/sim/simx/memsim.h +++ b/sim/simx/memsim.h @@ -26,10 +26,12 @@ public: SimPort MemReqPort; SimPort MemRspPort; - MemSim(const SimContext& ctx, const Config& config); + MemSim(const SimContext& ctx, const char* name, const Config& config); ~MemSim(); - void step(uint64_t cycle); + void reset(); + + void tick(); const PerfStats& perf_stats() const; diff --git a/sim/simx/pipeline.h b/sim/simx/pipeline.h index 9ac09352..18d54e21 100644 --- a/sim/simx/pipeline.h +++ b/sim/simx/pipeline.h @@ -98,14 +98,40 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) return os; } -class PipelineLatch : public Queue { +class PipelineLatch { protected: const char* name_; + std::queue queue_; public: PipelineLatch(const char* name = nullptr) : name_(name) {} + + bool empty() const { + return queue_.empty(); + } + + pipeline_trace_t* front() { + return queue_.front(); + } + + pipeline_trace_t* back() { + return queue_.back(); + } + + void push(pipeline_trace_t* value) { + queue_.push(value); + } + + void pop() { + queue_.pop(); + } + + void clear() { + std::queue empty; + std::swap(queue_, empty ); + } }; } \ No newline at end of file diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index bfda986e..a7314687 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -1,11 +1,11 @@ #include "processor.h" +#include "core.h" #include "constants.h" using namespace vortex; class Processor::Impl { private: - Processor* simobject_; std::vector cores_; std::vector l2caches_; std::vector::Ptr> l2_mem_switches_; @@ -13,12 +13,13 @@ private: Switch::Ptr l3_mem_switch_; public: - Impl(Processor* simobject, const ArchDef& arch) - : simobject_(simobject) - , cores_(arch.num_cores()) + Impl(const ArchDef& arch) + : cores_(arch.num_cores()) , l2caches_(NUM_CLUSTERS) , l2_mem_switches_(NUM_CLUSTERS) { + SimPlatform::instance().initialize(); + uint32_t num_cores = arch.num_cores(); uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; @@ -26,12 +27,15 @@ public: for (uint32_t i = 0; i < num_cores; ++i) { cores_.at(i) = Core::Create(arch, i); } - - std::vector*> mem_req_ports(1); - std::vector*> mem_rsp_ports(1); - mem_req_ports.at(0) = &simobject_->MemReqPort; - mem_rsp_ports.at(0) = &simobject_->MemRspPort; + // setup memory simulator + auto memsim = MemSim::Create("dram", MemSim::Config{ + MEMORY_BANKS, + arch.num_cores() + }); + + std::vector*> mem_req_ports(1, &memsim->MemReqPort); + std::vector*> mem_rsp_ports(1, &memsim->MemRspPort); if (L3_ENABLE) { l3cache_ = Cache::Create("l3cache", Cache::Config{ @@ -39,7 +43,7 @@ public: log2ceil(MEM_BLOCK_SIZE), // B 2, // W 0, // A - 32, // address bits + 32, // address bits L3_NUM_BANKS, // number of banks L3_NUM_PORTS, // number of ports NUM_CLUSTERS, // request size @@ -122,10 +126,8 @@ public: } } - ~Impl() {} - - void step(uint64_t cycle) { - __unused (cycle); + ~Impl() { + SimPlatform::instance().finalize(); } void attach_ram(RAM* ram) { @@ -134,28 +136,33 @@ public: } } - bool check_exit(int* exitcode) { - bool running = false; - for (auto& core : cores_) { - if (core->running()) { - running = true; + int run() { + SimPlatform::instance().reset(); + bool running; + int exitcode = 0; + do { + SimPlatform::instance().tick(); + running = false; + for (auto& core : cores_) { + if (core->running()) { + running = true; + } + if (core->check_exit()) { + exitcode = core->getIRegValue(3); + running = false; + break; + } } - if (core->check_exit()) { - *exitcode = core->getIRegValue(3); - return true; - } - } - return !running; + } while (running); + + return exitcode; } }; /////////////////////////////////////////////////////////////////////////////// -Processor::Processor(const SimContext& ctx, const ArchDef& arch) - : SimObject(ctx, "Vortex") - , MemReqPort(this) - , MemRspPort(this) - , impl_(new Impl(this, arch)) +Processor::Processor(const ArchDef& arch) + : impl_(new Impl(arch)) {} Processor::~Processor() { @@ -166,10 +173,6 @@ void Processor::attach_ram(RAM* mem) { impl_->attach_ram(mem); } -bool Processor::check_exit(int* exitcode) { - return impl_->check_exit(exitcode); -} - -void Processor::step(uint64_t cycle) { - impl_->step(cycle); +int Processor::run() { + return impl_->run(); } \ No newline at end of file diff --git a/sim/simx/processor.h b/sim/simx/processor.h index cfcde4da..46bcd735 100644 --- a/sim/simx/processor.h +++ b/sim/simx/processor.h @@ -1,22 +1,18 @@ #pragma once -#include "core.h" - namespace vortex { -class Processor : public SimObject { +class ArchDef; +class RAM; + +class Processor { public: - SimPort MemReqPort; - SimPort MemRspPort; - - Processor(const SimContext& ctx, const ArchDef& arch); + Processor(const ArchDef& arch); ~Processor(); void attach_ram(RAM* mem); - bool check_exit(int* exitcode); - - void step(uint64_t cycle); + int run(); private: class Impl; diff --git a/sim/simx/scoreboard.h b/sim/simx/scoreboard.h index b36d60b3..c468860d 100644 --- a/sim/simx/scoreboard.h +++ b/sim/simx/scoreboard.h @@ -24,11 +24,16 @@ public: , in_use_fregs_(arch.num_warps()) , in_use_vregs_(arch.num_warps()) { - for (int w = 0; w < arch.num_warps(); ++w) { - in_use_iregs_.at(w).reset(); - in_use_fregs_.at(w).reset(); - in_use_vregs_.at(w).reset(); + this->clear(); + } + + void clear() { + for (int i = 0, n = in_use_iregs_.size(); i < n; ++i) { + in_use_iregs_.at(i).reset(); + in_use_fregs_.at(i).reset(); + in_use_vregs_.at(i).reset(); } + owners_.clear(); } bool in_use(pipeline_trace_t* state) const { diff --git a/sim/simx/sharedmem.h b/sim/simx/sharedmem.h index 6106ad25..c76a29d3 100644 --- a/sim/simx/sharedmem.h +++ b/sim/simx/sharedmem.h @@ -45,7 +45,11 @@ public: virtual ~SharedMem() {} - void step(uint64_t /*cycle*/) { + void reset() { + perf_stats_ = PerfStats(); + } + + void tick() { std::vector in_used_banks(config_.num_banks); for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) { auto& core_req_port = this->Inputs.at(req_id); diff --git a/sim/simx/tex_unit.cpp b/sim/simx/tex_unit.cpp index 8dedef38..763f37a6 100644 --- a/sim/simx/tex_unit.cpp +++ b/sim/simx/tex_unit.cpp @@ -16,6 +16,12 @@ TexUnit::TexUnit(Core* core) : core_(core) {} TexUnit::~TexUnit() {} +void TexUnit::clear() { + for (auto& state : states_) { + state = 0; + } +} + uint32_t TexUnit::get_state(uint32_t state) { return states_.at(state); } diff --git a/sim/simx/tex_unit.h b/sim/simx/tex_unit.h index b41cd8c7..5bca8098 100644 --- a/sim/simx/tex_unit.h +++ b/sim/simx/tex_unit.h @@ -11,6 +11,8 @@ public: TexUnit(Core* core); ~TexUnit(); + void clear(); + uint32_t get_state(uint32_t state); void set_state(uint32_t state, uint32_t value); diff --git a/sim/simx/types.h b/sim/simx/types.h index 67a14b5d..9177dba4 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -213,67 +213,48 @@ struct MemReq { bool non_cacheable; uint32_t tag; uint32_t core_id; + uint64_t uuid; MemReq(uint64_t _addr = 0, bool _write = false, bool _non_cacheable = false, uint64_t _tag = 0, - uint32_t _core_id = 0 + uint32_t _core_id = 0, + uint64_t _uuid = 0 ) : addr(_addr) , write(_write) , non_cacheable(_non_cacheable) , tag(_tag) , core_id(_core_id) + , uuid(_uuid) {} }; +inline std::ostream &operator<<(std::ostream &os, const MemReq& req) { + os << "mem-" << (req.write ? "wr" : "rd") << ": "; + os << "addr=" << req.addr << ", tag=" << req.tag << ", core_id=" << req.core_id; + os << " (#" << std::dec << req.uuid << ")"; + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + struct MemRsp { uint64_t tag; uint32_t core_id; - MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0) + uint64_t uuid; + MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0) : tag (_tag) , core_id(_core_id) + , uuid(_uuid) {} }; -/////////////////////////////////////////////////////////////////////////////// - -template -class Queue { -protected: - std::queue queue_; - -public: - Queue() {} - - bool empty() const { - return queue_.empty(); - } - - const T& front() const { - return queue_.front(); - } - - T& front() { - return queue_.front(); - } - - const T& back() const { - return queue_.back(); - } - - T& back() { - return queue_.back(); - } - - void push(const T& value) { - queue_.push(value); - } - - void pop() { - queue_.pop(); - } -}; +inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) { + os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id; + os << " (#" << std::dec << rsp.uuid << ")"; + return os; +} /////////////////////////////////////////////////////////////////////////////// @@ -337,6 +318,14 @@ public: entry.first = false; --size_; } + + void clear() { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + entry.first = false; + } + size_ = 0; + } }; /////////////////////////////////////////////////////////////////////////////// @@ -376,7 +365,11 @@ public: } } - void step(uint64_t /*cycle*/) { + void reset() { + cursor_ = 0; + } + + void tick() { if (ReqIn.size() == 1) return; diff --git a/sim/simx/warp.cpp b/sim/simx/warp.cpp index df0c0e75..b05b1246 100644 --- a/sim/simx/warp.cpp +++ b/sim/simx/warp.cpp @@ -13,12 +13,28 @@ using namespace vortex; Warp::Warp(Core *core, Word id) : id_(id) , core_(core) - , active_(false) - , PC_(STARTUP_ADDR) - , tmask_(0) { - iRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); - fRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); - vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); + , ireg_file_(core->arch().num_threads(), std::vector(core->arch().num_regs())) + , freg_file_(core->arch().num_threads(), std::vector(core->arch().num_regs())) + , vreg_file_(core->arch().num_threads(), std::vector(core->arch().vsize())) +{ + this->clear(); +} + +void Warp::clear() { + active_ = false; + PC_ = STARTUP_ADDR; + tmask_.reset(); + for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) { + for (auto& reg : ireg_file_.at(i)) { + reg = 0; + } + for (auto& reg : freg_file_.at(i)) { + reg = 0; + } + for (auto& reg : vreg_file_.at(i)) { + reg = 0; + } + } } void Warp::eval(pipeline_trace_t *trace) { @@ -55,7 +71,7 @@ void Warp::eval(pipeline_trace_t *trace) { for (int i = 0; i < core_->arch().num_regs(); ++i) { DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); for (int j = 0; j < core_->arch().num_threads(); ++j) { - DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' '); + DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' '); } DPN(4, std::endl); } diff --git a/sim/simx/warp.h b/sim/simx/warp.h index c5a54205..9e9970f3 100644 --- a/sim/simx/warp.h +++ b/sim/simx/warp.h @@ -41,6 +41,8 @@ struct vtype { class Warp { public: Warp(Core *core, Word id); + + void clear(); bool active() const { return active_; @@ -84,7 +86,7 @@ public: } Word getIRegValue(int reg) const { - return iRegFile_.at(0).at(reg); + return ireg_file_.at(0).at(reg); } void eval(pipeline_trace_t *); @@ -100,10 +102,10 @@ private: Word PC_; ThreadMask tmask_; - std::vector> iRegFile_; - std::vector> fRegFile_; - std::vector> vRegFile_; - std::stack domStack_; + std::vector> ireg_file_; + std::vector> freg_file_; + std::vector> vreg_file_; + std::stack dom_stack_; struct vtype vtype_; int vl_; diff --git a/sim/vlsim/opae_sim.cpp b/sim/vlsim/opae_sim.cpp index d165dba6..ff632bf4 100644 --- a/sim/vlsim/opae_sim.cpp +++ b/sim/vlsim/opae_sim.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #ifndef MEMORY_BANKS @@ -33,8 +34,12 @@ #endif #endif +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif + #undef MEM_BLOCK_SIZE -#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) +#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) #define CACHE_BLOCK_SIZE 64 @@ -43,8 +48,6 @@ #define CCI_RQ_SIZE 16 #define CCI_WQ_SIZE 16 -#define ENABLE_MEM_STALLS - #ifndef TRACE_START_TIME #define TRACE_START_TIME 0ull #endif @@ -144,7 +147,7 @@ public: future_ = std::async(std::launch::async, [&]{ while (!stop_) { std::lock_guard guard(mutex_); - this->step(); + this->tick(); } }); } @@ -206,7 +209,7 @@ public: device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; - this->step(); + this->tick(); device_->vcp2af_sRxPort_c0_mmioRdValid = 0; assert(device_->af2cp_sTxPort_c2_mmioRdValid); *value = device_->af2cp_sTxPort_c2_data; @@ -220,7 +223,7 @@ public: device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8); - this->step(); + this->tick(); device_->vcp2af_sRxPort_c0_mmioWrValid = 0; } @@ -257,17 +260,29 @@ private: Verilated::assertOn(true); } - void step() { + void tick() { this->sRxPort_bus(); this->sTxPort_bus(); this->avs_bus(); + + if (!dram_queue_.empty()) { + if (dram_->send(dram_queue_.front())) + dram_queue_.pop(); + } device_->clk = 0; this->eval(); device_->clk = 1; this->eval(); - dram_->tick(); + if (MEM_CYCLE_RATIO > 0) { + auto cycle = timestamp / 2; + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } #ifndef NDEBUG fflush(stdout); @@ -403,7 +418,7 @@ private: ramulator::Request::Type::WRITE, 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } if (device_->avs_read[b]) { @@ -431,7 +446,7 @@ private: }, placeholders::_1, mem_req), 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } device_->avs_waitrequest[b] = false; @@ -480,6 +495,8 @@ private: ramulator::Gem5Wrapper* dram_; + std::queue dram_queue_; + Vvortex_afu_shim *device_; #ifdef VCD_OUTPUT VerilatedVcdC *trace_;