diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 68bccc87..487d385c 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -11,6 +11,128 @@ namespace vortex { class SimObjectBase; +/////////////////////////////////////////////////////////////////////////////// + +class SimPortBase { +public: + virtual ~SimPortBase() {} + + SimObjectBase* module() const { + return module_; + } + + SimPortBase* peer() const { + return peer_; + } + + bool connected() const { + return (peer_ != nullptr); + } + +protected: + SimPortBase(SimObjectBase* module) + : module_(module) + , peer_(nullptr) + {} + + void connect(SimPortBase* peer) { + assert(peer_ == nullptr); + peer_ = peer; + } + + void disconnect() { + assert(peer_ == nullptr); + peer_ = nullptr; + } + + SimPortBase& operator=(const SimPortBase&) = delete; + + SimObjectBase* module_; + SimPortBase* peer_; + + template friend class SlavePort; + template friend class MasterPort; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimPort : public SimPortBase { +public: + void send(const Pkt& pkt, uint64_t delay) const; + + bool read(Pkt* out) { + if (!valid_) + return false; + *out = data_; + valid_ = false; + return true; + } + +protected: + SimPort(SimObjectBase* module) + : SimPortBase(module) + , valid_(false) + {} + + void write(const Pkt& data) { + assert(!valid_); + data_ = data; + valid_ = true; + } + + SimPort& operator=(const SimPort&) = delete; + + Pkt data_; + bool valid_; + + template friend class SimPortEvent; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SlavePort : public SimPort { +public: + SlavePort(SimObjectBase* module) : SimPort(module) {} + + void bind(SlavePort* peer) { + this->connect(peer); + } + + void unbind() { + this->disconnect(); + } + +protected: + SlavePort& operator=(const SlavePort&) = delete; +}; + 
+/////////////////////////////////////////////////////////////////////////////// + +template +class MasterPort : public SimPort { +public: + MasterPort(SimObjectBase* module) : SimPort(module) {} + + void bind(SlavePort* peer) { + this->connect(peer); + } + + void bind(MasterPort* peer) { + this->connect(peer); + } + + void unbind() { + this->disconnect(); + } + +protected: + MasterPort& operator=(const MasterPort&) = delete; +}; + +/////////////////////////////////////////////////////////////////////////////// + class SimEventBase { public: typedef std::shared_ptr Ptr; @@ -32,16 +154,16 @@ protected: /////////////////////////////////////////////////////////////////////////////// template -class SimSimpleEvent : public SimEventBase { +class SimCallEvent : public SimEventBase { public: typedef std::function Func; template static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) { - return std::make_shared(func, pkt, delay); + return std::make_shared(func, pkt, delay); } - SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay) + SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay) : SimEventBase(delay) , func_(func) , pkt_(pkt) @@ -61,167 +183,23 @@ protected: template class SimPortEvent : public SimEventBase { public: - typedef std::function Func; - - template - static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) { - return std::make_shared(func, pkt, port_id, delay); + static Ptr Create(const SimPort* port, const Pkt& pkt, uint64_t delay) { + return std::make_shared(port, pkt, delay); } - SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) + SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t delay) : SimEventBase(delay) - , func_(func) + , port_(port) , pkt_(pkt) - , port_id_(port_id) {} void fire() const override { - func_(pkt_, port_id_); + const_cast*>(port_)->write(pkt_); } private: - Func func_; - Pkt pkt_; - uint32_t port_id_; -}; - 
-/////////////////////////////////////////////////////////////////////////////// - -class SimPortBase { -public: - typedef std::shared_ptr Ptr; - - virtual ~SimPortBase() {} - - SimObjectBase* module() const { - return module_; - } - - uint32_t port_id() const { - return port_id_; - } - - SimPortBase* peer() const { - return peer_; - } - - bool connected() const { - return (peer_ != nullptr); - } - - bool is_slave() const { - return is_slave_; - } - -protected: - - SimPortBase(SimObjectBase* module, bool is_slave); - - void connect(SimPortBase* peer) { - assert(peer_ == nullptr); - peer_ = peer; - } - - void disconnect() { - assert(peer_ == nullptr); - peer_ = nullptr; - } - - SimObjectBase* module_; - uint32_t port_id_; - bool is_slave_; - SimPortBase* peer_; - - template friend class MasterPort; -}; - -/////////////////////////////////////////////////////////////////////////////// - -template -class SlavePort : public SimPortBase { -public: - typedef std::shared_ptr> Ptr; - typedef std::function Func; - - static Ptr Create(SimObjectBase* module, const Func& func) { - return std::make_shared>(module, func); - } - - template - static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) { - return std::make_shared>(module, obj, entry); - } - - SlavePort(SimObjectBase* module, const Func& func) - : SimPortBase(module, true) - , func_(func) - {} - - template - SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) - : SimPortBase(module, true) - , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2)) - {} - - SlavePort(SimObjectBase* module, SlavePort* peer) - : SimPortBase(module, false) - { - this->connect(peer); - } - - void send(const Pkt& pkt, uint64_t delay) const; - - const Func& func() const { - return func_; - } - -protected: - SlavePort& operator=(const SlavePort&); - Func func_; -}; - -/////////////////////////////////////////////////////////////////////////////// - -template -class 
MasterPort : public SimPortBase { -public: - typedef std::shared_ptr> Ptr; - typedef std::function Func; - - static Ptr Create() { - return std::make_shared>(module); - } - - MasterPort(SimObjectBase* module) : SimPortBase(module, false) {} - - MasterPort(SimObjectBase* module, MasterPort* peer) - : SimPortBase(module, false) - { - peer->connect(this); - } - - void bind(SlavePort* peer) { - this->connect(peer); - } - - void unbind() { - peer_->disconnect(); - this->disconnect(); - } - - void send(const Pkt& pkt, uint64_t delay) const { - assert(peer_ != nullptr); - if (peer_->is_slave()) { - auto slave = reinterpret_cast*>(peer_); - slave->send(pkt, delay); - } else { - auto master = reinterpret_cast*>(peer_); - master->send(pkt, delay); - } - } - -private: - MasterPort& operator=(const MasterPort&); + const SimPort* port_; + Pkt pkt_; }; /////////////////////////////////////////////////////////////////////////////// @@ -237,25 +215,18 @@ public: template void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay); - virtual void step(uint64_t cycle) = 0; - const std::string& name() const { return name_; } protected: - SimObjectBase(const SimContext& ctx, const char* name); + virtual void step(uint64_t cycle) = 0; - uint32_t allocate_port(SimPortBase* port) { - uint32_t id = ports_.size(); - ports_.push_back(port); - return id; - } + SimObjectBase(const SimContext& ctx, const char* name); private: std::string name_; - std::vector ports_; friend class SimPlatform; friend class SimPortBase; @@ -320,20 +291,19 @@ public: } template - void schedule(const typename SimSimpleEvent::Func& callback, + void schedule(const typename SimCallEvent::Func& callback, const Pkt& pkt, uint64_t delay) { - auto evt = SimSimpleEvent::Create(callback, pkt, delay); + auto evt = SimCallEvent::Create(callback, pkt, delay); assert(delay != 0); events_.emplace_back(evt); } template - void schedule(const typename SimPortEvent::Func& callback, + void schedule(const 
SimPort* port, const Pkt& pkt, - uint32_t port_id, uint64_t delay) { - auto evt = SimPortEvent::Create(callback, pkt, port_id, delay); + auto evt = SimPortEvent::Create(port, pkt, delay); assert(delay != 0); events_.emplace_back(evt); } @@ -383,13 +353,6 @@ private: /////////////////////////////////////////////////////////////////////////////// -inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave) - : module_(module) - , port_id_(module->allocate_port(this)) - , is_slave_(is_slave) - , peer_(nullptr) -{} - inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) : name_(name) {} @@ -403,18 +366,11 @@ typename SimObject::Ptr SimObject::Create(Args&&... args) { } template -void SlavePort::send(const Pkt& pkt, uint64_t delay) const { - if (func_) { - SimPlatform::instance().schedule(func_, pkt, port_id_, delay); +void SimPort::send(const Pkt& pkt, uint64_t delay) const { + if (peer_) { + reinterpret_cast*>(peer_)->send(pkt, delay); } else { - assert(peer_ != nullptr); - if (peer_->is_slave()) { - auto slave = reinterpret_cast*>(peer_); - slave->send(pkt, delay); - } else { - auto master = reinterpret_cast*>(peer_); - master->send(pkt, delay); - } + SimPlatform::instance().schedule(this, pkt, delay); } } diff --git a/sim/simX/Makefile b/sim/simX/Makefile index e42464c6..75a4a495 100644 --- a/sim/simX/Makefile +++ b/sim/simX/Makefile @@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a TOP = vx_cache_sim SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp +SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) VPATH := $(sort $(dir $(SRCS))) diff --git a/sim/simX/cache.cpp b/sim/simX/cache.cpp index f139cb43..503d32c5 100644 --- a/sim/simX/cache.cpp +++ b/sim/simX/cache.cpp @@ 
-1,5 +1,6 @@ #include "cache.h" #include "debug.h" +#include "types.h" #include #include #include @@ -30,8 +31,7 @@ struct params_t { uint32_t offset_bits = config.B - config.W; uint32_t log2_bank_size = config.C - bank_bits; uint32_t index_bits = log2_bank_size - (config.B << config.A); - assert(log2_bank_size >= config.B); - + assert(log2_bank_size >= config.B); this->words_per_block = 1 << offset_bits; this->blocks_per_set = 1 << config.A; @@ -229,9 +229,10 @@ private: CacheConfig config_; params_t params_; std::vector banks_; - std::vector> core_reqs_; - std::pair mem_rsp_; std::vector> core_rsps_; + Switch::Ptr mem_switch_; + std::vector> mem_req_ports_; + std::vector> mem_rsp_ports_; public: Impl(Cache* simobject, const CacheConfig& config) @@ -239,16 +240,22 @@ public: , config_(config) , params_(config) , banks_(config.num_banks, {config, params_}) - , core_reqs_(config.num_inputs) , core_rsps_(config.num_inputs) - {} - - void handleMemResponse(const MemRsp& response, uint32_t) { - mem_rsp_ = {true, response}; - } - - void handleCoreRequest(const MemReq& request, uint32_t port_id) { - core_reqs_.at(port_id) = {true, request}; + , mem_req_ports_(config.num_banks, simobject) + , mem_rsp_ports_(config.num_banks, simobject) + { + if (config.num_banks > 1) { + mem_switch_ = Switch::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks); + for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { + mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i)); + mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i)); + } + mem_switch_->ReqOut.bind(&simobject->MemReqPort); + simobject->MemRspPort.bind(&mem_switch_->RspIn); + } else { + mem_req_ports_.at(0).bind(&simobject->MemReqPort); + simobject->MemRspPort.bind(&mem_rsp_ports_.at(0)); + } } void step(uint64_t /*cycle*/) { @@ -269,31 +276,29 @@ public: bank.mshr.try_pop(&active_req); } - // try schedule stall replay + // try schedule stall queue if MSHR has space if (!active_req.valid - && !bank.stall_buffer.empty()) 
{ + && !bank.stall_buffer.empty() + && !bank.mshr.full()) { active_req = bank.stall_buffer.front(); bank.stall_buffer.pop(); } } // handle memory fills - if (mem_rsp_.first) { - mem_rsp_.first = false; - auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15); - auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31); - this->processMemoryFill(bank_id, mshr_id); + for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) { + MemRsp mem_rsp; + if (mem_rsp_ports_.at(i).read(&mem_rsp)) { + this->processMemoryFill(i, mem_rsp.tag); + } } // handle incoming core requests - for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) { - auto& entry = core_reqs_.at(i); - if (!entry.first) + for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) { + MemReq core_req; + if (!simobject_->CoreReqPorts.at(i).read(&core_req)) continue; - - entry.first = false; - auto& core_req = entry.second; auto bank_id = params_.addr_bank_id(core_req.addr); auto set_id = params_.addr_set_id(core_req.addr); auto tag = params_.addr_tag(core_req.addr); @@ -417,7 +422,7 @@ public: mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag); mem_req.write = true; mem_req.tag = 0; - simobject_->MemReqPort.send(mem_req, 1); + mem_req_ports_.at(bank_id).send(mem_req, 1); } else { // mark block as dirty hit_block.dirty = true; @@ -438,7 +443,8 @@ public: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag); mem_req.write = true; - simobject_->MemReqPort.send(mem_req, 1); + mem_req.tag = 0; + mem_req_ports_.at(bank_id).send(mem_req, 1); } } @@ -449,7 +455,7 @@ public: mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); mem_req.write = true; mem_req.tag = 0; - simobject_->MemReqPort.send(mem_req, 1); + mem_req_ports_.at(bank_id).send(mem_req, 1); } // send core response for (auto& info : active_req.infos) { @@ -467,9 +473,8 @@ public: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); 
mem_req.write = active_req.write; - mem_req.tag = bit_setw(0, 0, 15, bank_id); - mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id); - simobject_->MemReqPort.send(mem_req, 1); + mem_req.tag = mshr_id; + mem_req_ports_.at(bank_id).send(mem_req, 1); } } } @@ -480,12 +485,12 @@ public: /////////////////////////////////////////////////////////////////////////////// Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) - : SimObject(ctx, name) - , impl_(new Impl(this, config)) - , CoreReqPorts(config.num_inputs, {this, impl_, &Cache::Impl::handleCoreRequest}) + : SimObject(ctx, name) + , CoreReqPorts(config.num_inputs, this) , CoreRspPorts(config.num_inputs, this) , MemReqPort(this) - , MemRspPort(this, impl_, &Impl::handleMemResponse) + , MemRspPort(this) + , impl_(new Impl(this, config)) {} Cache::~Cache() { diff --git a/sim/simX/cache.h b/sim/simX/cache.h index 1c0c82f6..58767d9f 100644 --- a/sim/simX/cache.h +++ b/sim/simX/cache.h @@ -20,11 +20,7 @@ struct CacheConfig { uint8_t latency; // pipeline latency }; -class Cache : public SimObject { -private: - class Impl; - Impl* impl_; - +class Cache : public SimObject { public: Cache(const SimContext& ctx, const char* name, const CacheConfig& config); ~Cache(); @@ -35,6 +31,10 @@ public: std::vector> CoreRspPorts; MasterPort MemReqPort; SlavePort MemRspPort; + +private: + class Impl; + Impl* impl_; }; } \ No newline at end of file diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index af0a4441..e1333dac 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -12,13 +12,13 @@ using namespace vortex; -Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) +Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) : SimObject(ctx, "Core") , id_(id) , arch_(arch) - , decoder_(decoder) - , mem_(mem) - , shared_mem_(1, SMEM_SIZE) + , decoder_(arch) + , mmu_(0, arch.wsize(), true) + , shared_mem_(4096) , warps_(arch.num_warps()) , 
barriers_(arch.num_barriers(), 0) , csrs_(arch.num_csrs(), 0) @@ -54,9 +54,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU DCACHE_MSHR_SIZE, // mshr 2, // pipeline latency })) - , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) - , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse) - , dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse}) + , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) , fetch_stage_("fetch") , decode_stage_("decode") , issue_stage_("issue") @@ -65,36 +63,34 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU , pending_icache_(arch_.num_warps()) , stalled_warps_(0) , last_schedule_wid_(0) - , pending_instrs_(0) + , issued_instrs_(0) + , committed_instrs_(0) , ebreak_(false) , stats_insts_(0) , stats_loads_(0) , stats_stores_(0) - , MemRspPort(this, &l1_mem_switch_->RspIn) - , MemReqPort(this, &l1_mem_switch_->ReqOut) + , MemRspPort(this) + , MemReqPort(this) { for (int i = 0; i < arch_.num_warps(); ++i) { warps_.at(i) = std::make_shared(this, i); } // register execute units + exe_units_.at((int)ExeType::NOP) = std::make_shared(this); exe_units_.at((int)ExeType::ALU) = std::make_shared(this); exe_units_.at((int)ExeType::LSU) = std::make_shared(this); exe_units_.at((int)ExeType::CSR) = std::make_shared(this); exe_units_.at((int)ExeType::FPU) = std::make_shared(this); exe_units_.at((int)ExeType::GPU) = std::make_shared(this); - // connect l1 caches - icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_); - for (int i = 0; i < arch_.num_threads(); ++i) { - dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i)); - } - // connect l1 switch icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]); l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort); l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort); + 
this->MemRspPort.bind(&l1_mem_switch_->RspIn); + l1_mem_switch_->ReqOut.bind(&this->MemReqPort); // activate warp0 warps_.at(0)->setTmask(0, true); @@ -109,31 +105,24 @@ Core::~Core() { } } -void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) { - // advance to decode stage - uint32_t wid = response.tag; - pipeline_state_t state; - pending_icache_.remove(wid, &state); - auto latency = (SimPlatform::instance().cycles() - state.icache_latency); - state.icache_latency = latency; - decode_stage_.push(state); +void Core::attach_ram(RAM* ram) { + // bind RAM to memory unit + mmu_.attach(*ram, 0, 0xFFFFFFFF); } void Core::step(uint64_t cycle) { - __unused (cycle); - D(2, "###########################################################"); - D(2, std::dec << "Core" << id_ << ": cycle: " << cycle); - - this->commit(); - this->execute(); - this->issue(); - this->decode(); - this->fetch(); + this->commit(cycle); + this->execute(cycle); + this->issue(cycle); + this->decode(cycle); + this->fetch(cycle); DPN(2, std::flush); } -void Core::warp_scheduler() { +void Core::warp_scheduler(uint64_t cycle) { + __unused (cycle); + bool foundSchedule = false; int scheduled_warp = last_schedule_wid_; @@ -159,53 +148,77 @@ void Core::warp_scheduler() { stats_insts_ += warp->getActiveThreads(); pipeline_state_t state; + state.clear(); + state.id = (issued_instrs_++ * arch_.num_cores()) + id_; + warp->eval(&state); - D(4, state); + DT(3, cycle, "pipeline-schedule: " << state); - // advance to fetch stage - ++pending_instrs_; + // advance to fetch stage fetch_stage_.push(state); } -void Core::fetch() { - // schedule icache request - pipeline_state_t state; - if (fetch_stage_.try_pop(&state)) { - state.icache_latency = SimPlatform::instance().cycles(); - MemReq mem_req; - mem_req.addr = state.PC; - mem_req.write = false; - mem_req.tag = pending_icache_.allocate(state); - icache_->CoreReqPorts.at(0).send(mem_req, 1); +void Core::fetch(uint64_t cycle) { + // handle icache 
reponse + { + MemRsp mem_rsp; + if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){ + pipeline_state_t state; + pending_icache_.remove(mem_rsp.tag, &state); + auto latency = (SimPlatform::instance().cycles() - state.icache_latency); + state.icache_latency = latency; + decode_stage_.push(state); + DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state); + } + } + + // send icache request + { + pipeline_state_t state; + if (fetch_stage_.try_pop(&state)) { + state.icache_latency = SimPlatform::instance().cycles(); + MemReq mem_req; + mem_req.addr = state.PC; + mem_req.write = false; + mem_req.tag = pending_icache_.allocate(state); + icache_->CoreReqPorts.at(0).send(mem_req, 1); + DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state); + } } // schedule next warp - this->warp_scheduler(); + this->warp_scheduler(cycle); } -void Core::decode() { +void Core::decode(uint64_t cycle) { + __unused (cycle); + pipeline_state_t state; if (!decode_stage_.try_pop(&state)) return; - if (state.stall_warp) { - D(3, "*** warp#" << state.wid << " fetch stalled"); - } else { - // release warp + // release warp + if (!state.stall_warp) { stalled_warps_.reset(state.wid); } + + DT(3, cycle, "pipeline-decode: " << state); // advance to issue stage issue_stage_.push(state); } -void Core::issue() { +void Core::issue(uint64_t cycle) { + __unused (cycle); + if (!issue_stage_.empty()) { // insert to ibuffer auto& state = issue_stage_.top(); auto& ibuffer = ibuffers_.at(state.wid); - if (!ibuffer.full()) { + if (ibuffer.full()) { + DT(3, cycle, "*** ibuffer-stall: " << state); + } else { ibuffer.push(state); issue_stage_.pop(); } @@ -219,8 +232,18 @@ void Core::issue() { auto& state = ibuffer.top(); // check scoreboard - if (scoreboard_.in_use(state)) + if (scoreboard_.in_use(state)) { + DTH(3, cycle, "*** scoreboard-stall: dependents={"); + auto owners = scoreboard_.owners(state); + for (uint32_t i = 0, 
n = owners.size(); i < n; ++i) { + if (i) DTN(3, ", "); + DTN(3, "#" << owners.at(i)); + } + DTN(3, "}, " << state << std::endl); continue; + } + + DT(3, cycle, "pipeline-issue: " << state); // update scoreboard scoreboard_.reserve(state); @@ -233,18 +256,19 @@ void Core::issue() { } } -void Core::execute() { +void Core::execute(uint64_t cycle) { // process stage inputs if (!execute_stage_.empty()) { auto& state = execute_stage_.top(); auto& exe_unit = exe_units_.at((int)state.exe_type); exe_unit->push_input(state); execute_stage_.pop(); + DT(3, cycle, "pipeline-execute: " << state); } // advance execute units for (auto& exe_unit : exe_units_) { - exe_unit->step(); + exe_unit->step(cycle); } // commit completed instructions @@ -255,18 +279,29 @@ void Core::execute() { stalled_warps_.reset(state.wid); } // advance to commit stage - commit_stage_.push(state); + commit_stage_.push(state); } } } -void Core::commit() { +void Core::commit(uint64_t cycle) { + __unused (cycle); + pipeline_state_t state; if (!commit_stage_.try_pop(&state)) return; + DT(3, cycle, "pipeline-commit: " << state); + // update scoreboard scoreboard_.release(state); + + assert(committed_instrs_ <= issued_instrs_); + ++committed_instrs_; +} + +bool Core::running() const { + return (committed_instrs_ != issued_instrs_); } Word Core::get_csr(Addr addr, int tid, int wid) { @@ -349,9 +384,9 @@ void Core::barrier(int bar_id, int count, int warp_id) { barrier.reset(); } -Word Core::icache_fetch(Addr addr) { +Word Core::icache_read(Addr addr, Size size) { Word data; - mem_.read(&data, addr, sizeof(Word), 0); + mmu_.read(&data, addr, size, 0); return data; } @@ -365,7 +400,7 @@ Word Core::dcache_read(Addr addr, Size size) { return data; } #endif - mem_.read(&data, addr, size, 0); + mmu_.read(&data, addr, size, 0); return data; } @@ -383,11 +418,7 @@ void Core::dcache_write(Addr addr, Word data, Size size) { this->writeToStdOut(addr, data); return; } - mem_.write(&data, addr, size, 0); -} - -bool 
Core::running() const { - return pending_instrs_; + mmu_.write(&data, addr, size, 0); } void Core::printStats() const { @@ -399,7 +430,7 @@ void Core::printStats() const { void Core::writeToStdOut(Addr addr, Word data) { uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1); - auto& ss_buf = print_bufs_.at(tid); + auto& ss_buf = print_bufs_[tid]; char c = (char)data; ss_buf << c; if (c == '\n') { diff --git a/sim/simX/core.h b/sim/simX/core.h index 913db4a6..ea1a6582 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -25,9 +25,11 @@ namespace vortex { class Core : public SimObject { public: - Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id); + Core(const SimContext& ctx, const ArchDef &arch, Word id); ~Core(); + void attach_ram(RAM* ram); + bool running() const; void step(uint64_t cycle); @@ -64,7 +66,7 @@ public: void barrier(int bar_id, int count, int warp_id); - Word icache_fetch(Addr); + Word icache_read(Addr, Size); Word dcache_read(Addr, Size); @@ -76,22 +78,21 @@ public: private: - void fetch(); - void decode(); - void issue(); - void execute(); - void commit(); + void fetch(uint64_t cycle); + void decode(uint64_t cycle); + void issue(uint64_t cycle); + void execute(uint64_t cycle); + void commit(uint64_t cycle); - void warp_scheduler(); - - void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id); + void warp_scheduler(uint64_t cycle); void writeToStdOut(Addr addr, Word data); Word id_; - const ArchDef& arch_; - const Decoder& decoder_; - MemoryUnit& mem_; + const ArchDef arch_; + const Decoder decoder_; + MemoryUnit mmu_; + #ifdef SM_ENABLE RAM shared_mem_; #endif @@ -106,8 +107,6 @@ private: Cache::Ptr icache_; Cache::Ptr dcache_; Switch::Ptr l1_mem_switch_; - SlavePort icache_rsp_port_; - std::vector> dcache_rsp_port_; PipelineStage fetch_stage_; PipelineStage decode_stage_; @@ -118,10 +117,12 @@ private: HashTable pending_icache_; WarpMask stalled_warps_; uint32_t last_schedule_wid_; - 
uint32_t pending_instrs_; + uint32_t issued_instrs_; + uint32_t committed_instrs_; bool ebreak_; std::unordered_map print_bufs_; + uint64_t stats_insts_; uint64_t stats_loads_; uint64_t stats_stores_; diff --git a/sim/simX/debug.h b/sim/simX/debug.h index ad7fd16f..53d2d62a 100644 --- a/sim/simX/debug.h +++ b/sim/simX/debug.h @@ -7,14 +7,15 @@ #define DEBUG_HEADER << "DEBUG " //#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " +#define TRACE_HEADER << "TRACE " +//#define TRACE_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " + #ifndef NDEBUG #include #include -#define DX(x) x - -#define D(lvl, x) do { \ +#define DP(lvl, x) do { \ if ((lvl) <= DEBUG_LEVEL) { \ std::cout DEBUG_HEADER << x << std::endl; \ } \ @@ -32,12 +33,33 @@ } \ } while(0) +#define DT(lvl, t, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \ + } \ +} while(0) + +#define DTH(lvl, t, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \ + } \ +} while(0) + +#define DTN(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout << x; \ + } \ +} while(0) + + #else -#define DX(x) -#define D(lvl, x) do {} while(0) +#define DP(lvl, x) do {} while(0) #define DPH(lvl, x) do {} while(0) #define DPN(lvl, x) do {} while(0) -#define D_RAW(x) do {} while(0) + +#define DT(lvl, t, x) do {} while(0) +#define DTH(lvl, t, x) do {} while(0) +#define DTN(lvl, x) do {} while(0) #endif \ No newline at end of file diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index 3c76231f..6530d223 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -194,47 +194,26 @@ static const char* op_string(const Instr &instr) { namespace vortex { std::ostream &operator<<(std::ostream &os, const Instr &instr) { os << op_string(instr) << ": "; - auto opcode = instr.getOpcode(); - - auto rd_to_string = 
[&]() { - int rdt = instr.getRDType(); - int rd = instr.getRDest(); - switch (rdt) { - case 1: os << "r" << std::dec << rd << " <- "; break; - case 2: os << "fr" << std::dec << rd << " <- "; break; - case 3: os << "vr" << std::dec << rd << " <- "; break; - default: break; - } - }; - - auto rs_to_string = [&](int i) { - int rst = instr.getRSType(i); - int rs = instr.getRSrc(i); - switch (rst) { - case 1: os << "r" << std::dec << rs; break; - case 2: os << "fr" << std::dec << rs; break; - case 3: os << "vr" << std::dec << rs; break; - default: break; - } - }; - + auto opcode = instr.getOpcode(); if (opcode == S_INST || opcode == FS || opcode == VS) { os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- "; - rs_to_string(1); + os << instr.getRSType(1) << std::dec << instr.getRSrc(1); } else if (opcode == L_INST || opcode == FL || opcode == VL) { - rd_to_string(); + os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]"; } else { - rd_to_string(); + if (instr.getRDType() != RegType::None) { + os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; + } int i = 0; for (; i < instr.getNRSrc(); ++i) { if (i) os << ", "; - rs_to_string(i); + os << instr.getRSType(i) << std::dec << instr.getRSrc(i); } if (instr.hasImm()) { if (i) os << ", "; @@ -281,7 +260,7 @@ Decoder::Decoder(const ArchDef &arch) { v_imm_mask_ = 0x7ff; } -std::shared_ptr Decoder::decode(Word code, Word PC) const { +std::shared_ptr Decoder::decode(Word code) const { auto instr = std::make_shared(); Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); instr->setOpcode(op); @@ -297,8 +276,8 @@ std::shared_ptr Decoder::decode(Word code, Word PC) const { auto op_it = sc_instTable.find(op); if (op_it == sc_instTable.end()) { - std::cout << std::hex << "invalid opcode: 0x" << op << ", instruction=0x" << code << ", PC=" << PC << std::endl; - 
std::abort(); + std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl; + return nullptr; } auto iType = op_it->second.iType; @@ -459,7 +438,5 @@ std::shared_ptr Decoder::decode(Word code, Word PC) const { std::abort(); } - D(2, "Instr 0x" << std::hex << code << ": " << *instr << std::flush); - return instr; } diff --git a/sim/simX/decode.h b/sim/simX/decode.h index d4f9f976..e481cb28 100644 --- a/sim/simX/decode.h +++ b/sim/simX/decode.h @@ -13,7 +13,7 @@ class Decoder { public: Decoder(const ArchDef &); - std::shared_ptr decode(Word code, Word PC) const; + std::shared_ptr decode(Word code) const; private: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index 602f7f3a..ff705d82 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -75,11 +75,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { if (num_rsrcs) { for (int i = 0; i < num_rsrcs; ++i) { DPH(2, "Src Reg [" << std::dec << i << "]: "); - int type = instr.getRSType(i); + auto type = instr.getRSType(i); int reg = instr.getRSrc(i); switch (type) { - case 1: - DPH(2, "r" << std::dec << reg << "={"); + case RegType::Integer: + DPN(2, "r" << std::dec << reg << "={"); for (int t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!tmask_.test(t)) { @@ -91,8 +91,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } DPN(2, "}" << std::endl); break; - case 2: - DPH(2, "fr" << std::dec << reg << "={"); + case RegType::Float: + DPN(2, "fr" << std::dec << reg << "={"); for (int t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!tmask_.test(t)) { @@ -105,6 +105,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { DPN(2, "}" << std::endl); break; default: + std::abort(); break; } } @@ -415,7 +416,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case L_INST: pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.load = 0; + pipeline_state->lsu.type = 
LsuType::LOAD; pipeline_state->used_iregs[rsrc0] = 1; pipeline_state->mem_addrs.resize(num_threads); for (int t = 0; t < num_threads; ++t) { @@ -425,7 +426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; Word data_read = core_->dcache_read(memAddr, 4); pipeline_state->mem_addrs.at(t) = memAddr; - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); switch (func3) { case 0: // LBI @@ -455,7 +456,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case S_INST: pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.store = 1; + pipeline_state->lsu.type = LsuType::STORE; pipeline_state->used_iregs[rsrc0] = 1; pipeline_state->used_iregs[rsrc1] = 1; pipeline_state->mem_addrs.resize(num_threads); @@ -464,7 +465,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { continue; Word memAddr = rsdata[t][0] + immsrc; pipeline_state->mem_addrs.at(t) = memAddr; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (func3) { case 0: // SB @@ -543,12 +544,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case FENCE: pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.fence = 1; + pipeline_state->lsu.type = LsuType::FENCE; pipeline_state->stall_warp = true; break; case (FL | VL): pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.load = 1; + pipeline_state->lsu.type = LsuType::LOAD; pipeline_state->used_iregs[rsrc0] = 1; if (func3 == 0x2) { pipeline_state->mem_addrs.resize(num_threads); @@ -558,14 +559,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word memAddr = rsdata[t][0] + immsrc; pipeline_state->mem_addrs.at(t) = memAddr; Word data_read = 
core_->dcache_read(memAddr, 4); - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); rddata[t] = data_read; } } else { - D(3, "Executing vector load"); - D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); - D(3, "dest: v" << rdest); - D(3, "width" << instr.getVlsWidth()); + DP(3, "Executing vector load"); + DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); + DP(3, "dest: v" << rdest); + DP(3, "width" << instr.getVlsWidth()); pipeline_state->mem_addrs.resize(vl_); auto &vd = vRegFile_.at(rdest); switch (instr.getVlsWidth()) { @@ -574,9 +575,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); pipeline_state->mem_addrs.at(i) = memAddr; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); Word data_read = core_->dcache_read(memAddr, 4); - D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); int *result_ptr = (int *)(vd.data() + i); *result_ptr = data_read; } @@ -590,7 +591,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case (FS | VS): pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.store = 1; + pipeline_state->lsu.type = LsuType::STORE; pipeline_state->used_iregs[rsrc0] = 1; pipeline_state->used_iregs[rsrc1] = 1; if (func3 == 0x2) { @@ -601,20 +602,20 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word memAddr = rsdata[t][0] + immsrc; pipeline_state->mem_addrs.at(t) = memAddr; core_->dcache_write(memAddr, rsdata[t][1], 4); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE 
MEM: ADDRESS=0x" << std::hex << memAddr); } } else { pipeline_state->mem_addrs.resize(vl_); for (int i = 0; i < vl_; i++) { Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); pipeline_state->mem_addrs.at(i) = memAddr; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (instr.getVlsWidth()) { case 6: { //store word and unit strided (not checking for unit stride) uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); core_->dcache_write(memAddr, value, 4); - D(3, "store: " << memAddr << " value:" << value); + DP(3, "store: " << memAddr << " value:" << value); } break; default: std::abort(); @@ -705,9 +706,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { // FMV.X.W rddata[t] = rsdata[t][0]; - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; - } + } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; break; case 0x50: switch(func3) { @@ -783,132 +784,138 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } rd_write = true; break; - case GPGPU: - pipeline_state->exe_type = ExeType::GPU; + case GPGPU: { + pipeline_state->exe_type = ExeType::GPU; + int ts = 0; for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - switch (func3) { - case 0: { - // TMC - pipeline_state->gpu.type = GpuType::TMC; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; - if (rsrc1) { - // predicate mode - ThreadMask pred; - for (int i = 0; i < num_threads; ++i) { - pred[i] = tmask_.test(i) ? 
(iRegFile_.at(i).at(rsrc0) != 0) : 0; - } - if (pred.any()) { - tmask_ &= pred; - } - } else { - tmask_.reset(); - for (int i = 0; i < num_threads; ++i) { - tmask_.set(i, rsdata.at(t)[0] & (1 << i)); - } - } - D(3, "*** TMC " << tmask_); - active_ = tmask_.any(); - break; // runOnce - } break; - case 1: { - // WSPAWN - pipeline_state->gpu.type = GpuType::WSPAWN; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; - int active_warps = std::min(rsdata.at(t)[0], core_->arch().num_warps()); - D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]); - for (int i = 1; i < active_warps; ++i) { - Warp &newWarp = core_->warp(i); - newWarp.setPC(rsdata[t][1]); - newWarp.setTmask(0, true); - } - break; // runOnce - } break; - case 2: { - // SPLIT - pipeline_state->gpu.type = GpuType::SPLIT; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; - if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { - ThreadMask tmask; - for (int i = 0; i < num_threads; ++i) { - tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); - } - - DomStackEntry e(tmask, nextPC); - domStack_.push(tmask_); - domStack_.push(e); - for (size_t i = 0; i < e.tmask.size(); ++i) { - tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); - } - active_ = tmask_.any(); - - DPH(3, "*** Split: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); - DPN(3, ", Pushed TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1)); - DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); - } else { - D(3, "*** Unanimous pred"); - DomStackEntry e(tmask_); - e.unanimous = true; - domStack_.push(e); - } - break; // runOnce - } break; - case 3: { - // JOIN - pipeline_state->gpu.type = GpuType::JOIN; - pipeline_state->stall_warp = true; - if (!domStack_.empty() && domStack_.top().unanimous) { - D(3, "*** Uninimous branch at join"); - tmask_ = 
domStack_.top().tmask; - active_ = tmask_.any(); - domStack_.pop(); - } else { - if (!domStack_.top().fallThrough) { - nextPC = domStack_.top().PC; - D(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); - } - - tmask_ = domStack_.top().tmask; - active_ = tmask_.any(); - - DPH(3, "*** Join: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); - DPN(3, "\n"); - - domStack_.pop(); - } - break; // runOnce - } break; - case 4: { - // BAR - pipeline_state->gpu.type = GpuType::BAR; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; - active_ = false; - core_->barrier(rsdata[t][0], rsdata[t][1], id_); - break; // runOnce - } break; - case 6: { - // PREFETCH - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.prefetch = 1; - pipeline_state->used_iregs[rsrc0] = 1; - int addr = rsdata[t][0]; - printf("*** PREFETCHED %d ***\n", addr); - } break; - default: - std::abort(); + if (tmask_.test(t)) { + ts = t; + break; } } - break; + switch (func3) { + case 0: { + // TMC + pipeline_state->gpu.type = GpuType::TMC; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; + if (rsrc1) { + // predicate mode + ThreadMask pred; + for (int i = 0; i < num_threads; ++i) { + pred[i] = tmask_.test(i) ? 
(iRegFile_.at(i).at(rsrc0) != 0) : 0; + } + if (pred.any()) { + tmask_ &= pred; + } + } else { + tmask_.reset(); + for (int i = 0; i < num_threads; ++i) { + tmask_.set(i, rsdata.at(ts)[0] & (1 << i)); + } + } + DPH(3, "*** New TMC: "); + for (int i = 0; i < num_threads; ++i) + DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, std::endl); + + active_ = tmask_.any(); + } break; + case 1: { + // WSPAWN + pipeline_state->gpu.type = GpuType::WSPAWN; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; + int active_warps = std::min(rsdata.at(ts)[0], core_->arch().num_warps()); + DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]); + for (int i = 1; i < active_warps; ++i) { + Warp &newWarp = core_->warp(i); + newWarp.setPC(rsdata[ts][1]); + newWarp.setTmask(0, true); + } + } break; + case 2: { + // SPLIT + pipeline_state->gpu.type = GpuType::SPLIT; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; + if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { + ThreadMask tmask; + for (int i = 0; i < num_threads; ++i) { + tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); + } + + DomStackEntry e(tmask, nextPC); + domStack_.push(tmask_); + domStack_.push(e); + for (size_t i = 0; i < e.tmask.size(); ++i) { + tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); + } + active_ = tmask_.any(); + + DPH(3, "*** Split: New TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, ", Pushed TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1)); + DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); + } else { + DP(3, "*** Unanimous pred"); + DomStackEntry e(tmask_); + e.unanimous = true; + domStack_.push(e); + } + } break; + case 3: { + // JOIN + pipeline_state->gpu.type = GpuType::JOIN; + pipeline_state->stall_warp = true; + if (!domStack_.empty() && domStack_.top().unanimous) { + DP(3, "*** 
Uninimous branch at join"); + tmask_ = domStack_.top().tmask; + active_ = tmask_.any(); + domStack_.pop(); + } else { + if (!domStack_.top().fallThrough) { + nextPC = domStack_.top().PC; + DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); + } + + tmask_ = domStack_.top().tmask; + active_ = tmask_.any(); + + DPH(3, "*** Join: New TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, "\n"); + + domStack_.pop(); + } + } break; + case 4: { + // BAR + pipeline_state->gpu.type = GpuType::BAR; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; + active_ = false; + core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); + } break; + case 6: { + // PREFETCH + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.type = LsuType::PREFETCH; + pipeline_state->used_iregs[rsrc0] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + int addr = rsdata[t][0]; + printf("*** PREFETCHED %d ***\n", addr); + } + } break; + default: + std::abort(); + } + } break; case VSET: { int VLEN = core_->arch().vsize() * 8; int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); @@ -928,7 +935,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); + DP(3, "Adding " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } @@ -940,7 +947,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); + DP(3, "Adding " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + 
i) = result; } } @@ -952,7 +959,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); + DP(3, "Adding " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -968,7 +975,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -976,7 +983,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -984,7 +991,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -999,7 +1006,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first != second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1007,7 +1014,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1015,7 +1022,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1030,7 +1037,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1038,7 +1045,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first < second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1046,7 +1053,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1061,7 +1068,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int8_t first = *(int8_t *)(vr1.data() + i); int8_t second = *(int8_t *)(vr2.data() + i); int8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1069,7 +1076,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int16_t first = *(int16_t *)(vr1.data() + i); int16_t second = *(int16_t *)(vr2.data() + i); int16_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1077,7 +1084,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int32_t first = *(int32_t *)(vr1.data() + i); int32_t second = *(int32_t *)(vr2.data() + i); int32_t result = (first < second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int32_t *)(vd.data() + i) = result; } } @@ -1092,7 +1099,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1100,7 +1107,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1108,7 +1115,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1123,7 +1130,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int8_t first = *(int8_t *)(vr1.data() + i); int8_t second = *(int8_t *)(vr2.data() + i); int8_t result = (first <= second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1131,7 +1138,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int16_t first = *(int16_t *)(vr1.data() + i); int16_t second = *(int16_t *)(vr2.data() + i); int16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1139,7 +1146,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int32_t first = *(int32_t *)(vr1.data() + i); int32_t second = *(int32_t *)(vr2.data() + i); int32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int32_t *)(vd.data() + i) = result; } } @@ -1154,7 +1161,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1162,7 +1169,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1170,7 +1177,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1185,7 +1192,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int8_t first = *(int8_t *)(vr1.data() + i); int8_t second = *(int8_t *)(vr2.data() + i); int8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1193,7 +1200,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int16_t first = *(int16_t *)(vr1.data() + i); int16_t second = *(int16_t *)(vr2.data() + i); int16_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1201,7 +1208,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int32_t first = *(int32_t *)(vr1.data() + i); int32_t second = *(int32_t *)(vr2.data() + i); int32_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int32_t *)(vd.data() + i) = result; } } @@ -1222,7 +1229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1235,7 +1242,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1248,7 +1255,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1268,7 +1275,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1281,7 +1288,7 @@ void Warp::execute(const Instr &instr, 
pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1294,7 +1301,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1314,7 +1321,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1327,7 +1334,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1340,7 +1347,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, 
"Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1360,7 +1367,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1373,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1386,7 +1393,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1406,7 +1413,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1419,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t 
first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1432,7 +1439,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1452,7 +1459,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1465,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1478,7 +1485,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = 
" << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1498,7 +1505,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1511,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1524,7 +1531,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1544,7 +1551,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1557,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value 
= (second & 0x1); uint16_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1570,7 +1577,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1588,7 +1595,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1599,7 +1606,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1610,7 +1617,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t 
*)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1628,7 +1635,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { @@ -1639,7 +1646,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { @@ -1650,7 +1657,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { @@ -1669,7 +1676,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (rsdata[i][0] + second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1679,7 +1686,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint16_t second = 
*(uint16_t *)(vr2.data() + i); uint16_t result = (rsdata[i][0] + second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1689,7 +1696,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (rsdata[i][0] + second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1705,7 +1712,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (rsdata[i][0] * second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1715,7 +1722,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (rsdata[i][0] * second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1725,7 +1732,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (rsdata[i][0] * second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << 
second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1741,7 +1748,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { vtype_.vsew = instr.getVsew(); vtype_.vlmul = instr.getVlmul(); - D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX); + DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX); int s0 = rsdata[0][0]; if (s0 <= VLMAX) { @@ -1762,46 +1769,49 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } if (rd_write) { + pipeline_state->wb = true; DPH(2, "Dest Reg: "); - int rdt = instr.getRDType(); + auto rdt = instr.getRDType(); switch (rdt) { - case 1: + case RegType::Integer: if (rdest) { - DPH(2, "r" << std::dec << rdest << "={"); + DPN(2, "r" << std::dec << rdest << "={"); for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - iRegFile_.at(t)[rdest] = rddata[t]; if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + iRegFile_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); pipeline_state->used_iregs[rdest] = 1; } break; - case 2: - DPH(2, "fr" << std::dec << rdest << "={"); + case RegType::Float: + DPN(2, "fr" << std::dec << rdest << "={"); for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - fRegFile_.at(t)[rdest] = rddata[t]; if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + fRegFile_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); pipeline_state->used_fregs[rdest] = 1; break; - case 3: - pipeline_state->used_vregs[rdest] = 1; - break; default: + std::abort(); break; } } PC_ += core_->arch().wsize(); if (PC_ != nextPC) { - D(3, "*** Next PC: " << std::hex << nextPC << std::dec); + DP(3, "*** Next PC: " 
<< std::hex << nextPC << std::dec); PC_ = nextPC; } } diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp index 5cdf22f3..ba280812 100644 --- a/sim/simX/exeunit.cpp +++ b/sim/simX/exeunit.cpp @@ -9,6 +9,17 @@ using namespace vortex; +NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} + +void NopUnit::step(uint64_t /*cycle*/) { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + this->schedule_output(state, 1); +} + +/////////////////////////////////////////////////////////////////////////////// + LsuUnit::LsuUnit(Core* core) : ExeUnit("LSU") , core_(core) @@ -17,61 +28,77 @@ LsuUnit::LsuUnit(Core* core) , fence_lock_(false) {} -void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) { - auto entry = pending_dcache_.at(response.tag); - entry.second.reset(port_id); // track remaining blocks - if (!entry.second.any()) { - auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); - entry.first.dcache_latency = latency; - this->schedule_output(entry.first, 1); - pending_dcache_.release(response.tag); - } -} +void LsuUnit::step(uint64_t cycle) { + __unused (cycle); + + // handle dcache response + for (uint32_t t = 0; t < num_threads_; ++t) { + MemRsp mem_rsp; + if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp)) + continue; + auto& entry = pending_dcache_.at(mem_rsp.tag); + DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first); + assert(entry.second.test(t)); + entry.second.reset(t); // track remaining blocks + if (!entry.second.any()) { + auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); + entry.first.dcache_latency = latency; + this->schedule_output(entry.first, 1); + pending_dcache_.release(mem_rsp.tag); + } + } -void LsuUnit::step() { if (fence_lock_) { // wait for all pending memory operations to complete if (!pending_dcache_.empty()) return; 
this->schedule_output(fence_state_, 1); fence_lock_ = false; + DT(3, cycle, "fence-unlock: " << fence_state_); } + // check input queue if (inputs_.empty()) return; auto state = inputs_.top(); - if (state.lsu.fence) { + if (state.lsu.type == LsuType::FENCE) { // schedule fence lock fence_state_ = state; fence_lock_ = true; inputs_.pop(); + DT(3, cycle, "fence-lock: " << state); return; } - // send dcache requests - if (!pending_dcache_.full()) { - state.dcache_latency = SimPlatform::instance().cycles(); - auto tag = pending_dcache_.allocate({state, state.tmask}); - for (uint32_t t = 0; t < num_threads_; ++t) { - if (!state.tmask.test(t)) - continue; - MemReq mem_req; - mem_req.addr = state.mem_addrs.at(t); - mem_req.write = state.lsu.store; - mem_req.tag = tag; - core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); - } - inputs_.pop(); + // check pending queue capacity + if (pending_dcache_.full()) { + DT(3, cycle, "*** lsu-queue-stall: " << state); + return; } + + // send dcache request + state.dcache_latency = SimPlatform::instance().cycles(); + auto tag = pending_dcache_.allocate({state, state.tmask}); + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!state.tmask.test(t)) + continue; + MemReq mem_req; + mem_req.addr = state.mem_addrs.at(t); + mem_req.write = (state.lsu.type == LsuType::STORE); + mem_req.tag = tag; + core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); + DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state); + } + inputs_.pop(); } /////////////////////////////////////////////////////////////////////////////// AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} -void AluUnit::step() { +void AluUnit::step(uint64_t /*cycle*/) { pipeline_state_t state; if (!inputs_.try_pop(&state)) return; @@ -95,7 +122,7 @@ void AluUnit::step() { CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} -void CsrUnit::step() { +void CsrUnit::step(uint64_t /*cycle*/) { pipeline_state_t 
state; if (!inputs_.try_pop(&state)) return; @@ -106,7 +133,7 @@ void CsrUnit::step() { FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} -void FpuUnit::step() { +void FpuUnit::step(uint64_t /*cycle*/) { pipeline_state_t state; if (!inputs_.try_pop(&state)) return; @@ -133,7 +160,7 @@ void FpuUnit::step() { GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {} -void GpuUnit::step() { +void GpuUnit::step(uint64_t /*cycle*/) { pipeline_state_t state; if (!inputs_.try_pop(&state)) return; diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h index 915089d3..3b2bbf91 100644 --- a/sim/simX/exeunit.h +++ b/sim/simX/exeunit.h @@ -43,7 +43,16 @@ public: return outputs_.try_pop(state); } - virtual void step() = 0; + virtual void step(uint64_t cycle) = 0; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class NopUnit : public ExeUnit { +public: + NopUnit(Core*); + + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -59,9 +68,7 @@ private: public: LsuUnit(Core*); - void handleCacheReponse(const MemRsp& response, uint32_t port_id); - - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -70,7 +77,7 @@ class AluUnit : public ExeUnit { public: AluUnit(Core*); - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -79,7 +86,7 @@ class CsrUnit : public ExeUnit { public: CsrUnit(Core*); - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -88,7 +95,7 @@ class FpuUnit : public ExeUnit { public: FpuUnit(Core*); - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -97,7 +104,7 @@ class GpuUnit : public ExeUnit { public: GpuUnit(Core*); - void step(); + void step(uint64_t cycle); }; } \ No newline at 
end of file diff --git a/sim/simX/instr.h b/sim/simX/instr.h index 1a205478..5deace6c 100644 --- a/sim/simX/instr.h +++ b/sim/simX/instr.h @@ -53,22 +53,23 @@ public: : opcode_(Opcode::NOP) , num_rsrcs_(0) , has_imm_(false) + , rdest_type_(RegType::None) , rdest_(0) , func3_(0) , func7_(0) { for (int i = 0; i < MAX_REG_SOURCES; ++i) { - rsrc_type_[i] = 0; + rsrc_type_[i] = RegType::None; } } /* Setters used to "craft" the instruction. */ void setOpcode(Opcode opcode) { opcode_ = opcode; } - void setDestReg(int destReg) { rdest_type_ = 1; rdest_ = destReg; } - void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = 1; rsrc_[num_rsrcs_++] = srcReg; } - void setDestFReg(int destReg) { rdest_type_ = 2; rdest_ = destReg; } - void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = 2; rsrc_[num_rsrcs_++] = srcReg; } - void setDestVReg(int destReg) { rdest_type_ = 3; rdest_ = destReg; } - void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = 3; rsrc_[num_rsrcs_++] = srcReg; } + void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; } + void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; } + void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; } + void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; } + void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; } + void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; } void setFunc3(Word func3) { func3_ = func3; } void setFunc7(Word func7) { func7_ = func7; } void setImm(Word imm) { has_imm_ = true; imm_ = imm; } @@ -89,9 +90,9 @@ public: Word getFunc7() const { return func7_; } int getNRSrc() const { return num_rsrcs_; } int getRSrc(int i) const { return rsrc_[i]; } - int getRSType(int i) const { return rsrc_type_[i]; } + RegType getRSType(int i) const { return rsrc_type_[i]; } int getRDest() const { return 
rdest_; } - int getRDType() const { return rdest_type_; } + RegType getRDType() const { return rdest_type_; } bool hasImm() const { return has_imm_; } Word getImm() const { return imm_; } Word getVlsWidth() const { return vlsWidth_; } @@ -112,15 +113,15 @@ private: Opcode opcode_; int num_rsrcs_; bool has_imm_; - int rdest_type_; + RegType rdest_type_; Word imm_; - int rsrc_type_[MAX_REG_SOURCES]; + RegType rsrc_type_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES]; int rdest_; Word func3_; Word func6_; - //Vector + // Vector Word vmask_; Word vlsWidth_; Word vMop_; diff --git a/sim/simX/main.cpp b/sim/simX/main.cpp index a34ada0e..a0e07faf 100644 --- a/sim/simX/main.cpp +++ b/sim/simX/main.cpp @@ -6,12 +6,15 @@ #include #include #include "processor.h" +#include #include "args.h" +#define RAM_PAGE_SIZE 4096 + using namespace vortex; int main(int argc, char **argv) { - int ret; + int exitcode; std::string archStr("rv32imf"); std::string imgFileName; @@ -53,11 +56,42 @@ int main(int argc, char **argv) { { ArchDef arch(archStr, num_cores, num_warps, num_threads); + Processor processor(arch); - ret = processor.run(imgFileName, riscv_test, showStats); + + RAM ram(RAM_PAGE_SIZE); + + { + std::string program_ext(fileExtension(imgFileName.c_str())); + if (program_ext == "bin") { + ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR); + } else if (program_ext == "hex") { + ram.loadHexImage(imgFileName.c_str()); + } else { + std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; + return -1; + } + } + + processor.attach_ram(&ram); + + exitcode = processor.run(); + + if (riscv_test) { + if (1 == exitcode) { + std::cout << "Passed." << std::endl; + exitcode = 0; + } else { + std::cout << "Failed." 
<< std::endl; + } + } else { + if (exitcode != 0) { + std::cout << "*** error: exitcode=" << exitcode << std::endl; + } + } } SimPlatform::instance().finalize(); - return ret; + return exitcode; } diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp index c377972d..63ba571a 100644 --- a/sim/simX/memsim.cpp +++ b/sim/simX/memsim.cpp @@ -8,32 +8,26 @@ using namespace vortex; class MemSim::Impl { private: MemSim* simobject_; - std::vector> inputs_; + uint32_t num_banks_; uint32_t latency_; public: Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) : simobject_(simobject) - , inputs_(num_banks) + , num_banks_(num_banks) , latency_(latency) {} - void handleMemRequest(const MemReq& mem_req, uint32_t port_id) { - inputs_.at(port_id).push(mem_req); - } - void step(uint64_t /*cycle*/) { - for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) { - auto& queue = inputs_.at(i); - if (queue.empty()) + for (uint32_t i = 0, n = num_banks_; i < n; ++i) { + MemReq mem_req; + if (!simobject_->MemReqPorts.at(i).read(&mem_req)) continue; - auto& entry = queue.front(); - if (!entry.write) { + if (!mem_req.write) { MemRsp mem_rsp; - mem_rsp.tag = entry.tag; + mem_rsp.tag = mem_req.tag; simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); } - queue.pop(); } } }; @@ -45,7 +39,7 @@ MemSim::MemSim(const SimContext& ctx, uint32_t latency) : SimObject(ctx, "MemSim") , impl_(new Impl(this, num_banks, latency)) - , MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest}) + , MemReqPorts(num_banks, this) , MemRspPorts(num_banks, this) {} diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index 82735c2a..b5937b29 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -10,14 +10,19 @@ namespace vortex { struct pipeline_state_t { - //-- + //-- + uint64_t id; + + //-- + int cid; int wid; ThreadMask tmask; Word PC; //-- bool stall_warp; - int rdest_type; + bool wb; + RegType rdest_type; int rdest; RegMask used_iregs; RegMask used_fregs; @@ -30,10 +35,7 @@ struct 
pipeline_state_t { //-- union { struct { - uint8_t load : 1; - uint8_t store: 1; - uint8_t fence : 1; - uint8_t prefetch: 1; + LsuType type; } lsu; struct { AluType type; @@ -49,8 +51,37 @@ struct pipeline_state_t { // stats uint64_t icache_latency; uint64_t dcache_latency; + + void clear() { + cid = 0; + wid = 0; + tmask.reset(); + PC = 0; + stall_warp = false; + wb = false; + rdest = 0; + rdest_type = RegType::None; + used_iregs.reset(); + used_fregs.reset(); + used_vregs.reset(); + exe_type = ExeType::NOP; + mem_addrs.clear(); + icache_latency = 0; + dcache_latency = 0; + } }; +inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { + os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC; + os << ", wb=" << state.wb; + if (state.wb) { + os << ", rd=" << state.rdest_type << std::dec << state.rdest; + } + os << ", ex=" << state.exe_type; + os << " (#" << std::dec << state.id << ")"; + return os; +} + class PipelineStage : public Queue { protected: const char* name_; @@ -62,15 +93,4 @@ public: {} }; -inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { - os << "stall_warp=" << state.stall_warp; - os << ", wid=" << state.wid; - os << ", PC=" << std::hex << state.PC; - os << ", used_iregs=" << state.used_iregs; - os << ", used_fregs=" << state.used_fregs; - os << ", used_vregs=" << state.used_vregs; - os << std::endl; - return os; -} - } \ No newline at end of file diff --git a/sim/simX/processor.cpp b/sim/simX/processor.cpp new file mode 100644 index 00000000..be5cd4f4 --- /dev/null +++ b/sim/simX/processor.cpp @@ -0,0 +1,141 @@ +#include "processor.h" +#include "constants.h" + +using namespace vortex; + +Processor::Processor(const ArchDef& arch) + : cores_(arch.num_cores()) + , l2caches_(NUM_CLUSTERS) + , l2_mem_switches_(NUM_CLUSTERS) +{ + uint32_t num_cores = arch.num_cores(); + uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; + + // create cores + for (uint32_t i = 
0; i < num_cores; ++i) { + cores_.at(i) = Core::Create(arch, i); + } + + // connect memory sub-systen + memsim_ = MemSim::Create(1, MEM_LATENCY); + std::vector*> mem_req_ports(1); + std::vector*> mem_rsp_ports(1); + mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); + mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); + + if (L3_ENABLE) { + l3cache_ = Cache::Create("l3cache", CacheConfig{ + log2ceil(L3_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L3_NUM_BANKS, // number of banks + L3_NUM_PORTS, // number of ports + NUM_CLUSTERS, // request size + true, // write-throught + 0, // victim size + L3_MSHR_SIZE, // mshr + 2, // pipeline latency + } + ); + + mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); + l3cache_->MemReqPort.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); + mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); + } + } else if (NUM_CLUSTERS > 1) { + l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); + mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); + l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); + mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); + } + } + + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + if (L2_ENABLE) { + auto& l2cache = l2caches_.at(i); + l2cache = Cache::Create("l2cache", CacheConfig{ + log2ceil(L2_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L2_NUM_BANKS, // number of banks + L2_NUM_PORTS, // number of ports + NUM_CORES, // request size + true, // write-throught + 0, // victim size + L2_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + 
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); + l2cache->MemReqPort.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + } + } else if (cores_per_cluster > 1) { + auto& l2_mem_switch = l2_mem_switches_.at(i); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + } + } + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + auto& core = cores_.at((i * cores_per_cluster) + j); // cluster-major core index + mem_rsp_ports.at(j)->bind(&core->MemRspPort); // same per-core index as the request side + core->MemReqPort.bind(mem_req_ports.at(j)); + } + } +} + +void Processor::attach_ram(RAM* ram) { + for (auto core : cores_) { + core->attach_ram(ram); + } +} + +Processor::~Processor() {} + +int Processor::run() { + bool running; + int exitcode = 0; + do { + SimPlatform::instance().step(); + + running = false; + for (auto& core : cores_) { + if (core->running()) { + running = true; + } + if (core->check_ebreak()) { + exitcode = core->getIRegValue(3); + running = false; + break; + } + } + } while (running); + + return exitcode; +} \ No newline at end of file diff --git a/sim/simX/processor.h b/sim/simX/processor.h index 50671953..e41fd740 100644 --- a/sim/simX/processor.h +++ b/sim/simX/processor.h @@ -1,189 +1,27 @@ #pragma once -#include "constants.h" -#include "debug.h" -#include "types.h" #include "core.h" namespace vortex { class Processor { +public: + typedef std::shared_ptr Ptr; + + Processor(const ArchDef& arch); + ~Processor(); + + void
attach_ram(RAM* mem); + + int run(); + private: - ArchDef arch_; - Decoder decoder_; - MemoryUnit mu_; - RAM ram_; std::vector cores_; std::vector l2caches_; std::vector::Ptr> l2_mem_switches_; Cache::Ptr l3cache_; Switch::Ptr l3_mem_switch_; MemSim::Ptr memsim_; - -public: - Processor(const ArchDef& arch) - : arch_(arch) - , decoder_(arch) - , mu_(0, arch.wsize(), true) - , ram_((1<<12), (1<<20)) - , cores_(arch.num_cores()) - , l2caches_(NUM_CLUSTERS) - , l2_mem_switches_(NUM_CLUSTERS) - { - uint32_t num_cores = arch.num_cores(); - uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; - - // bind RAM to memory unit - mu_.attach(ram_, 0, 0xFFFFFFFF); - - // create cores - for (uint32_t i = 0; i < num_cores; ++i) { - cores_.at(i) = Core::Create(arch, decoder_, mu_, i); - } - - // connect memory sub-systen - memsim_ = MemSim::Create(1, MEM_LATENCY); - std::vector*> mem_req_ports(1); - std::vector*> mem_rsp_ports(1); - mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); - mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); - - if (L3_ENABLE) { - l3cache_ = Cache::Create("l3cache", CacheConfig{ - log2ceil(L3_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B - 2, // W - 0, // A - 32, // address bits - L3_NUM_BANKS, // number of banks - L3_NUM_PORTS, // number of ports - NUM_CLUSTERS, // request size - true, // write-throught - 0, // victim size - L3_MSHR_SIZE, // mshr - 2, // pipeline latency - }); - mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); - l3cache_->MemReqPort.bind(mem_req_ports.at(0)); - - mem_req_ports.resize(NUM_CLUSTERS); - mem_rsp_ports.resize(NUM_CLUSTERS); - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); - mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); - } - } else if (NUM_CLUSTERS > 1) { - l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); - mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); - l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); - - 
mem_req_ports.resize(NUM_CLUSTERS); - mem_rsp_ports.resize(NUM_CLUSTERS); - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); - mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); - } - } - - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - if (L2_ENABLE) { - auto& l2cache = l2caches_.at(i); - l2cache = Cache::Create("l2cache", CacheConfig{ - log2ceil(L2_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B - 2, // W - 0, // A - 32, // address bits - L2_NUM_BANKS, // number of banks - L2_NUM_PORTS, // number of ports - NUM_CORES, // request size - true, // write-throught - 0, // victim size - L2_MSHR_SIZE, // mshr - 2, // pipeline latency - }); - mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); - l2cache->MemReqPort.bind(mem_req_ports.at(i)); - - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); - mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); - } - } else if (cores_per_cluster > 1) { - auto& l2_mem_switch = l2_mem_switches_.at(i); - l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); - mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); - l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); - - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); - mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); - } - } - - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - auto& core = cores_.at((i * NUM_CLUSTERS) + j); - mem_rsp_ports.at(i)->bind(&core->MemRspPort); - core->MemReqPort.bind(mem_req_ports.at(j)); - } - } - } - - ~Processor() {} - - int run(const std::string& program, bool riscv_test, bool /*showStats*/) { - { - std::string program_ext(fileExtension(program.c_str())); - if (program_ext == "bin") { - 
ram_.loadBinImage(program.c_str(), STARTUP_ADDR); - } else if (program_ext == "hex") { - ram_.loadHexImage(program.c_str()); - } else { - std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; - return -1; - } - } - - bool running; - int exitcode = 0; - do { - SimPlatform::instance().step(); - - running = false; - for (auto& core : cores_) { - if (core->running()) { - running = true; - } - if (core->check_ebreak()) { - exitcode = core->getIRegValue(3); - running = false; - break; - } - } - } while (running); - - // get error status - - if (riscv_test) { - if (1 == exitcode) { - std::cout << "Passed." << std::endl; - exitcode = 0; - } else { - std::cout << "Failed." << std::endl; - } - } else { - if (exitcode != 0) { - std::cout << "*** error: exitcode=" << exitcode << std::endl; - } - } - - return exitcode; - } - }; } \ No newline at end of file diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h index 0e0e0577..46bf3bdc 100644 --- a/sim/simX/scoreboard.h +++ b/sim/simX/scoreboard.h @@ -10,6 +10,7 @@ private: std::vector in_use_iregs_; std::vector in_use_fregs_; std::vector in_use_vregs_; + std::unordered_map owners_; public: Scoreboard(const ArchDef &arch) @@ -29,42 +30,87 @@ public: || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0 || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0; } + + std::vector owners(const pipeline_state_t& state) const { + std::vector out; + { + uint32_t r = 0; + auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid); + while (used_iregs.any()) { + if (used_iregs.test(0)) { + uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer; + out.push_back(owners_.at(tag)); + } + used_iregs >>= 1; + ++r; + } + } + { + uint32_t r = 0; + auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid); + while (used_fregs.any()) { + if (used_fregs.test(0)) { + uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float; + out.push_back(owners_.at(tag)); + } + used_fregs >>= 1; + 
++r; + } + } + { + uint32_t r = 0; + auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid); + while (used_vregs.any()) { + if (used_vregs.test(0)) { + uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector; + out.push_back(owners_.at(tag)); + } + used_vregs >>= 1; + ++r; + } + } + return out; // plain return of the local: allows NRVO / implicit move + } void reserve(const pipeline_state_t& state) { - if (!state.rdest) - return; - + if (!state.wb) + return; switch (state.rdest_type) { - case 1: + case RegType::Integer: in_use_iregs_.at(state.wid).set(state.rdest); break; - case 2: + case RegType::Float: in_use_fregs_.at(state.wid).set(state.rdest); break; - case 3: + case RegType::Vector: in_use_vregs_.at(state.wid).set(state.rdest); break; default: break; - } + } + uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + assert(owners_.count(tag) == 0); + owners_[tag] = state.id; } void release(const pipeline_state_t& state) { - if (!state.rdest) - return; + if (!state.wb) + return; switch (state.rdest_type) { - case 1: + case RegType::Integer: in_use_iregs_.at(state.wid).reset(state.rdest); break; - case 2: + case RegType::Float: in_use_fregs_.at(state.wid).reset(state.rdest); break; - case 3: + case RegType::Vector: in_use_vregs_.at(state.wid).reset(state.rdest); break; default: break; } + uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + owners_.erase(tag); } }; diff --git a/sim/simX/types.h b/sim/simX/types.h index 3dabfe3e..f53c3754 100644 --- a/sim/simX/types.h +++ b/sim/simX/types.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -20,7 +21,25 @@ typedef std::bitset<32> RegMask; typedef std::bitset<32> ThreadMask; typedef std::bitset<32> WarpMask; +enum class RegType { + None, + Integer, + Float, + Vector +}; + +inline std::ostream &operator<<(std::ostream &os, const RegType& type) { + switch (type) { + case RegType::None: break; + case RegType::Integer: os << "r"; break; + case RegType::Float: os <<
"fr"; break; + case RegType::Vector: os << "vr"; break; + } + return os; +} + enum class ExeType { + NOP, ALU, LSU, CSR, @@ -29,6 +48,19 @@ enum class ExeType { MAX, }; +inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { + switch (type) { + case ExeType::NOP: os << "NOP"; break; + case ExeType::ALU: os << "ALU"; break; + case ExeType::LSU: os << "LSU"; break; + case ExeType::CSR: os << "CSR"; break; + case ExeType::FPU: os << "FPU"; break; + case ExeType::GPU: os << "GPU"; break; + case ExeType::MAX: break; + } + return os; +} + enum class AluType { ARITH, BRANCH, @@ -36,6 +68,33 @@ enum class AluType { IDIV, }; +inline std::ostream &operator<<(std::ostream &os, const AluType& type) { + switch (type) { + case AluType::ARITH: os << "ARITH"; break; + case AluType::BRANCH: os << "BRANCH"; break; + case AluType::IMUL: os << "IMUL"; break; + case AluType::IDIV: os << "IDIV"; break; + } + return os; +} + +enum class LsuType { + LOAD, + STORE, + FENCE, + PREFETCH, +}; + +inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { + switch (type) { + case LsuType::LOAD: os << "LOAD"; break; + case LsuType::STORE: os << "STORE"; break; + case LsuType::FENCE: os << "FENCE"; break; + case LsuType::PREFETCH: os << "PREFETCH"; break; + } + return os; +} + enum class FpuType { FNCP, FMA, @@ -44,6 +103,17 @@ enum class FpuType { FCVT, }; +inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { + switch (type) { + case FpuType::FNCP: os << "FNCP"; break; + case FpuType::FMA: os << "FMA"; break; + case FpuType::FDIV: os << "FDIV"; break; + case FpuType::FSQRT: os << "FSQRT"; break; + case FpuType::FCVT: os << "FCVT"; break; + } + return os; +} + enum class GpuType { TMC, WSPAWN, @@ -53,11 +123,31 @@ enum class GpuType { TEX, }; +inline std::ostream &operator<<(std::ostream &os, const GpuType& type) { + switch (type) { + case GpuType::TMC: os << "TMC"; break; + case GpuType::WSPAWN: os << "WSPAWN"; break; + case GpuType::SPLIT: os 
<< "SPLIT"; break; + case GpuType::JOIN: os << "JOIN"; break; + case GpuType::BAR: os << "BAR"; break; + case GpuType::TEX: os << "TEX"; break; + } + return os; +} + enum class ArbiterType { Priority, RoundRobin }; +inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { + switch (type) { + case ArbiterType::Priority: os << "Priority"; break; + case ArbiterType::RoundRobin: os << "RoundRobin"; break; + } + return os; +} + /////////////////////////////////////////////////////////////////////////////// template @@ -65,6 +155,8 @@ class Queue { protected: std::queue queue_; + uint32_t count; + public: Queue() {} @@ -77,6 +169,7 @@ public: } void push(const T& value) { + ++count; queue_.push(value); } @@ -141,6 +234,7 @@ public: return i; } } + assert(false); return -1; } @@ -148,6 +242,7 @@ public: auto& entry = entries_.at(index); assert(entry.first); entry.first = false; + --capacity_; } void remove(uint32_t index, T* value) { @@ -155,6 +250,7 @@ public: assert(entry.first); *value = entry.second; entry.first = false; + --capacity_; } }; @@ -163,29 +259,21 @@ public: template class Switch : public SimObject> { private: - struct req_t { + struct req_batch_t { std::vector data; std::bitset valid; - req_t() {} - req_t(uint32_t size) : data(size) {} + req_batch_t() {} + req_batch_t(uint32_t size) + : data(size) + , valid(0) + {} }; - void handleIncomingRequest(const Req& req, uint32_t port_id) { - cur_req_.data.at(port_id) = req; - cur_req_.valid.set(port_id); - } - - void handleIncomingResponse(const Rsp& rsp, uint32_t) { - rsps_.push(rsp); - } - ArbiterType type_; - std::queue reqs_; - std::queue rsps_; - req_t cur_req_; + std::queue reqq_; uint32_t delay_; uint32_t cursor_; - std::unordered_map addr_table_; + uint32_t tag_shift_; public: Switch( @@ -197,12 +285,12 @@ public: ) : SimObject>(ctx, name) , type_(type) - , cur_req_(num_inputs) , delay_(delay) , cursor_(0) - , ReqIn(num_inputs, {this, this, &Switch::handleIncomingRequest}) + , 
tag_shift_(log2ceil(num_inputs)) + , ReqIn(num_inputs, this) , ReqOut(this) - , RspIn(this, this, &Switch::handleIncomingResponse) + , RspIn(this) , RspOut(num_inputs, this) { assert(delay_ != 0); @@ -210,36 +298,52 @@ public: } void step(uint64_t /*cycle*/) { - if (cur_req_.valid.any()) { - reqs_.push(cur_req_); - cur_req_.valid.reset(); - } - - while (!reqs_.empty()) { - auto& entry = reqs_.front(); - bool found = false; - for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) { - auto j = (cursor_ + i) % n; - if (entry.valid.test(j)) { - auto& req = entry.data.at(j); - addr_table_[req.tag] = j; - ReqOut.send(req, delay_); - entry.valid.reset(j); - this->update_cursor(j); - found = true; - break; + // process incoming requests + { + req_batch_t req_batch(ReqIn.size()); + for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) { + Req req; + if (ReqIn.at(i).read(&req)) { + req_batch.data.at(i) = req; + req_batch.valid.set(i); } } - if (found) - break; - reqs_.pop(); + if (req_batch.valid.any()) { + reqq_.push(req_batch); + } + } + + // apply arbitration + if (!reqq_.empty()) { + auto& req_batch = reqq_.front(); + for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) { + auto j = (cursor_ + i) % n; + if (req_batch.valid.test(j)) { + auto& req = req_batch.data.at(j); + if (tag_shift_) { + req.tag = (req.tag << tag_shift_) | j; + } + ReqOut.send(req, delay_); + req_batch.valid.reset(j); + this->update_cursor(j); + if (!req_batch.valid.any()) + reqq_.pop(); // pop when empty + break; + } + } } - if (!rsps_.empty()) { - auto& rsp = rsps_.front(); - auto port_id = addr_table_.at(rsp.tag); - RspOut.at(port_id).send(rsp, 1); - rsps_.pop(); + // process incoming responses + { + Rsp rsp; + if (RspIn.read(&rsp)) { + uint32_t port_id = 0; + if (tag_shift_) { + port_id = rsp.tag & ((1 << tag_shift_)-1); + rsp.tag >>= tag_shift_; + } + RspOut.at(port_id).send(rsp, 1); + } } } diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index 0c989d0c..89b9cc39 100644 ---
a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -24,30 +24,34 @@ Warp::Warp(Core *core, Word id) void Warp::eval(pipeline_state_t *pipeline_state) { assert(tmask_.any()); - DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask="); + DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) DPN(2, tmask_.test(n-i-1)); - DPN(2, "\n"); + DPN(2, ", PC=0x" << std::hex << PC_ << std::endl); /* Fetch and decode. */ - Word fetched = core_->icache_fetch(PC_); - auto instr = core_->decoder().decode(fetched, PC_); + Word instr_code = core_->icache_read(PC_, sizeof(Word)); + auto instr = core_->decoder().decode(instr_code); + if (!instr) { + std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl; + std::abort(); + } + + DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); // Update state + pipeline_state->cid = core_->id(); pipeline_state->wid = id_; pipeline_state->PC = PC_; pipeline_state->tmask = tmask_; pipeline_state->rdest = instr->getRDest(); pipeline_state->rdest_type = instr->getRDType(); - pipeline_state->used_iregs.reset(); - pipeline_state->used_fregs.reset(); - pipeline_state->used_vregs.reset(); - + // Execute this->execute(*instr, pipeline_state); - D(4, "Register state:"); + DP(4, "Register state:"); for (int i = 0; i < core_->arch().num_regs(); ++i) { DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); for (int j = 0; j < core_->arch().num_threads(); ++j) { diff --git a/sim/vlsim/opae_sim.cpp b/sim/vlsim/opae_sim.cpp index ced1e233..5da617b5 100644 --- a/sim/vlsim/opae_sim.cpp +++ b/sim/vlsim/opae_sim.cpp @@ -44,6 +44,8 @@ #define VERILATOR_RESET_VALUE 2 #endif +#define RAM_PAGE_SIZE 4096 + using namespace vortex; static uint64_t timestamp = 0; @@ -136,7 +138,7 @@ opae_sim::opae_sim() : stop_(false) , host_buffer_ids_(0) { vl_obj_ = new VL_OBJ(); - ram_ = new RAM((1<<12), (1<<20)); + 
ram_ = new RAM(RAM_PAGE_SIZE); // reset the device this->reset();