From c2721fd545a74e7e3b29f1a79c8e0bfd2c8fa8f3 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 13 Nov 2021 01:41:12 -0500 Subject: [PATCH] SimX timing simulation --- sim/common/simobject.h | 427 +++++++ sim/common/util.h | 76 +- sim/simX/Makefile | 2 +- sim/simX/archdef.h | 45 +- sim/simX/cache.cpp | 497 ++++++++ sim/simX/cache.h | 40 + sim/simX/constants.h | 21 + sim/simX/core.cpp | 392 ++++--- sim/simX/core.h | 96 +- sim/simX/decode.cpp | 12 +- sim/simX/decode.h | 2 +- sim/simX/execute.cpp | 2517 ++++++++++++++++++++++------------------ sim/simX/exeunit.cpp | 152 +++ sim/simX/exeunit.h | 103 ++ sim/simX/ibuffer.h | 39 + sim/simX/instr.h | 7 +- sim/simX/main.cpp | 76 +- sim/simX/memsim.cpp | 58 + sim/simX/memsim.h | 35 + sim/simX/pipeline.cpp | 63 - sim/simX/pipeline.h | 86 +- sim/simX/processor.h | 189 +++ sim/simX/scoreboard.h | 71 ++ sim/simX/types.h | 240 +++- sim/simX/warp.cpp | 69 +- sim/simX/warp.h | 14 +- 26 files changed, 3690 insertions(+), 1639 deletions(-) create mode 100644 sim/common/simobject.h create mode 100644 sim/simX/cache.cpp create mode 100644 sim/simX/cache.h create mode 100644 sim/simX/constants.h create mode 100644 sim/simX/exeunit.cpp create mode 100644 sim/simX/exeunit.h create mode 100644 sim/simX/ibuffer.h create mode 100644 sim/simX/memsim.cpp create mode 100644 sim/simX/memsim.h delete mode 100644 sim/simX/pipeline.cpp create mode 100644 sim/simX/processor.h create mode 100644 sim/simX/scoreboard.h diff --git a/sim/common/simobject.h b/sim/common/simobject.h new file mode 100644 index 00000000..68bccc87 --- /dev/null +++ b/sim/common/simobject.h @@ -0,0 +1,427 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace vortex { + +class SimObjectBase; + +class SimEventBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimEventBase() {} + + virtual void fire() const = 0; + + bool step() { + return (0 == --delay_); + } + +protected: + SimEventBase(uint64_t delay) : delay_(delay) {} + + 
uint64_t delay_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimSimpleEvent : public SimEventBase { +public: + typedef std::function Func; + + template + static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) { + return std::make_shared(func, pkt, delay); + } + + SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay) + : SimEventBase(delay) + , func_(func) + , pkt_(pkt) + {} + + void fire() const override { + func_(pkt_); + } + +protected: + Func func_; + Pkt pkt_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimPortEvent : public SimEventBase { +public: + typedef std::function Func; + + template + static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) { + return std::make_shared(func, pkt, port_id, delay); + } + + SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) + : SimEventBase(delay) + , func_(func) + , pkt_(pkt) + , port_id_(port_id) + {} + + void fire() const override { + func_(pkt_, port_id_); + } + +private: + Func func_; + Pkt pkt_; + uint32_t port_id_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimPortBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimPortBase() {} + + SimObjectBase* module() const { + return module_; + } + + uint32_t port_id() const { + return port_id_; + } + + SimPortBase* peer() const { + return peer_; + } + + bool connected() const { + return (peer_ != nullptr); + } + + bool is_slave() const { + return is_slave_; + } + +protected: + + SimPortBase(SimObjectBase* module, bool is_slave); + + void connect(SimPortBase* peer) { + assert(peer_ == nullptr); + peer_ = peer; + } + + void disconnect() { + assert(peer_ == nullptr); + peer_ = nullptr; + } + + SimObjectBase* module_; + uint32_t port_id_; + bool is_slave_; + SimPortBase* peer_; + + template friend class MasterPort; 
+}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SlavePort : public SimPortBase { +public: + typedef std::shared_ptr> Ptr; + typedef std::function Func; + + static Ptr Create(SimObjectBase* module, const Func& func) { + return std::make_shared>(module, func); + } + + template + static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) { + return std::make_shared>(module, obj, entry); + } + + SlavePort(SimObjectBase* module, const Func& func) + : SimPortBase(module, true) + , func_(func) + {} + + template + SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) + : SimPortBase(module, true) + , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2)) + {} + + SlavePort(SimObjectBase* module, SlavePort* peer) + : SimPortBase(module, false) + { + this->connect(peer); + } + + void send(const Pkt& pkt, uint64_t delay) const; + + const Func& func() const { + return func_; + } + +protected: + SlavePort& operator=(const SlavePort&); + Func func_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class MasterPort : public SimPortBase { +public: + typedef std::shared_ptr> Ptr; + typedef std::function Func; + + static Ptr Create() { + return std::make_shared>(module); + } + + MasterPort(SimObjectBase* module) : SimPortBase(module, false) {} + + MasterPort(SimObjectBase* module, MasterPort* peer) + : SimPortBase(module, false) + { + peer->connect(this); + } + + void bind(SlavePort* peer) { + this->connect(peer); + } + + void unbind() { + peer_->disconnect(); + this->disconnect(); + } + + void send(const Pkt& pkt, uint64_t delay) const { + assert(peer_ != nullptr); + if (peer_->is_slave()) { + auto slave = reinterpret_cast*>(peer_); + slave->send(pkt, delay); + } else { + auto master = reinterpret_cast*>(peer_); + master->send(pkt, delay); + } + } + +private: + MasterPort& operator=(const 
MasterPort&); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimContext; + +class SimObjectBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimObjectBase() {} + + template + void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay); + + virtual void step(uint64_t cycle) = 0; + + const std::string& name() const { + return name_; + } + +protected: + + SimObjectBase(const SimContext& ctx, const char* name); + + uint32_t allocate_port(SimPortBase* port) { + uint32_t id = ports_.size(); + ports_.push_back(port); + return id; + } + +private: + std::string name_; + std::vector ports_; + + friend class SimPlatform; + friend class SimPortBase; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimObject : public SimObjectBase { +public: + typedef std::shared_ptr Ptr; + + template + static Ptr Create(Args&&... args); + +protected: + + SimObject(const SimContext& ctx, const char* name) : SimObjectBase(ctx, name) {} + + void step(uint64_t cycle) override { + this->impl().step(cycle); + } + +private: + + const Impl& impl() const { + return static_cast(*this); + } + + Impl& impl() { + return static_cast(*this); + } +}; + +class SimContext { +private: + SimContext() {} + template template + friend typename SimObject::Ptr SimObject::Create(Args&&... 
args); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimPlatform { +public: + static SimPlatform& instance() { + static SimPlatform s_inst; + return s_inst; + } + + bool initialize() { + //-- + return true; + } + + void finalize() { + instance().clear(); + } + + void register_object(const SimObjectBase::Ptr& obj) { + objects_.push_back(obj); + } + + template + void schedule(const typename SimSimpleEvent::Func& callback, + const Pkt& pkt, + uint64_t delay) { + auto evt = SimSimpleEvent::Create(callback, pkt, delay); + assert(delay != 0); + events_.emplace_back(evt); + } + + template + void schedule(const typename SimPortEvent::Func& callback, + const Pkt& pkt, + uint32_t port_id, + uint64_t delay) { + auto evt = SimPortEvent::Create(callback, pkt, port_id, delay); + assert(delay != 0); + events_.emplace_back(evt); + } + + void step() { + // evaluate events + auto evt_it = events_.begin(); + auto evt_it_end = events_.end(); + while (evt_it != evt_it_end) { + auto& event = *evt_it; + if (event->step()) { + event->fire(); + evt_it = events_.erase(evt_it); + } else { + ++evt_it; + } + } + // evaluate components + for (auto& object : objects_) { + object->step(cycles_); + } + // advance clock + ++cycles_; + } + + uint64_t cycles() const { + return cycles_; + } + +private: + + SimPlatform() : cycles_(0) {} + + virtual ~SimPlatform() { + this->clear(); + } + + void clear() { + objects_.clear(); + events_.clear(); + } + + std::vector objects_; + std::list events_; + uint64_t cycles_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave) + : module_(module) + , port_id_(module->allocate_port(this)) + , is_slave_(is_slave) + , peer_(nullptr) +{} + +inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) + : name_(name) +{} + +template +template +typename SimObject::Ptr SimObject::Create(Args&&... 
args) { + auto obj = std::make_shared(SimContext{}, std::forward(args)...); + SimPlatform::instance().register_object(obj); + return obj; +} + +template +void SlavePort::send(const Pkt& pkt, uint64_t delay) const { + if (func_) { + SimPlatform::instance().schedule(func_, pkt, port_id_, delay); + } else { + assert(peer_ != nullptr); + if (peer_->is_slave()) { + auto slave = reinterpret_cast*>(peer_); + slave->send(pkt, delay); + } else { + auto master = reinterpret_cast*>(peer_); + master->send(pkt, delay); + } + } +} + +template +void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) { + auto callback = std::bind(entry, obj, std::placeholders::_1); + SimPlatform::instance().schedule(callback, pkt, delay); +} + +} \ No newline at end of file diff --git a/sim/common/util.h b/sim/common/util.h index dbaeb5fa..668f3e26 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include template @@ -8,24 +9,83 @@ void unused(Args&&...) {} #define __unused(...) unused(__VA_ARGS__) -constexpr bool ispow2(uint64_t value) { +constexpr uint32_t count_leading_zeros(uint32_t value) { + return value ? __builtin_clz(value) : 32; +} + +constexpr uint32_t count_trailing_zeros(uint32_t value) { + return value ? 
__builtin_ctz(value) : 32; +} + +constexpr bool ispow2(uint32_t value) { return value && !(value & (value - 1)); } -constexpr unsigned log2ceil(uint32_t value) { - return 32 - __builtin_clz(value - 1); +constexpr uint32_t log2ceil(uint32_t value) { + return 32 - count_leading_zeros(value - 1); } -inline uint64_t align_size(uint64_t size, uint64_t alignment) { +inline unsigned log2up(uint32_t value) { + return std::max(1, log2ceil(value)); +} + +constexpr unsigned log2floor(uint32_t value) { + return 31 - count_leading_zeros(value); +} + +constexpr unsigned ceil2(uint32_t value) { + return 32 - count_leading_zeros(value); +} + +inline uint64_t bit_clr(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits & ~(1ull << index); +} + +inline uint64_t bit_set(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits | (1ull << index); +} + +inline bool bit_get(uint64_t bits, uint32_t index) { + assert(index <= 63); + return (bits >> index) & 0x1; +} + +inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift; + return bits & ~mask; +} + +inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t dirty = (value << (shift + start)) >> shift; + return bit_clrw(bits, start, end) | dirty; +} + +inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + return (bits << shift) >> (shift + start); +} + +inline uint64_t aligned_size(uint64_t size, uint32_t alignment) { assert(0 == (alignment & (alignment - 1))); return (size + alignment - 1) & ~(alignment - 1); } // Apply integer sign extension -inline uint32_t signExt(uint32_t w, uint32_t bit, uint32_t mask) { - if (w >> (bit - 1)) - w |= ~mask; - 
return w; +inline uint32_t sext32(uint32_t word, uint32_t width) { + assert(width > 1); + assert(width <= 32); + uint32_t mask = (1 << width) - 1; + return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word; } // return file extension diff --git a/sim/simX/Makefile b/sim/simX/Makefile index 29b53fc3..e42464c6 100644 --- a/sim/simX/Makefile +++ b/sim/simX/Makefile @@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a TOP = vx_cache_sim SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp pipeline.cpp warp.cpp core.cpp decode.cpp execute.cpp main.cpp +SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) VPATH := $(sort $(dir $(SRCS))) diff --git a/sim/simX/archdef.h b/sim/simX/archdef.h index 75248c1a..c6728831 100644 --- a/sim/simX/archdef.h +++ b/sim/simX/archdef.h @@ -9,21 +9,31 @@ namespace vortex { -class ArchDef { +class ArchDef { +private: + int num_cores_; + int num_warps_; + int num_threads_; + int wsize_; + int vsize_; + int num_regs_; + int num_csrs_; + int num_barriers_; + public: - ArchDef(const std::string &/*arch*/, + ArchDef(const std::string& /*arch*/, int num_cores, int num_warps, - int num_threads) { - wsize_ = 4; - vsize_ = 16; - num_regs_ = 32; - num_csrs_ = 4096; - num_barriers_= NUM_BARRIERS; - num_cores_ = num_cores; - num_warps_ = num_warps; - num_threads_ = num_threads; - } + int num_threads) + : num_cores_(num_cores) + , num_warps_(num_warps) + , num_threads_(num_threads) + , wsize_(4) + , vsize_(16) + , num_regs_(32) + , num_csrs_(4096) + , num_barriers_(NUM_BARRIERS) + {} int wsize() const { return wsize_; @@ -56,17 +66,6 @@ public: int num_cores() const { return num_cores_; } - -private: - - int wsize_; - int vsize_; - int num_regs_; - int num_csrs_; - int num_barriers_; - int num_threads_; - int num_warps_; - int num_cores_; }; } \ No newline at end of file diff --git 
a/sim/simX/cache.cpp b/sim/simX/cache.cpp new file mode 100644 index 00000000..f139cb43 --- /dev/null +++ b/sim/simX/cache.cpp @@ -0,0 +1,497 @@ +#include "cache.h" +#include "debug.h" +#include +#include +#include +#include +#include + +using namespace vortex; + +struct params_t { + uint32_t sets_per_bank; + uint32_t blocks_per_set; + uint32_t words_per_block; + + uint32_t word_select_addr_start; + uint32_t word_select_addr_end; + + uint32_t bank_select_addr_start; + uint32_t bank_select_addr_end; + + uint32_t set_select_addr_start; + uint32_t set_select_addr_end; + + uint32_t tag_select_addr_start; + uint32_t tag_select_addr_end; + + params_t(const CacheConfig& config) { + uint32_t bank_bits = log2ceil(config.num_banks); + uint32_t offset_bits = config.B - config.W; + uint32_t log2_bank_size = config.C - bank_bits; + uint32_t index_bits = log2_bank_size - (config.B << config.A); + assert(log2_bank_size >= config.B); + + + this->words_per_block = 1 << offset_bits; + this->blocks_per_set = 1 << config.A; + this->sets_per_bank = 1 << index_bits; + + assert(config.ports_per_bank <= this->words_per_block); + + // Word select + this->word_select_addr_start = config.W; + this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1); + + // Bank select + this->bank_select_addr_start = (1+this->word_select_addr_end); + this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1); + + // Set select + this->set_select_addr_start = (1+this->bank_select_addr_end); + this->set_select_addr_end = (this->set_select_addr_start+index_bits-1); + + // Tag select + this->tag_select_addr_start = (1+this->set_select_addr_end); + this->tag_select_addr_end = (config.addr_width-1); + } + + uint32_t addr_bank_id(uint64_t word_addr) const { + if (bank_select_addr_end >= bank_select_addr_start) + return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end); + else + return 0; + } + + uint32_t addr_set_id(uint64_t word_addr) const { + if 
(set_select_addr_end >= set_select_addr_start) + return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end); + else + return 0; + } + + uint64_t addr_tag(uint64_t word_addr) const { + if (tag_select_addr_end >= tag_select_addr_start) + return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end); + else + return 0; + } + + uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const { + uint64_t addr(0); + if (bank_select_addr_end >= bank_select_addr_start) + addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id); + if (set_select_addr_end >= set_select_addr_start) + addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id); + if (tag_select_addr_end >= tag_select_addr_start) + addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag); + return addr; + } +}; + +struct block_t { + bool valid; + bool dirty; + uint64_t tag; + uint32_t lru_ctr; +}; + +struct set_t { + std::vector blocks; + set_t(uint32_t size) : blocks(size) {} +}; + +struct bank_req_info_t { + bool valid; + uint32_t req_id; + uint32_t req_tag; +}; + +struct bank_req_t { + bool valid; + bool write; + bool mshr_replay; + uint64_t tag; + uint32_t set_id; + std::vector infos; + + bank_req_t(uint32_t size) + : valid(false) + , write(false) + , mshr_replay(false) + , tag(0) + , set_id(0) + , infos(size) + {} +}; + +struct mshr_entry_t : public bank_req_t { + uint32_t block_id; + + mshr_entry_t(uint32_t size = 0) + : bank_req_t(size) + , block_id(0) + {} +}; + +class MSHR { +private: + std::vector entries_; + uint32_t capacity_; + +public: + MSHR(uint32_t size) + : entries_(size) + , capacity_(0) + {} + + bool empty() const { + return (0 == capacity_); + } + + bool full() const { + return (capacity_ == entries_.size()); + } + + int lookup(const bank_req_t& bank_req) { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + if (entry.valid + && entry.set_id == bank_req.set_id + && 
entry.tag == bank_req.tag) { + return i; + } + } + return -1; + } + + int allocate(const bank_req_t& bank_req, uint32_t block_id) { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + if (!entry.valid) { + *(bank_req_t*)&entry = bank_req; + entry.valid = true; + entry.mshr_replay = false; + entry.block_id = block_id; + ++capacity_; + return i; + } + } + return -1; + } + + mshr_entry_t& replay(uint32_t id) { + auto& root_entry = entries_.at(id); + assert(root_entry.valid); + // make all related mshr entries for replay + for (auto& entry : entries_) { + if (entry.valid + && entry.set_id == root_entry.set_id + && entry.tag == root_entry.tag) { + entry.mshr_replay = true; + } + } + return root_entry; + } + + bool try_pop(bank_req_t* out) { + for (auto& entry : entries_) { + if (entry.valid && entry.mshr_replay) { + *out = entry; + entry.valid = false; + --capacity_; + return true; + } + } + return false; + } +}; + +struct bank_t { + std::vector sets; + MSHR mshr; + std::queue stall_buffer; + bank_req_t active_req; + + bank_t(const CacheConfig& config, + const params_t& params) + : sets(params.sets_per_bank, params.blocks_per_set) + , mshr(config.mshr_size) + , active_req(config.ports_per_bank) + {} +}; + +/////////////////////////////////////////////////////////////////////////////// + +class Cache::Impl { +private: + Cache* const simobject_; + CacheConfig config_; + params_t params_; + std::vector banks_; + std::vector> core_reqs_; + std::pair mem_rsp_; + std::vector> core_rsps_; + +public: + Impl(Cache* simobject, const CacheConfig& config) + : simobject_(simobject) + , config_(config) + , params_(config) + , banks_(config.num_banks, {config, params_}) + , core_reqs_(config.num_inputs) + , core_rsps_(config.num_inputs) + {} + + void handleMemResponse(const MemRsp& response, uint32_t) { + mem_rsp_ = {true, response}; + } + + void handleCoreRequest(const MemReq& request, uint32_t port_id) { + core_reqs_.at(port_id) = {true, 
request}; + } + + void step(uint64_t /*cycle*/) { + // process core response + for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { + auto& core_rsp = core_rsps_.at(req_id); + if (!core_rsp.empty()) { + simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency); + core_rsp.pop(); + } + } + + for (auto& bank : banks_) { + auto& active_req = bank.active_req; + + // try chedule mshr replay + if (!active_req.valid) { + bank.mshr.try_pop(&active_req); + } + + // try schedule stall replay + if (!active_req.valid + && !bank.stall_buffer.empty()) { + active_req = bank.stall_buffer.front(); + bank.stall_buffer.pop(); + } + } + + // handle memory fills + if (mem_rsp_.first) { + mem_rsp_.first = false; + auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15); + auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31); + this->processMemoryFill(bank_id, mshr_id); + } + + // handle incoming core requests + for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) { + auto& entry = core_reqs_.at(i); + if (!entry.first) + continue; + + entry.first = false; + + auto& core_req = entry.second; + auto bank_id = params_.addr_bank_id(core_req.addr); + auto set_id = params_.addr_set_id(core_req.addr); + auto tag = params_.addr_tag(core_req.addr); + auto port_id = i % config_.ports_per_bank; + + // create abnk request + bank_req_t bank_req(config_.ports_per_bank); + bank_req.valid = true; + bank_req.write = core_req.write; + bank_req.mshr_replay = false; + bank_req.tag = tag; + bank_req.set_id = set_id; + bank_req.infos.at(port_id) = {true, i, core_req.tag}; + + auto& bank = banks_.at(bank_id); + + // check MSHR capacity + if (bank.mshr.full()) { + // add to stall buffer + bank.stall_buffer.emplace(bank_req); + continue; + } + + auto& active_req = bank.active_req; + + // check pending MSHR request + if (active_req.valid + && active_req.mshr_replay) { + // add to stall buffer + bank.stall_buffer.emplace(bank_req); + continue; + } + + // check bank 
conflicts + if (active_req.valid) { + // check port conflict + if (active_req.write != core_req.write + || active_req.set_id != set_id + || active_req.tag != tag + || active_req.infos[port_id].valid) { + // add to stall buffer + bank.stall_buffer.emplace(bank_req); + continue; + } + // update pending request infos + active_req.infos[port_id] = bank_req.infos[port_id]; + } else { + // schedule new request + active_req = bank_req; + } + } + + // process active request + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + this->processBankRequest(bank_id); + } + } + + void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) { + // update block + auto& bank = banks_.at(bank_id); + auto& root_entry = bank.mshr.replay(mshr_id); + auto& set = bank.sets.at(root_entry.set_id); + auto& block = set.blocks.at(root_entry.block_id); + block.valid = true; + block.tag = root_entry.tag; + } + + void processBankRequest(uint32_t bank_id) { + auto& bank = banks_.at(bank_id); + auto& active_req = bank.active_req; + if (!active_req.valid) + return; + + active_req.valid = false; + + auto& set = bank.sets.at(active_req.set_id); + + if (active_req.mshr_replay) { + // send core response + for (auto& info : active_req.infos) { + core_rsps_.at(info.req_id).emplace(info.req_tag); + } + } else { + bool hit = false; + bool found_free_block = false; + int hit_block_id = 0; + int repl_block_id = 0; + uint32_t max_cnt = 0; + + for (int i = 0, n = set.blocks.size(); i < n; ++i) { + auto& block = set.blocks.at(i); + if (block.valid) { + if (block.tag == active_req.tag) { + block.lru_ctr = 0; + hit_block_id = i; + hit = true; + } else { + ++block.lru_ctr; + } + if (max_cnt < block.lru_ctr) { + max_cnt = block.lru_ctr; + repl_block_id = i; + } + } else { + found_free_block = true; + repl_block_id = i; + } + } + + if (hit) { + // + // MISS handling + // + if (active_req.write) { + // handle write hit + auto& hit_block = set.blocks.at(hit_block_id); + if 
(config_.write_through) { + // forward write request to memory + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag); + mem_req.write = true; + mem_req.tag = 0; + simobject_->MemReqPort.send(mem_req, 1); + } else { + // mark block as dirty + hit_block.dirty = true; + } + } + // send core response + for (auto& info : active_req.infos) { + core_rsps_.at(info.req_id).emplace(info.req_tag); + } + } else { + // + // MISS handling + // + if (!found_free_block && !config_.write_through) { + // write back dirty block + auto& repl_block = set.blocks.at(repl_block_id); + if (repl_block.dirty) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag); + mem_req.write = true; + simobject_->MemReqPort.send(mem_req, 1); + } + } + + if (active_req.write && config_.write_through) { + // forward write request to memory + { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); + mem_req.write = true; + mem_req.tag = 0; + simobject_->MemReqPort.send(mem_req, 1); + } + // send core response + for (auto& info : active_req.infos) { + core_rsps_.at(info.req_id).emplace(info.req_tag); + } + } else { + // lookup + int pending = bank.mshr.lookup(active_req); + + // allocate MSHR + int mshr_id = bank.mshr.allocate(active_req, repl_block_id); + + // send fill request + if (pending == -1) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); + mem_req.write = active_req.write; + mem_req.tag = bit_setw(0, 0, 15, bank_id); + mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id); + simobject_->MemReqPort.send(mem_req, 1); + } + } + } + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) + : SimObject(ctx, name) + , impl_(new Impl(this, config)) + , CoreReqPorts(config.num_inputs, {this, impl_, 
&Cache::Impl::handleCoreRequest}) + , CoreRspPorts(config.num_inputs, this) + , MemReqPort(this) + , MemRspPort(this, impl_, &Impl::handleMemResponse) +{} + +Cache::~Cache() { + delete impl_; +} + +void Cache::step(uint64_t cycle) { + impl_->step(cycle); +} \ No newline at end of file diff --git a/sim/simX/cache.h b/sim/simX/cache.h new file mode 100644 index 00000000..1c0c82f6 --- /dev/null +++ b/sim/simX/cache.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include "memsim.h" + +namespace vortex { + +struct CacheConfig { + uint8_t C; // log2 cache size + uint8_t B; // log2 block size + uint8_t W; // log2 word size + uint8_t A; // log2 associativity + uint8_t addr_width; // word address bits + uint8_t num_banks; // number of banks + uint8_t ports_per_bank; // number of ports per bank + uint8_t num_inputs; // number of inputs + bool write_through; // is write-through cache + uint16_t victim_size; // victim cache size + uint16_t mshr_size; // MSHR buffer size + uint8_t latency; // pipeline latency +}; + +class Cache : public SimObject { +private: + class Impl; + Impl* impl_; + +public: + Cache(const SimContext& ctx, const char* name, const CacheConfig& config); + ~Cache(); + + void step(uint64_t cycle); + + std::vector> CoreReqPorts; + std::vector> CoreRspPorts; + MasterPort MemReqPort; + SlavePort MemRspPort; +}; + +} \ No newline at end of file diff --git a/sim/simX/constants.h b/sim/simX/constants.h new file mode 100644 index 00000000..d9171b8d --- /dev/null +++ b/sim/simX/constants.h @@ -0,0 +1,21 @@ +#pragma once + +#include "types.h" + +#ifndef MEM_LATENCY +#define MEM_LATENCY 18 +#endif + +namespace vortex { + +struct Constants { + +static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE; +static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1; + +static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2; +static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2; + +}; + +} \ No newline at end of file diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index 
c68ac854..af0a4441 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -12,34 +12,92 @@ using namespace vortex; -Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) - : id_(id) +Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) + : SimObject(ctx, "Core") + , id_(id) , arch_(arch) , decoder_(decoder) , mem_(mem) , shared_mem_(1, SMEM_SIZE) - , inst_in_schedule_("schedule") - , inst_in_fetch_("fetch") - , inst_in_decode_("decode") - , inst_in_issue_("issue") - , inst_in_execute_("execute") - , inst_in_writeback_("writeback") { - in_use_iregs_.resize(arch.num_warps(), 0); - in_use_fregs_.resize(arch.num_warps(), 0); - in_use_vregs_.reset(); - - csrs_.resize(arch_.num_csrs(), 0); - - fcsrs_.resize(arch_.num_warps(), 0); - - barriers_.resize(arch_.num_barriers(), 0); - - warps_.resize(arch_.num_warps()); + , warps_(arch.num_warps()) + , barriers_(arch.num_barriers(), 0) + , csrs_(arch.num_csrs(), 0) + , fcsrs_(arch.num_warps(), 0) + , ibuffers_(arch.num_warps(), IBUF_SIZE) + , scoreboard_(arch_) + , exe_units_((int)ExeType::MAX) + , icache_(Cache::Create("Icache", CacheConfig{ + log2ceil(ICACHE_SIZE), // C + log2ceil(L1_BLOCK_SIZE),// B + 2, // W + 0, // A + 32, // address bits + 1, // number of banks + 1, // number of ports + 1, // request size + true, // write-throught + 0, // victim size + NUM_WARPS, // mshr + 2, // pipeline latency + })) + , dcache_(Cache::Create("Dcache", CacheConfig{ + log2ceil(DCACHE_SIZE), // C + log2ceil(L1_BLOCK_SIZE),// B + 2, // W + 0, // A + 32, // address bits + DCACHE_NUM_BANKS, // number of banks + DCACHE_NUM_PORTS, // number of ports + (uint8_t)arch.num_threads(), // request size + true, // write-throught + 0, // victim size + DCACHE_MSHR_SIZE, // mshr + 2, // pipeline latency + })) + , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) + , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse) + , dcache_rsp_port_(arch.num_threads(), 
{this, reinterpret_cast(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse}) + , fetch_stage_("fetch") + , decode_stage_("decode") + , issue_stage_("issue") + , execute_stage_("execute") + , commit_stage_("writeback") + , pending_icache_(arch_.num_warps()) + , stalled_warps_(0) + , last_schedule_wid_(0) + , pending_instrs_(0) + , ebreak_(false) + , stats_insts_(0) + , stats_loads_(0) + , stats_stores_(0) + , MemRspPort(this, &l1_mem_switch_->RspIn) + , MemReqPort(this, &l1_mem_switch_->ReqOut) +{ for (int i = 0; i < arch_.num_warps(); ++i) { - warps_[i] = std::make_shared(this, i); + warps_.at(i) = std::make_shared(this, i); } - this->clear(); + // register execute units + exe_units_.at((int)ExeType::ALU) = std::make_shared(this); + exe_units_.at((int)ExeType::LSU) = std::make_shared(this); + exe_units_.at((int)ExeType::CSR) = std::make_shared(this); + exe_units_.at((int)ExeType::FPU) = std::make_shared(this); + exe_units_.at((int)ExeType::GPU) = std::make_shared(this); + + // connect l1 caches + icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_); + for (int i = 0; i < arch_.num_threads(); ++i) { + dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i)); + } + + // connect l1 switch + icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); + dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]); + l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort); + l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort); + + // activate warp0 + warps_.at(0)->setTmask(0, true); } Core::~Core() { @@ -51,194 +109,164 @@ Core::~Core() { } } -void Core::clear() { - for (int w = 0; w < arch_.num_warps(); ++w) { - in_use_iregs_[w].reset(); - in_use_fregs_[w].reset(); - } - stalled_warps_.reset(); - - in_use_vregs_.reset(); - - for (auto& csr : csrs_) { - csr = 0; - } - - for (auto& fcsr : fcsrs_) { - fcsr = 0; - } - - for (auto& barrier : barriers_) { - barrier.reset(); - } - - for (auto warp : warps_) { - warp->clear(); - } - - inst_in_schedule_.clear(); - 
inst_in_fetch_.clear(); - inst_in_decode_.clear(); - inst_in_issue_.clear(); - inst_in_execute_.clear(); - inst_in_writeback_.clear(); - print_bufs_.clear(); - - steps_ = 0; - insts_ = 0; - loads_ = 0; - stores_ = 0; - - inst_in_schedule_.valid = true; - warps_[0]->setTmask(0, true); - - ebreak_ = false; +void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) { + // advance to decode stage + uint32_t wid = response.tag; + pipeline_state_t state; + pending_icache_.remove(wid, &state); + auto latency = (SimPlatform::instance().cycles() - state.icache_latency); + state.icache_latency = latency; + decode_stage_.push(state); } -void Core::step() { +void Core::step(uint64_t cycle) { + __unused (cycle); D(2, "###########################################################"); + D(2, std::dec << "Core" << id_ << ": cycle: " << cycle); - steps_++; - D(2, std::dec << "Core" << id_ << ": cycle: " << steps_); - - this->writeback(); + this->commit(); this->execute(); this->issue(); this->decode(); this->fetch(); - this->schedule(); DPN(2, std::flush); } -void Core::schedule() { - if (!inst_in_schedule_.enter(&inst_in_fetch_)) - return; - +void Core::warp_scheduler() { bool foundSchedule = false; - int scheduled_warp = inst_in_schedule_.wid; + int scheduled_warp = last_schedule_wid_; - for (size_t wid = 0; wid < warps_.size(); ++wid) { - // round robin scheduling + // round robin scheduling + for (size_t wid = 0; wid < warps_.size(); ++wid) { scheduled_warp = (scheduled_warp + 1) % warps_.size(); - bool is_active = warps_[scheduled_warp]->active(); - bool stalled = stalled_warps_[scheduled_warp]; - if (is_active && !stalled) { + bool warp_active = warps_.at(scheduled_warp)->active(); + bool warp_stalled = stalled_warps_.test(scheduled_warp); + if (warp_active && !warp_stalled) { + last_schedule_wid_ = scheduled_warp; foundSchedule = true; break; } } if (!foundSchedule) - return; + return; - D(2, "Schedule: wid=" << scheduled_warp); - inst_in_schedule_.wid 
= scheduled_warp; + // suspend warp until decode + stalled_warps_.set(scheduled_warp); - // advance pipeline - inst_in_schedule_.next(&inst_in_fetch_); + auto& warp = warps_.at(scheduled_warp); + stats_insts_ += warp->getActiveThreads(); + + pipeline_state_t state; + warp->eval(&state); + + D(4, state); + + // advance to fetch stage + ++pending_instrs_; + fetch_stage_.push(state); } void Core::fetch() { - if (!inst_in_fetch_.enter(&inst_in_issue_)) - return; + // schedule icache request + pipeline_state_t state; + if (fetch_stage_.try_pop(&state)) { + state.icache_latency = SimPlatform::instance().cycles(); + MemReq mem_req; + mem_req.addr = state.PC; + mem_req.write = false; + mem_req.tag = pending_icache_.allocate(state); + icache_->CoreReqPorts.at(0).send(mem_req, 1); + } - int wid = inst_in_fetch_.wid; - - auto active_threads_b = warps_[wid]->getActiveThreads(); - warps_[wid]->step(&inst_in_fetch_); - auto active_threads_a = warps_[wid]->getActiveThreads(); - - insts_ += active_threads_b; - if (active_threads_b != active_threads_a) { - D(3, "*** warp#" << wid << " active threads changed to " << active_threads_a); - } - - if (inst_in_fetch_.stall_warp) { - D(3, "*** warp#" << wid << " fetch stalled"); - stalled_warps_[wid] = true; - } - - D(4, inst_in_fetch_); - - // advance pipeline - inst_in_fetch_.next(&inst_in_issue_); + // schedule next warp + this->warp_scheduler(); } void Core::decode() { - if (!inst_in_decode_.enter(&inst_in_issue_)) - return; + pipeline_state_t state; + if (!decode_stage_.try_pop(&state)) + return; - // advance pipeline - inst_in_decode_.next(&inst_in_issue_); + if (state.stall_warp) { + D(3, "*** warp#" << state.wid << " fetch stalled"); + } else { + // release warp + stalled_warps_.reset(state.wid); + } + + // advance to issue stage + issue_stage_.push(state); } void Core::issue() { - if (!inst_in_issue_.enter(&inst_in_execute_)) - return; + if (!issue_stage_.empty()) { + // insert to ibuffer + auto& state = issue_stage_.top(); + 
auto& ibuffer = ibuffers_.at(state.wid); + if (!ibuffer.full()) { + ibuffer.push(state); + issue_stage_.pop(); + } + } + + // issue ibuffer instructions + for (auto& ibuffer : ibuffers_) { + if (ibuffer.empty()) + continue; - bool in_use_regs = (inst_in_issue_.used_iregs & in_use_iregs_[inst_in_issue_.wid]) != 0 - || (inst_in_issue_.used_fregs & in_use_fregs_[inst_in_issue_.wid]) != 0 - || (inst_in_issue_.used_vregs & in_use_vregs_) != 0; + auto& state = ibuffer.top(); + + // check scoreboard + if (scoreboard_.in_use(state)) + continue; + + // update scoreboard + scoreboard_.reserve(state); + + // advance to execute stage + execute_stage_.push(state); + + ibuffer.pop(); + break; + } +} + +void Core::execute() { + // process stage inputs + if (!execute_stage_.empty()) { + auto& state = execute_stage_.top(); + auto& exe_unit = exe_units_.at((int)state.exe_type); + exe_unit->push_input(state); + execute_stage_.pop(); + } + + // advance execute units + for (auto& exe_unit : exe_units_) { + exe_unit->step(); + } - if (in_use_regs) { - D(3, "*** Issue: registers not ready!"); - inst_in_issue_.stalled = true; - return; - } - - switch (inst_in_issue_.rdest_type) { - case 1: - if (inst_in_issue_.rdest) - in_use_iregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1; - break; - case 2: - in_use_fregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1; - break; - case 3: - in_use_vregs_[inst_in_issue_.rdest] = 1; - break; - default: - break; + // commit completed instructions + for (auto& exe_unit : exe_units_) { + pipeline_state_t state; + if (exe_unit->pop_output(&state)) { + if (state.stall_warp) { + stalled_warps_.reset(state.wid); + } + // advance to commit stage + commit_stage_.push(state); + } } - - // advance pipeline - inst_in_issue_.next(&inst_in_execute_); } -void Core::execute() { - if (!inst_in_execute_.enter(&inst_in_writeback_)) +void Core::commit() { + pipeline_state_t state; + if (!commit_stage_.try_pop(&state)) return; - // advance pipeline - 
inst_in_execute_.next(&inst_in_writeback_); -} - -void Core::writeback() { - if (!inst_in_writeback_.enter(NULL)) - return; - - switch (inst_in_writeback_.rdest_type) { - case 1: - in_use_iregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0; - break; - case 2: - in_use_fregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0; - break; - case 3: - in_use_vregs_[inst_in_writeback_.rdest] = 0; - break; - default: - break; - } - - if (inst_in_writeback_.stall_warp) { - stalled_warps_[inst_in_writeback_.wid] = false; - D(3, "*** warp#" << inst_in_writeback_.wid << " fetch released"); - } - - // advance pipeline - inst_in_writeback_.next(NULL); + // update scoreboard + scoreboard_.release(state); } Word Core::get_csr(Addr addr, int tid, int wid) { @@ -281,16 +309,16 @@ Word Core::get_csr(Addr addr, int tid, int wid) { return arch_.num_cores(); } else if (addr == CSR_MINSTRET) { // NumInsts - return insts_; + return stats_insts_; } else if (addr == CSR_MINSTRET_H) { // NumInsts - return (Word)(insts_ >> 32); + return (Word)(stats_insts_ >> 32); } else if (addr == CSR_MCYCLE) { // NumCycles - return (Word)steps_; + return (Word)SimPlatform::instance().cycles(); } else if (addr == CSR_MCYCLE_H) { // NumCycles - return (Word)(steps_ >> 32); + return (Word)(SimPlatform::instance().cycles() >> 32); } else { return csrs_.at(addr); } @@ -328,7 +356,7 @@ Word Core::icache_fetch(Addr addr) { } Word Core::dcache_read(Addr addr, Size size) { - ++loads_; + ++stats_loads_; Word data = 0; #ifdef SM_ENABLE if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) @@ -342,7 +370,7 @@ Word Core::dcache_read(Addr addr, Size size) { } void Core::dcache_write(Addr addr, Word data, Size size) { - ++stores_; + ++stats_stores_; #ifdef SM_ENABLE if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) && ((addr + 3) < SMEM_BASE_ADDR)) { @@ -359,23 +387,19 @@ void Core::dcache_write(Addr addr, Word data, Size size) { } bool Core::running() const { - return inst_in_fetch_.valid - || inst_in_decode_.valid - || 
inst_in_issue_.valid - || inst_in_execute_.valid - || inst_in_writeback_.valid; + return pending_instrs_; } void Core::printStats() const { - std::cout << "Steps : " << steps_ << std::endl - << "Insts : " << insts_ << std::endl - << "Loads : " << loads_ << std::endl - << "Stores: " << stores_ << std::endl; + std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl + << "Insts : " << stats_insts_ << std::endl + << "Loads : " << stats_loads_ << std::endl + << "Stores: " << stats_stores_ << std::endl; } void Core::writeToStdOut(Addr addr, Word data) { uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1); - auto& ss_buf = print_bufs_[tid]; + auto& ss_buf = print_bufs_.at(tid); char c = (char)data; ss_buf << c; if (c == '\n') { diff --git a/sim/simX/core.h b/sim/simX/core.h index 29de3ec6..913db4a6 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -4,10 +4,11 @@ #include #include #include +#include #include #include #include - +#include #include "debug.h" #include "types.h" #include "archdef.h" @@ -15,20 +16,21 @@ #include "mem.h" #include "warp.h" #include "pipeline.h" +#include "cache.h" +#include "ibuffer.h" +#include "scoreboard.h" +#include "exeunit.h" namespace vortex { -class Core { +class Core : public SimObject { public: - Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id); - + Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id); ~Core(); - void clear(); - bool running() const; - void step(); + void step(uint64_t cycle); void printStats() const; @@ -40,7 +42,7 @@ public: return *warps_.at(i); } - Decoder& decoder() { + const Decoder& decoder() { return decoder_; } @@ -48,16 +50,12 @@ public: return arch_; } - unsigned long num_insts() const { - return insts_; - } - - unsigned long num_steps() const { - return steps_; + unsigned long stats_insts() const { + return stats_insts_; } Word getIRegValue(int reg) const { - return warps_[0]->getIRegValue(reg); + return 
warps_.at(0)->getIRegValue(reg); } Word get_csr(Addr addr, int tid, int wid); @@ -73,50 +71,66 @@ public: void dcache_write(Addr, Word, Size); void trigger_ebreak(); + bool check_ebreak() const; -private: +private: - void schedule(); void fetch(); void decode(); void issue(); void execute(); - void writeback(); + void commit(); + + void warp_scheduler(); + + void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id); void writeToStdOut(Addr addr, Word data); - - std::vector in_use_iregs_; - std::vector in_use_fregs_; - RegMask in_use_vregs_; - WarpMask stalled_warps_; - std::vector> warps_; - std::vector barriers_; - std::vector csrs_; - std::vector fcsrs_; - std::unordered_map print_bufs_; Word id_; - const ArchDef &arch_; - Decoder &decoder_; - MemoryUnit &mem_; + const ArchDef& arch_; + const Decoder& decoder_; + MemoryUnit& mem_; #ifdef SM_ENABLE RAM shared_mem_; #endif + std::vector> warps_; + std::vector barriers_; + std::vector csrs_; + std::vector fcsrs_; + std::vector ibuffers_; + Scoreboard scoreboard_; + std::vector exe_units_; + Cache::Ptr icache_; + Cache::Ptr dcache_; + Switch::Ptr l1_mem_switch_; + SlavePort icache_rsp_port_; + std::vector> dcache_rsp_port_; + + PipelineStage fetch_stage_; + PipelineStage decode_stage_; + PipelineStage issue_stage_; + PipelineStage execute_stage_; + PipelineStage commit_stage_; + + HashTable pending_icache_; + WarpMask stalled_warps_; + uint32_t last_schedule_wid_; + uint32_t pending_instrs_; bool ebreak_; - Pipeline inst_in_schedule_; - Pipeline inst_in_fetch_; - Pipeline inst_in_decode_; - Pipeline inst_in_issue_; - Pipeline inst_in_execute_; - Pipeline inst_in_writeback_; + std::unordered_map print_bufs_; + uint64_t stats_insts_; + uint64_t stats_loads_; + uint64_t stats_stores_; - uint64_t steps_; - uint64_t insts_; - uint64_t loads_; - uint64_t stores_; + friend class LsuUnit; + +public: + SlavePort MemRspPort; + MasterPort MemReqPort; }; } // namespace vortex \ No newline at end of file diff --git 
a/sim/simX/decode.cpp b/sim/simX/decode.cpp index dbc7115a..3c76231f 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -281,7 +281,7 @@ Decoder::Decoder(const ArchDef &arch) { v_imm_mask_ = 0x7ff; } -std::shared_ptr Decoder::decode(Word code, Word PC) { +std::shared_ptr Decoder::decode(Word code, Word PC) const { auto instr = std::make_shared(); Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); instr->setOpcode(op); @@ -351,9 +351,9 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { instr->setFunc3(func3); instr->setFunc7(func7); if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) { - instr->setImm(signExt(rs2, 5, reg_mask_)); + instr->setImm(sext32(rs2, 5)); } else { - instr->setImm(signExt(code >> shift_rs2_, 12, i_imm_mask_)); + instr->setImm(sext32(code >> shift_rs2_, 12)); } } break; @@ -366,7 +366,7 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { } instr->setFunc3(func3); Word imeed = (func7 << reg_s_) | rd; - instr->setImm(signExt(imeed, 12, s_imm_mask_)); + instr->setImm(sext32(imeed, 12)); } break; case InstType::B_TYPE: { @@ -378,12 +378,12 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { Word bit_10_5 = func7 & 0x3f; Word bit_12 = func7 >> 6; Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); - instr->setImm(signExt(imeed, 13, b_imm_mask_)); + instr->setImm(sext32(imeed, 13)); } break; case InstType::U_TYPE: instr->setDestReg(rd); - instr->setImm(signExt(code >> shift_func3_, 20, u_imm_mask_)); + instr->setImm(sext32(code >> shift_func3_, 20)); break; case InstType::J_TYPE: { diff --git a/sim/simX/decode.h b/sim/simX/decode.h index f8f3909c..d4f9f976 100644 --- a/sim/simX/decode.h +++ b/sim/simX/decode.h @@ -13,7 +13,7 @@ class Decoder { public: Decoder(const ArchDef &); - std::shared_ptr decode(Word code, Word PC); + std::shared_ptr decode(Word code, Word PC) const; private: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index 47bf4e04..602f7f3a 100644 --- 
a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -49,346 +49,445 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) } } -void Warp::execute(const Instr &instr, Pipeline *pipeline) { +void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { assert(tmask_.any()); Word nextPC = PC_ + core_->arch().wsize(); - bool runOnce = false; - - Word func3 = instr.getFunc3(); - Word func6 = instr.getFunc6(); - Word func7 = instr.getFunc7(); + + Word func3 = instr.getFunc3(); + Word func6 = instr.getFunc6(); + Word func7 = instr.getFunc7(); auto opcode = instr.getOpcode(); - int rdest = instr.getRDest(); - int rsrc0 = instr.getRSrc(0); - int rsrc1 = instr.getRSrc(1); - Word immsrc= instr.getImm(); - Word vmask = instr.getVmask(); + int rdest = instr.getRDest(); + int rsrc0 = instr.getRSrc(0); + int rsrc1 = instr.getRSrc(1); + int rsrc2 = instr.getRSrc(2); + Word immsrc = instr.getImm(); + Word vmask = instr.getVmask(); int num_threads = core_->arch().num_threads(); - for (int t = 0; t < num_threads; t++) { - if (!tmask_.test(t) || runOnce) - continue; - - auto &iregs = iRegFile_.at(t); - auto &fregs = fRegFile_.at(t); - Word rsdata[3]; - Word rddata; - - int num_rsrcs = instr.getNRSrc(); - if (num_rsrcs) { - DPH(2, "[" << std::dec << t << "] Src Regs: "); - for (int i = 0; i < num_rsrcs; ++i) { - int rst = instr.getRSType(i); - int rs = instr.getRSrc(i); - if (i) DPN(2, ", "); - switch (rst) { - case 1: - rsdata[i] = iregs[rs]; - DPN(2, "r" << std::dec << rs << "=0x" << std::hex << rsdata[i]); - break; - case 2: - rsdata[i] = fregs[rs]; - DPN(2, "fr" << std::dec << rs << "=0x" << std::hex << rsdata[i]); - break; - default: break; - } - } - DPN(2, std::endl); - } - - bool rd_write = false; + std::vector rsdata(num_threads); + std::vector rddata(num_threads); - switch (opcode) { - case NOP: - break; - case LUI_INST: - rddata = (immsrc << 12) & 0xfffff000; - rd_write = true; - break; - case AUIPC_INST: - rddata = ((immsrc << 12) 
& 0xfffff000) + PC_; - rd_write = true; - break; - case R_INST: { + int num_rsrcs = instr.getNRSrc(); + if (num_rsrcs) { + for (int i = 0; i < num_rsrcs; ++i) { + DPH(2, "Src Reg [" << std::dec << i << "]: "); + int type = instr.getRSType(i); + int reg = instr.getRSrc(i); + switch (type) { + case 1: + DPH(2, "r" << std::dec << reg << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + rsdata[t][i] = iRegFile_.at(t)[reg]; + DPN(2, std::hex << rsdata[t][i]); + } + DPN(2, "}" << std::endl); + break; + case 2: + DPH(2, "fr" << std::dec << reg << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + rsdata[t][i] = fRegFile_.at(t)[reg]; + DPN(2, std::hex << rsdata[t][i]); + } + DPN(2, "}" << std::endl); + break; + default: + break; + } + } + } + + bool rd_write = false; + + switch (opcode) { + case NOP: + break; + case LUI_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = (immsrc << 12) & 0xfffff000; + } + rd_write = true; + break; + case AUIPC_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = ((immsrc << 12) & 0xfffff000) + PC_; + } + rd_write = true; + break; + case R_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; if (func7 & 0x1) { switch (func3) { case 0: // MUL - rddata = ((WordI)rsdata[0]) * ((WordI)rsdata[1]); + rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]); + pipeline_state->alu.type = AluType::IMUL; break; case 1: { // MULH - 
int64_t first = (int64_t)rsdata[0]; - if (rsdata[0] & 0x80000000) { + int64_t first = (int64_t)rsdata[t][0]; + if (rsdata[t][0] & 0x80000000) { first = first | 0xFFFFFFFF00000000; } - int64_t second = (int64_t)rsdata[1]; - if (rsdata[1] & 0x80000000) { + int64_t second = (int64_t)rsdata[t][1]; + if (rsdata[t][1] & 0x80000000) { second = second | 0xFFFFFFFF00000000; } uint64_t result = first * second; - rddata = (result >> 32) & 0xFFFFFFFF; + rddata[t] = (result >> 32) & 0xFFFFFFFF; + pipeline_state->alu.type = AluType::IMUL; } break; case 2: { // MULHSU - int64_t first = (int64_t)rsdata[0]; - if (rsdata[0] & 0x80000000) { + int64_t first = (int64_t)rsdata[t][0]; + if (rsdata[t][0] & 0x80000000) { first = first | 0xFFFFFFFF00000000; } - int64_t second = (int64_t)rsdata[1]; - rddata = ((first * second) >> 32) & 0xFFFFFFFF; + int64_t second = (int64_t)rsdata[t][1]; + rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; + pipeline_state->alu.type = AluType::IMUL; } break; case 3: { // MULHU - uint64_t first = (uint64_t)rsdata[0]; - uint64_t second = (uint64_t)rsdata[1]; - rddata = ((first * second) >> 32) & 0xFFFFFFFF; + uint64_t first = (uint64_t)rsdata[t][0]; + uint64_t second = (uint64_t)rsdata[t][1]; + rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; + pipeline_state->alu.type = AluType::IMUL; } break; case 4: { // DIV - WordI dividen = rsdata[0]; - WordI divisor = rsdata[1]; + WordI dividen = rsdata[t][0]; + WordI divisor = rsdata[t][1]; if (divisor == 0) { - rddata = -1; + rddata[t] = -1; } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { - rddata = dividen; + rddata[t] = dividen; } else { - rddata = dividen / divisor; + rddata[t] = dividen / divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; case 5: { // DIVU - Word dividen = rsdata[0]; - Word divisor = rsdata[1]; + Word dividen = rsdata[t][0]; + Word divisor = rsdata[t][1]; if (divisor == 0) { - rddata = -1; + rddata[t] = -1; } else { - rddata = dividen / divisor; + 
rddata[t] = dividen / divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; case 6: { // REM - WordI dividen = rsdata[0]; - WordI divisor = rsdata[1]; - if (rsdata[1] == 0) { - rddata = dividen; + WordI dividen = rsdata[t][0]; + WordI divisor = rsdata[t][1]; + if (rsdata[t][1] == 0) { + rddata[t] = dividen; } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { - rddata = 0; + rddata[t] = 0; } else { - rddata = dividen % divisor; + rddata[t] = dividen % divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; case 7: { // REMU - Word dividen = rsdata[0]; - Word divisor = rsdata[1]; - if (rsdata[1] == 0) { - rddata = dividen; + Word dividen = rsdata[t][0]; + Word divisor = rsdata[t][1]; + if (rsdata[t][1] == 0) { + rddata[t] = dividen; } else { - rddata = dividen % divisor; + rddata[t] = dividen % divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; default: - std::cout << "unsupported MUL/DIV instr\n"; std::abort(); } } else { switch (func3) { case 0: if (func7) { - rddata = rsdata[0] - rsdata[1]; + // SUB + rddata[t] = rsdata[t][0] - rsdata[t][1]; } else { - rddata = rsdata[0] + rsdata[1]; + // ADD + rddata[t] = rsdata[t][0] + rsdata[t][1]; } break; case 1: - rddata = rsdata[0] << rsdata[1]; + // SHL + rddata[t] = rsdata[t][0] << rsdata[t][1]; break; case 2: - rddata = (WordI(rsdata[0]) < WordI(rsdata[1])); + // LT + rddata[t] = (WordI(rsdata[t][0]) < WordI(rsdata[t][1])); break; case 3: - rddata = (Word(rsdata[0]) < Word(rsdata[1])); + // LTU + rddata[t] = (Word(rsdata[t][0]) < Word(rsdata[t][1])); break; case 4: - rddata = rsdata[0] ^ rsdata[1]; + // XOR + rddata[t] = rsdata[t][0] ^ rsdata[t][1]; break; case 5: if (func7) { - rddata = WordI(rsdata[0]) >> WordI(rsdata[1]); + // SRA + rddata[t] = WordI(rsdata[t][0]) >> WordI(rsdata[t][1]); } else { - rddata = Word(rsdata[0]) >> Word(rsdata[1]); + // SHR + rddata[t] = Word(rsdata[t][0]) >> Word(rsdata[t][1]); } break; case 6: - rddata = rsdata[0] | rsdata[1]; + // OR 
+ rddata[t] = rsdata[t][0] | rsdata[t][1]; break; case 7: - rddata = rsdata[0] & rsdata[1]; + // AND + rddata[t] = rsdata[t][0] & rsdata[t][1]; break; default: std::abort(); } } - rd_write = true; - } break; - case I_INST: + } + rd_write = true; + break; + case I_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + pipeline_state->used_iregs[rsrc0] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; switch (func3) { case 0: // ADDI - rddata = rsdata[0] + immsrc; + rddata[t] = rsdata[t][0] + immsrc; break; case 1: // SLLI - rddata = rsdata[0] << immsrc; + rddata[t] = rsdata[t][0] << immsrc; break; case 2: // SLTI - rddata = (WordI(rsdata[0]) < WordI(immsrc)); + rddata[t] = (WordI(rsdata[t][0]) < WordI(immsrc)); break; case 3: { // SLTIU - rddata = (Word(rsdata[0]) < Word(immsrc)); + rddata[t] = (Word(rsdata[t][0]) < Word(immsrc)); } break; case 4: // XORI - rddata = rsdata[0] ^ immsrc; + rddata[t] = rsdata[t][0] ^ immsrc; break; case 5: if (func7) { // SRAI - Word result = WordI(rsdata[0]) >> immsrc; - rddata = result; + Word result = WordI(rsdata[t][0]) >> immsrc; + rddata[t] = result; } else { // SRLI - Word result = Word(rsdata[0]) >> immsrc; - rddata = result; + Word result = Word(rsdata[t][0]) >> immsrc; + rddata[t] = result; } break; case 6: // ORI - rddata = rsdata[0] | immsrc; + rddata[t] = rsdata[t][0] | immsrc; break; case 7: // ANDI - rddata = rsdata[0] & immsrc; + rddata[t] = rsdata[t][0] & immsrc; break; - default: - std::abort(); } - rd_write = true; - break; - case B_INST: + } + rd_write = true; + break; + case B_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::BRANCH; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; switch (func3) { case 0: // BEQ - if (rsdata[0] == rsdata[1]) { + if (rsdata[t][0] == rsdata[t][1]) { nextPC = PC_ + 
immsrc; } break; case 1: // BNE - if (rsdata[0] != rsdata[1]) { + if (rsdata[t][0] != rsdata[t][1]) { nextPC = PC_ + immsrc; } break; case 4: // BLT - if (WordI(rsdata[0]) < WordI(rsdata[1])) { + if (WordI(rsdata[t][0]) < WordI(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; case 5: // BGE - if (WordI(rsdata[0]) >= WordI(rsdata[1])) { + if (WordI(rsdata[t][0]) >= WordI(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; case 6: // BLTU - if (Word(rsdata[0]) < Word(rsdata[1])) { + if (Word(rsdata[t][0]) < Word(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; case 7: // BGEU - if (Word(rsdata[0]) >= Word(rsdata[1])) { + if (Word(rsdata[t][0]) >= Word(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; - } - pipeline->stall_warp = true; - runOnce = true; - break; - case JAL_INST: - rddata = nextPC; - nextPC = PC_ + immsrc; - pipeline->stall_warp = true; - runOnce = true; - rd_write = true; - break; - case JALR_INST: - rddata = nextPC; - nextPC = rsdata[0] + immsrc; - pipeline->stall_warp = true; - runOnce = true; - rd_write = true; - break; - case L_INST: { - Word memAddr = ((rsdata[0] + immsrc) & 0xFFFFFFFC); // word aligned - Word shift_by = ((rsdata[0] + immsrc) & 0x00000003) * 8; - Word data_read = core_->dcache_read(memAddr, 4); - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - switch (func3) { - case 0: - // LBI - rddata = signExt((data_read >> shift_by) & 0xFF, 8, 0xFF); - break; - case 1: - // LHI - rddata = signExt((data_read >> shift_by) & 0xFFFF, 16, 0xFFFF); - break; - case 2: - // LW - rddata = data_read; - break; - case 4: - // LBU - rddata = Word((data_read >> shift_by) & 0xFF); - break; - case 5: - // LHU - rddata = Word((data_read >> shift_by) & 0xFFFF); - break; - default: - std::abort(); - } - rd_write = true; - } break; - case S_INST: { - Word memAddr = rsdata[0] + immsrc; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (func3) { - case 0: - // SB - core_->dcache_write(memAddr, rsdata[1] & 0x000000FF, 
1); - break; - case 1: - // SH - core_->dcache_write(memAddr, rsdata[1], 2); - break; - case 2: - // SW - core_->dcache_write(memAddr, rsdata[1], 4); - break; default: std::abort(); } - } break; - case SYS_INST: { + break; // runonce + } + pipeline_state->stall_warp = true; + break; + case JAL_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::BRANCH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = nextPC; + nextPC = PC_ + immsrc; + pipeline_state->stall_warp = true; + break; // runonce + } + rd_write = true; + break; + case JALR_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::BRANCH; + pipeline_state->used_iregs[rsrc0] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = nextPC; + nextPC = rsdata[t][0] + immsrc; + pipeline_state->stall_warp = true; + break; // runOnce + } + rd_write = true; + break; + case L_INST: + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.load = 0; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned + Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; + Word data_read = core_->dcache_read(memAddr, 4); + pipeline_state->mem_addrs.at(t) = memAddr; + D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + switch (func3) { + case 0: + // LBI + rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); + break; + case 1: + // LHI + rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); + break; + case 2: + // LW + rddata[t] = data_read; + break; + case 4: + // LBU + rddata[t] = Word((data_read >> shift_by) & 0xFF); + break; + case 5: + // LHU + rddata[t] = Word((data_read >> shift_by) & 0xFFFF); + break; + default: + std::abort(); + } + } + rd_write = 
true; + break; + case S_INST: + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.store = 1; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + pipeline_state->mem_addrs.at(t) = memAddr; + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + switch (func3) { + case 0: + // SB + core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); + break; + case 1: + // SH + core_->dcache_write(memAddr, rsdata[t][1], 2); + break; + case 2: + // SW + core_->dcache_write(memAddr, rsdata[t][1], 4); + break; + default: + std::abort(); + } + } + break; + case SYS_INST: + pipeline_state->exe_type = ExeType::CSR; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; Word csr_addr = immsrc & 0x00000FFF; Word csr_value = core_->get_csr(csr_addr, t, id_); switch (func3) { @@ -400,229 +499,306 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { break; case 1: // CSRRW - rddata = csr_value; - core_->set_csr(csr_addr, rsdata[0], t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsdata[t][0], t, id_); + pipeline_state->used_iregs[rsrc0] = 1; rd_write = true; break; case 2: // CSRRS - rddata = csr_value; - core_->set_csr(csr_addr, csr_value | rsdata[0], t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); + pipeline_state->used_iregs[rsrc0] = 1; rd_write = true; break; case 3: // CSRRC - rddata = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsdata[0], t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); + pipeline_state->used_iregs[rsrc0] = 1; rd_write = true; break; case 5: // CSRRWI - rddata = csr_value; - core_->set_csr(csr_addr, rsrc0, t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsrc0, t, id_); rd_write = true; break; 
case 6: // CSRRSI - rddata = csr_value; + rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); rd_write = true; break; case 7: // CSRRCI - rddata = csr_value; + rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); rd_write = true; break; default: break; } - } break; - case FENCE: - pipeline->stall_warp = true; - runOnce = true; - break; - case (FL | VL): - if (func3 == 0x2) { - Word memAddr = rsdata[0] + immsrc; + } + break; + case FENCE: + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.fence = 1; + pipeline_state->stall_warp = true; + break; + case (FL | VL): + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.load = 1; + pipeline_state->used_iregs[rsrc0] = 1; + if (func3 == 0x2) { + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + pipeline_state->mem_addrs.at(t) = memAddr; Word data_read = core_->dcache_read(memAddr, 4); D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - rddata = data_read; - } else { - D(3, "Executing vector load"); - D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); - D(3, "src: " << rsrc0 << " " << rsdata[0]); - D(3, "dest" << rdest); - D(3, "width" << instr.getVlsWidth()); - - auto &vd = vRegFile_[rdest]; - + rddata[t] = data_read; + } + } else { + D(3, "Executing vector load"); + D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); + D(3, "dest: v" << rdest); + D(3, "width" << instr.getVlsWidth()); + pipeline_state->mem_addrs.resize(vl_); + auto &vd = vRegFile_.at(rdest); + switch (instr.getVlsWidth()) { + case 6: { + // load word and unit strided (not checking for unit stride) + for (int i = 0; i < vl_; i++) { + Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); + pipeline_state->mem_addrs.at(i) = 
memAddr; + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + Word data_read = core_->dcache_read(memAddr, 4); + D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + int *result_ptr = (int *)(vd.data() + i); + *result_ptr = data_read; + } + } break; + default: + std::abort(); + } + break; + } + rd_write = true; + break; + case (FS | VS): + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.store = 1; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + if (func3 == 0x2) { + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + pipeline_state->mem_addrs.at(t) = memAddr; + core_->dcache_write(memAddr, rsdata[t][1], 4); + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + } + } else { + pipeline_state->mem_addrs.resize(vl_); + for (int i = 0; i < vl_; i++) { + Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); + pipeline_state->mem_addrs.at(i) = memAddr; + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (instr.getVlsWidth()) { - case 6: { - //load word and unit strided (not checking for unit stride) - for (int i = 0; i < vl_; i++) { - Word memAddr = ((rsdata[0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - Word data_read = core_->dcache_read(memAddr, 4); - D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); - int *result_ptr = (int *)(vd.data() + i); - *result_ptr = data_read; - } + case 6: { + //store word and unit strided (not checking for unit stride) + uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); + core_->dcache_write(memAddr, value, 4); + D(3, "store: " << memAddr << " value:" << value); } break; default: std::abort(); - } - break; - } - rd_write = true; - break; - case (FS | VS): - if (func3 == 0x2) { - Word memAddr = rsdata[0] + immsrc; - 
core_->dcache_write(memAddr, rsdata[1], 4); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - } else { - for (int i = 0; i < vl_; i++) { - Word memAddr = rsdata[0] + (i * vtype_.vsew / 8); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (instr.getVlsWidth()) { - case 6: { - //store word and unit strided (not checking for unit stride) - uint32_t value = *(uint32_t *)(vRegFile_[instr.getVs3()].data() + i); - core_->dcache_write(memAddr, value, 4); - D(3, "store: " << memAddr << " value:" << value); - } break; - default: - std::abort(); - } - } + } } - break; - case FCI: { + } + break; + case FCI: + pipeline_state->exe_type = ExeType::FPU; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; uint32_t frm = get_fpu_rm(func3, core_, t, id_); uint32_t fflags = 0; switch (func7) { case 0x00: //FADD - rddata = rv_fadd(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x04: //FSUB - rddata = rv_fsub(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x08: //FMUL - rddata = rv_fmul(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x0c: //FDIV - rddata = rv_fdiv(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FDIV; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x2c: //FSQRT - rddata = rv_fsqrt(rsdata[0], frm, &fflags); + rddata[t] = 
rv_fsqrt(rsdata[t][0], frm, &fflags); + pipeline_state->fpu.type = FpuType::FSQRT; + pipeline_state->used_fregs[rsrc0] = 1; break; case 0x10: switch (func3) { case 0: // FSGNJ.S - rddata = rv_fsgnj(rsdata[0], rsdata[1]); + rddata[t] = rv_fsgnj(rsdata[t][0], rsdata[t][1]); break; case 1: // FSGNJN.S - rddata = rv_fsgnjn(rsdata[0], rsdata[1]); + rddata[t] = rv_fsgnjn(rsdata[t][0], rsdata[t][1]); break; case 2: // FSGNJX.S - rddata = rv_fsgnjx(rsdata[0], rsdata[1]); + rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]); break; } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; - case 0x14: + case 0x14: if (func3) { // FMAX.S - rddata = rv_fmax(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_fmax(rsdata[t][0], rsdata[t][1], &fflags); } else { // FMIN.S - rddata = rv_fmin(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags); } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x60: if (rsrc1 == 0) { // FCVT.W.S - rddata = rv_ftoi(rsdata[0], frm, &fflags); + rddata[t] = rv_ftoi(rsdata[t][0], frm, &fflags); } else { // FCVT.WU.S - rddata = rv_ftou(rsdata[0], frm, &fflags); + rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags); } + pipeline_state->fpu.type = FpuType::FCVT; + pipeline_state->used_fregs[rsrc0] = 1; break; case 0x70: if (func3) { // FCLASS.S - rddata = rv_fclss(rsdata[0]); + rddata[t] = rv_fclss(rsdata[t][0]); } else { // FMV.X.W - rddata = rsdata[0]; + rddata[t] = rsdata[t][0]; + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; } break; - case 0x50: + case 0x50: switch(func3) { case 0: // FLE.S - rddata = rv_fle(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_fle(rsdata[t][0], rsdata[t][1], &fflags); break; case 1: // FLT.S - rddata = rv_flt(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_flt(rsdata[t][0], rsdata[t][1], 
&fflags); break; case 2: // FEQ.S - rddata = rv_feq(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags); break; - } break; + } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; + break; case 0x68: if (rsrc1) { // FCVT.S.WU: - rddata = rv_utof(rsdata[0], frm, &fflags); + rddata[t] = rv_utof(rsdata[t][0], frm, &fflags); } else { // FCVT.S.W: - rddata = rv_itof(rsdata[0], frm, &fflags); + rddata[t] = rv_itof(rsdata[t][0], frm, &fflags); } + pipeline_state->fpu.type = FpuType::FCVT; + pipeline_state->used_iregs[rsrc0] = 1; break; case 0x78: // FMV.W.X - rddata = rsdata[0]; + rddata[t] = rsdata[t][0]; + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_iregs[rsrc0] = 1; break; } update_fcrs(fflags, core_, t, id_); - rd_write = true; - } break; - case FMADD: - case FMSUB: - case FMNMADD: - case FMNMSUB: { + } + rd_write = true; + break; + case FMADD: + case FMSUB: + case FMNMADD: + case FMNMSUB: + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; + pipeline_state->used_fregs[rsrc2] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; int frm = get_fpu_rm(func3, core_, t, id_); Word fflags = 0; switch (opcode) { case FMADD: - rddata = rv_fmadd(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fmadd(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); break; case FMSUB: - rddata = rv_fmsub(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fmsub(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); break; case FMNMADD: - rddata = rv_fnmadd(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fnmadd(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); break; case FMNMSUB: - rddata = rv_fnmsub(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fnmsub(rsdata[t][0], rsdata[t][1], 
rsdata[t][2], frm, &fflags); break; default: break; } update_fcrs(fflags, core_, t, id_); - rd_write = true; - } break; - case GPGPU: + } + rd_write = true; + break; + case GPGPU: + pipeline_state->exe_type = ExeType::GPU; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; switch (func3) { case 0: { // TMC + pipeline_state->gpu.type = GpuType::TMC; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; if (rsrc1) { // predicate mode ThreadMask pred; for (int i = 0; i < num_threads; ++i) { - pred[i] = tmask_[i] ? (iRegFile_[i][rsrc0] != 0) : 0; + pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0; } if (pred.any()) { tmask_ &= pred; @@ -630,58 +806,64 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { } else { tmask_.reset(); for (int i = 0; i < num_threads; ++i) { - tmask_[i] = rsdata[0] & (1 << i); + tmask_.set(i, rsdata.at(t)[0] & (1 << i)); } } D(3, "*** TMC " << tmask_); active_ = tmask_.any(); - pipeline->stall_warp = true; - runOnce = true; + break; // runOnce } break; case 1: { // WSPAWN - int active_warps = std::min(rsdata[0], core_->arch().num_warps()); - D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata[1]); + pipeline_state->gpu.type = GpuType::WSPAWN; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; + int active_warps = std::min(rsdata.at(t)[0], core_->arch().num_warps()); + D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]); for (int i = 1; i < active_warps; ++i) { Warp &newWarp = core_->warp(i); - newWarp.setPC(rsdata[1]); + newWarp.setPC(rsdata[t][1]); newWarp.setTmask(0, true); } - pipeline->stall_warp = true; - runOnce = true; + break; // runOnce } break; case 2: { // SPLIT + pipeline_state->gpu.type = GpuType::SPLIT; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; if (HasDivergentThreads(tmask_, 
iRegFile_, rsrc0)) { ThreadMask tmask; for (int i = 0; i < num_threads; ++i) { - tmask[i] = tmask_[i] && !iRegFile_[i][rsrc0]; + tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); } DomStackEntry e(tmask, nextPC); domStack_.push(tmask_); domStack_.push(e); for (size_t i = 0; i < e.tmask.size(); ++i) { - tmask_[i] = !e.tmask[i] && tmask_[i]; + tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); } active_ = tmask_.any(); DPH(3, "*** Split: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_[num_threads-i-1]); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); DPN(3, ", Pushed TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask[num_threads-i-1]); + for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1)); DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); } else { D(3, "*** Unanimous pred"); DomStackEntry e(tmask_); e.unanimous = true; domStack_.push(e); - } - pipeline->stall_warp = true; - runOnce = true; + } + break; // runOnce } break; case 3: { // JOIN + pipeline_state->gpu.type = GpuType::JOIN; + pipeline_state->stall_warp = true; if (!domStack_.empty() && domStack_.top().unanimous) { D(3, "*** Uninimous branch at join"); tmask_ = domStack_.top().tmask; @@ -697,898 +879,923 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { active_ = tmask_.any(); DPH(3, "*** Join: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_[num_threads-i-1]); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); DPN(3, "\n"); domStack_.pop(); - } - pipeline->stall_warp = true; - runOnce = true; + } + break; // runOnce } break; case 4: { // BAR + pipeline_state->gpu.type = GpuType::BAR; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; active_ = false; - core_->barrier(rsdata[0], rsdata[1], id_); - pipeline->stall_warp = true; - runOnce = true; + core_->barrier(rsdata[t][0], rsdata[t][1], id_); + break; 
// runOnce } break; case 6: { // PREFETCH - int addr = rsdata[0]; + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.prefetch = 1; + pipeline_state->used_iregs[rsrc0] = 1; + int addr = rsdata[t][0]; printf("*** PREFETCHED %d ***\n", addr); } break; default: std::abort(); } - break; - case VSET: { - int VLEN = core_->arch().vsize() * 8; - int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); - switch (func3) { - case 0: // vector-vector - switch (func6) { - case 0: { - auto& vr1 = vRegFile_[rsrc0]; - auto& vr2 = vRegFile_[rsrc1]; - auto& vd = vRegFile_[rdest]; - auto& mask = vRegFile_[0]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t emask = *(uint8_t *)(mask.data() + i); - uint8_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t emask = *(uint16_t *)(mask.data() + i); - uint16_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t emask = *(uint32_t *)(mask.data() + i); - uint32_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } - } break; - case 24: { - //vmseq - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = 
vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { + } + break; + case VSET: { + int VLEN = core_->arch().vsize() * 8; + int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); + switch (func3) { + case 0: // vector-vector + switch (func6) { + case 0: { + auto& vr1 = vRegFile_.at(rsrc0); + auto& vr2 = vRegFile_.at(rsrc1); + auto& vd = vRegFile_.at(rdest); + auto& mask = vRegFile_.at(0); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t emask = *(uint8_t *)(mask.data() + i); + uint8_t value = emask & 0x1; + if (vmask || (!vmask && value)) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + uint8_t result = first + second; + D(3, "Adding " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t emask = *(uint16_t *)(mask.data() + i); + uint16_t value = emask & 0x1; + if (vmask || (!vmask && value)) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + uint16_t result = first + second; + D(3, "Adding " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t emask = *(uint32_t *)(mask.data() + i); + uint32_t value = emask & 0x1; + if (vmask || (!vmask && value)) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first == second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + uint32_t result = first + second; + D(3, "Adding " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } - } break; - case 25: { - //vmsne - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } + } + } break; + case 24: { + //vmseq + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first == second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } - } break; - case 26: { - //vmsltu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first == second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } - } break; - case 27: { - //vmslt - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first < second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first == second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } - } break; - case 28: { - //vmsleu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first <= second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 29: { - //vmsle - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } - } - } break; - case 30: { - //vmsgtu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 31: { - //vmsgt - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } - } - } break; - } - break; - case 2: { - switch (func6) { - case 24: { - // vmandnot - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 25: { - // vmand - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t 
*)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 26: { - // vmor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if 
(vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 27: { - //vmxor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - 
for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 28: { - //vmornot - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value | !second_value); - D(3, "Comparing " << 
first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 29: { - //vmnand - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 30: { - //vmnor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t 
*)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 31: { - //vmxnor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if 
(vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 37: { - //vmul - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t 
*)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 45: { - // vmacc - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; } } break; - case 6: { - switch (func6) { - case 0: { - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint8_t *)(vd.data() 
+ i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } + case 25: { + //vmsne + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first != second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } - } break; - case 37: { - // vmul.vx - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first != second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first != second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } - } break; } } break; - case 7: { - vtype_.vill = 0; - vtype_.vediv = instr.getVediv(); - vtype_.vsew = instr.getVsew(); - vtype_.vlmul = instr.getVlmul(); - - D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0] << "VLMAX" << VLMAX); - - int s0 = rsdata[0]; - if (s0 <= VLMAX) { - vl_ = s0; - } else if (s0 < (2 * VLMAX)) { - vl_ = (int)ceil((s0 * 1.0) / 2.0); - } else if (s0 >= (2 * VLMAX)) { - vl_ = VLMAX; - } - rddata = vl_; + case 26: { + //vmsltu + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first < second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 27: { + //vmslt + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } + } break; + case 28: { + //vmsleu + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first <= second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 29: { + //vmsle + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } + } break; + case 30: { + //vmsgtu + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first > second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 31: { + //vmsgt + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first > second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } } break; - default: - std::abort(); } - } break; + break; + case 2: { + switch (func6) { + case 24: { + // vmandnot + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value & !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value & !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value & !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 25: { + // vmand + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < 
vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 26: { + // vmor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + 
*(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 27: { + //vmxor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " 
<< result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 28: { + //vmornot + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value | !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value | !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + 
uint32_t result = (first_value | !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 29: { + //vmnand + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 30: { + //vmnor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if 
(vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 31: { + //vmxnor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; 
+ } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 37: { + //vmul + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } 
else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 45: { + // vmacc + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + } + } break; + case 6: { + switch (func6) { + case 0: { + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = 
(rsdata[i][0] + second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (rsdata[i][0] + second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (rsdata[i][0] + second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 37: { + // vmul.vx + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (rsdata[i][0] * second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (rsdata[i][0] * second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (rsdata[i][0] * second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " 
<< result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + } + } break; + case 7: { + vtype_.vill = 0; + vtype_.vediv = instr.getVediv(); + vtype_.vsew = instr.getVsew(); + vtype_.vlmul = instr.getVlmul(); + + D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX); + + int s0 = rsdata[0][0]; + if (s0 <= VLMAX) { + vl_ = s0; + } else if (s0 < (2 * VLMAX)) { + vl_ = (int)ceil((s0 * 1.0) / 2.0); + } else if (s0 >= (2 * VLMAX)) { + vl_ = VLMAX; + } + rddata[0] = vl_; + } break; default: std::abort(); } + } break; + default: + std::abort(); + } - if (rd_write) { - int rdt = instr.getRDType(); - switch (rdt) { - case 1: - if (rdest) { - D(2, "[" << std::dec << t << "] Dest Regs: r" << rdest << "=0x" << std::hex << std::hex << rddata); - iregs[rdest] = rddata; + if (rd_write) { + DPH(2, "Dest Reg: "); + int rdt = instr.getRDType(); + switch (rdt) { + case 1: + if (rdest) { + DPH(2, "r" << std::dec << rdest << "={"); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + iRegFile_.at(t)[rdest] = rddata[t]; + if (t) DPN(2, ", "); + DPN(2, "0x" << std::hex << rddata[t]); } - break; - case 2: - D(2, "[" << std::dec << t << "] Dest Regs: fr" << rdest << "=0x" << std::hex << std::hex << rddata); - fregs[rdest] = rddata; - break; - default: - break; + DPN(2, "}" << std::endl); + pipeline_state->used_iregs[rdest] = 1; } + break; + case 2: + DPH(2, "fr" << std::dec << rdest << "={"); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + fRegFile_.at(t)[rdest] = rddata[t]; + if (t) DPN(2, ", "); + DPN(2, "0x" << std::hex << rddata[t]); + } + DPN(2, "}" << std::endl); + pipeline_state->used_fregs[rdest] = 1; + break; + case 3: + pipeline_state->used_vregs[rdest] = 1; + break; + default: + break; } } diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp new file 
mode 100644 index 00000000..5cdf22f3 --- /dev/null +++ b/sim/simX/exeunit.cpp @@ -0,0 +1,152 @@ +#include "exeunit.h" +#include +#include +#include +#include +#include +#include "debug.h" +#include "core.h" + +using namespace vortex; + +LsuUnit::LsuUnit(Core* core) + : ExeUnit("LSU") + , core_(core) + , num_threads_(core->arch().num_threads()) + , pending_dcache_(LSUQ_SIZE) + , fence_lock_(false) +{} + +void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) { + auto entry = pending_dcache_.at(response.tag); + entry.second.reset(port_id); // track remaining blocks + if (!entry.second.any()) { + auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); + entry.first.dcache_latency = latency; + this->schedule_output(entry.first, 1); + pending_dcache_.release(response.tag); + } +} + +void LsuUnit::step() { + if (fence_lock_) { + // wait for all pending memory operations to complete + if (!pending_dcache_.empty()) + return; + this->schedule_output(fence_state_, 1); + fence_lock_ = false; + } + + if (inputs_.empty()) + return; + + auto state = inputs_.top(); + + if (state.lsu.fence) { + // schedule fence lock + fence_state_ = state; + fence_lock_ = true; + inputs_.pop(); + return; + } + + // send dcache requests + if (!pending_dcache_.full()) { + state.dcache_latency = SimPlatform::instance().cycles(); + auto tag = pending_dcache_.allocate({state, state.tmask}); + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!state.tmask.test(t)) + continue; + MemReq mem_req; + mem_req.addr = state.mem_addrs.at(t); + mem_req.write = state.lsu.store; + mem_req.tag = tag; + core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); + } + inputs_.pop(); + } +} + +/////////////////////////////////////////////////////////////////////////////// + +AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} + +void AluUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + switch (state.alu.type) { + case AluType::ARITH: + 
this->schedule_output(state, 1); + break; + case AluType::BRANCH: + this->schedule_output(state, 1); + break; + case AluType::IMUL: + this->schedule_output(state, LATENCY_IMUL); + break; + case AluType::IDIV: + this->schedule_output(state, XLEN); + break; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} + +void CsrUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + this->schedule_output(state, 1); +} + +/////////////////////////////////////////////////////////////////////////////// + +FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} + +void FpuUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + switch (state.fpu.type) { + case FpuType::FNCP: + this->schedule_output(state, 1); + break; + case FpuType::FMA: + this->schedule_output(state, LATENCY_FMA); + break; + case FpuType::FDIV: + this->schedule_output(state, LATENCY_FDIV); + break; + case FpuType::FSQRT: + this->schedule_output(state, LATENCY_FSQRT); + break; + case FpuType::FCVT: + this->schedule_output(state, LATENCY_FCVT); + break; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {} + +void GpuUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + switch (state.gpu.type) { + case GpuType::TMC: + case GpuType::WSPAWN: + case GpuType::SPLIT: + case GpuType::JOIN: + case GpuType::BAR: + this->schedule_output(state, 1); + break; + case GpuType::TEX: + /* TODO */ + break; + } +} \ No newline at end of file diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h new file mode 100644 index 00000000..915089d3 --- /dev/null +++ b/sim/simX/exeunit.h @@ -0,0 +1,103 @@ +#pragma once + +#include +#include "pipeline.h" +#include "cache.h" + +namespace vortex { + +class Core; + +class ExeUnit { +protected: + const char* name_; + Queue inputs_; + Queue outputs_; + + 
void schedule_output(const pipeline_state_t& state, uint32_t delay) { + if (delay > 1) { + SimPlatform::instance().schedule( + [&](const pipeline_state_t& req) { + outputs_.push(req); + }, + state, + (delay - 1) + ); + } else { + outputs_.push(state); + } + } + +public: + typedef std::shared_ptr Ptr; + + ExeUnit(const char* name) : name_(name) {} + + virtual ~ExeUnit() {} + + void push_input(const pipeline_state_t& state) { + inputs_.push(state); + } + + bool pop_output(pipeline_state_t* state) { + return outputs_.try_pop(state); + } + + virtual void step() = 0; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class LsuUnit : public ExeUnit { +private: + Core* core_; + uint32_t num_threads_; + HashTable> pending_dcache_; + pipeline_state_t fence_state_; + bool fence_lock_; + +public: + LsuUnit(Core*); + + void handleCacheReponse(const MemRsp& response, uint32_t port_id); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class AluUnit : public ExeUnit { +public: + AluUnit(Core*); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class CsrUnit : public ExeUnit { +public: + CsrUnit(Core*); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class FpuUnit : public ExeUnit { +public: + FpuUnit(Core*); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class GpuUnit : public ExeUnit { +public: + GpuUnit(Core*); + + void step(); +}; + +} \ No newline at end of file diff --git a/sim/simX/ibuffer.h b/sim/simX/ibuffer.h new file mode 100644 index 00000000..86bdeed7 --- /dev/null +++ b/sim/simX/ibuffer.h @@ -0,0 +1,39 @@ +#pragma once + +#include "pipeline.h" +#include + +namespace vortex { + +class IBuffer { +private: + std::queue entries_; + uint32_t capacity_; + +public: + IBuffer(uint32_t size) + : 
capacity_(size) + {} + + bool empty() const { + return entries_.empty(); + } + + bool full() const { + return (entries_.size() == capacity_); + } + + const pipeline_state_t& top() const { + return entries_.front(); + } + + void push(const pipeline_state_t& state) { + entries_.emplace(state); + } + + void pop() { + return entries_.pop(); + } +}; + +} \ No newline at end of file diff --git a/sim/simX/instr.h b/sim/simX/instr.h index a93dd61b..1a205478 100644 --- a/sim/simX/instr.h +++ b/sim/simX/instr.h @@ -113,15 +113,12 @@ private: int num_rsrcs_; bool has_imm_; int rdest_type_; - int isrc_mask_; - int fsrc_mask_; - int vsrc_mask_; Word imm_; int rsrc_type_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES]; int rdest_; Word func3_; - Word func7_; + Word func6_; //Vector Word vmask_; @@ -132,7 +129,7 @@ private: Word vlmul_; Word vsew_; Word vediv_; - Word func6_; + Word func7_; friend std::ostream &operator<<(std::ostream &, const Instr&); }; diff --git a/sim/simX/main.cpp b/sim/simX/main.cpp index 9af8ff02..a34ada0e 100644 --- a/sim/simX/main.cpp +++ b/sim/simX/main.cpp @@ -5,28 +5,26 @@ #include #include #include - -#include "debug.h" -#include "types.h" -#include "core.h" +#include "processor.h" #include "args.h" using namespace vortex; int main(int argc, char **argv) { + int ret; - std::string archString("rv32imf"); + std::string archStr("rv32imf"); + std::string imgFileName; int num_cores(NUM_CORES * NUM_CLUSTERS); int num_warps(NUM_WARPS); - int num_threads(NUM_THREADS); - std::string imgFileName; + int num_threads(NUM_THREADS); bool showHelp(false); bool showStats(false); bool riscv_test(false); /* Read the command line arguments. 
*/ CommandLineArgFlag fh("-h", "--help", "", showHelp); - CommandLineArgSetter fa("-a", "--arch", "", archString); + CommandLineArgSetter fa("-a", "--arch", "", archStr); CommandLineArgSetter fi("-i", "--image", "", imgFileName); CommandLineArgSetter fc("-c", "--cores", "", num_cores); CommandLineArgSetter fw("-w", "--warps", "", num_warps); @@ -48,62 +46,18 @@ int main(int argc, char **argv) { return 0; } - ArchDef arch(archString, num_cores, num_warps, num_threads); - - Decoder decoder(arch); - MemoryUnit mu(0, arch.wsize(), true); + std::cout << "Running " << imgFileName << "..." << std::endl; - RAM ram((1<<12), (1<<20)); - - std::string program_ext(fileExtension(imgFileName.c_str())); - if (program_ext == "bin") { - ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR); - } else if (program_ext == "hex") { - ram.loadHexImage(imgFileName.c_str()); - } else { - std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; + if (!SimPlatform::instance().initialize()) return -1; - } - mu.attach(ram, 0, 0xFFFFFFFF); + { + ArchDef arch(archStr, num_cores, num_warps, num_threads); + Processor processor(arch); + ret = processor.run(imgFileName, riscv_test, showStats); + } - struct stat hello; - fstat(0, &hello); + SimPlatform::instance().finalize(); - std::vector> cores(num_cores); - for (int i = 0; i < num_cores; ++i) { - cores[i] = std::make_shared(arch, decoder, mu, i); - } - - bool running; - int exitcode = 0; - do { - running = false; - for (auto& core : cores) { - core->step(); - if (core->running()) { - running = true; - } - if (core->check_ebreak()) { - exitcode = core->getIRegValue(3); - running = false; - break; - } - } - } while (running); - - if (riscv_test) { - if (1 == exitcode) { - std::cout << "Passed." << std::endl; - exitcode = 0; - } else { - std::cout << "Failed." 
<< std::endl; - } - } else { - if (exitcode != 0) { - std::cout << "*** error: exitcode=" << exitcode << std::endl; - } - } - - return exitcode; + return ret; } diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp new file mode 100644 index 00000000..c377972d --- /dev/null +++ b/sim/simX/memsim.cpp @@ -0,0 +1,58 @@ +#include "memsim.h" +#include +#include +#include "constants.h" + +using namespace vortex; + +class MemSim::Impl { +private: + MemSim* simobject_; + std::vector> inputs_; + uint32_t latency_; + +public: + Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) + : simobject_(simobject) + , inputs_(num_banks) + , latency_(latency) + {} + + void handleMemRequest(const MemReq& mem_req, uint32_t port_id) { + inputs_.at(port_id).push(mem_req); + } + + void step(uint64_t /*cycle*/) { + for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) { + auto& queue = inputs_.at(i); + if (queue.empty()) + continue; + auto& entry = queue.front(); + if (!entry.write) { + MemRsp mem_rsp; + mem_rsp.tag = entry.tag; + simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); + } + queue.pop(); + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +MemSim::MemSim(const SimContext& ctx, + uint32_t num_banks, + uint32_t latency) + : SimObject(ctx, "MemSim") + , impl_(new Impl(this, num_banks, latency)) + , MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest}) + , MemRspPorts(num_banks, this) +{} + +MemSim::~MemSim() { + delete impl_; +} + +void MemSim::step(uint64_t cycle) { + impl_->step(cycle); +} \ No newline at end of file diff --git a/sim/simX/memsim.h b/sim/simX/memsim.h new file mode 100644 index 00000000..24d8e6ca --- /dev/null +++ b/sim/simX/memsim.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +namespace vortex { + +struct MemReq { + uint64_t addr; + uint32_t tag; + bool write; +}; + +struct MemRsp { + uint32_t tag; +}; + +class MemSim : public SimObject{ +private: + class Impl; + Impl* impl_; 
+ +public: + + MemSim(const SimContext& ctx, uint32_t num_inputs, uint32_t latency); + ~MemSim(); + + void step(uint64_t cycle); + + std::vector> MemReqPorts; + std::vector> MemRspPorts; +}; + +}; \ No newline at end of file diff --git a/sim/simX/pipeline.cpp b/sim/simX/pipeline.cpp deleted file mode 100644 index c54977a0..00000000 --- a/sim/simX/pipeline.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include "pipeline.h" - -using namespace vortex; - -namespace vortex { -std::ostream &operator<<(std::ostream &os, const Pipeline& pipeline) { - os << pipeline.name_ << ": valid=" << pipeline.valid << std::endl; - os << pipeline.name_ << ": stalled=" << pipeline.stalled << std::endl; - os << pipeline.name_ << ": stall_warp=" << pipeline.stall_warp << std::endl; - os << pipeline.name_ << ": wid=" << pipeline.wid << std::endl; - os << pipeline.name_ << ": PC=" << std::hex << pipeline.PC << std::endl; - os << pipeline.name_ << ": used_iregs=" << pipeline.used_iregs << std::endl; - os << pipeline.name_ << ": used_fregs=" << pipeline.used_fregs << std::endl; - os << pipeline.name_ << ": used_vregs=" << pipeline.used_vregs << std::endl; - return os; -} -} - -Pipeline::Pipeline(const char* name) -: name_(name) { - this->clear(); -} - -void Pipeline::clear() { - valid = false; - stalled = false; - stall_warp = false; - wid = 0; - PC = 0; - used_iregs.reset(); - used_fregs.reset(); - used_vregs.reset(); -} - -bool Pipeline::enter(Pipeline *drain) { - if (drain) { - if (drain->stalled) { - this->stalled = true; - return false; - } - drain->valid = false; - } - this->stalled = false; - if (!this->valid) - return false; - return true; -} - -void Pipeline::next(Pipeline *drain) { - if (drain) { - drain->valid = this->valid; - drain->stalled = this->stalled; - drain->stall_warp = this->stall_warp; - drain->wid = this->wid; - drain->PC = this->PC; - drain->rdest = this->rdest; - drain->rdest_type = this->rdest_type; - drain->used_iregs = this->used_iregs; - drain->used_fregs = 
this->used_fregs; - drain->used_vregs = this->used_vregs; - } -} \ No newline at end of file diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index f8899a63..82735c2a 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -2,47 +2,75 @@ #pragma once #include +#include #include #include "types.h" #include "debug.h" namespace vortex { -class Instr; - -class Pipeline { -public: - Pipeline(const char* name); - - void clear(); - - bool enter(Pipeline* drain); - - void next(Pipeline* drain); - - //-- - bool valid; - - //-- - bool stalled; - bool stall_warp; - +struct pipeline_state_t { //-- - int wid; - Word PC; + int wid; + ThreadMask tmask; + Word PC; //-- - int rdest_type; - int rdest; - RegMask used_iregs; - RegMask used_fregs; - RegMask used_vregs; + bool stall_warp; + int rdest_type; + int rdest; + RegMask used_iregs; + RegMask used_fregs; + RegMask used_vregs; -private: + //- + ExeType exe_type; + std::vector mem_addrs; + + //-- + union { + struct { + uint8_t load : 1; + uint8_t store: 1; + uint8_t fence : 1; + uint8_t prefetch: 1; + } lsu; + struct { + AluType type; + } alu; + struct { + FpuType type; + } fpu; + struct { + GpuType type; + } gpu; + }; + // stats + uint64_t icache_latency; + uint64_t dcache_latency; +}; + +class PipelineStage : public Queue { +protected: const char* name_; + friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&); - friend std::ostream &operator<<(std::ostream &, const Pipeline&); -}; +public: + PipelineStage(const char* name = nullptr) + : name_(name) + {} +}; + +inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { + os << "stall_warp=" << state.stall_warp; + os << ", wid=" << state.wid; + os << ", PC=" << std::hex << state.PC; + os << ", used_iregs=" << state.used_iregs; + os << ", used_fregs=" << state.used_fregs; + os << ", used_vregs=" << state.used_vregs; + os << std::endl; + return os; +} } \ No newline at end of file diff --git a/sim/simX/processor.h 
b/sim/simX/processor.h new file mode 100644 index 00000000..50671953 --- /dev/null +++ b/sim/simX/processor.h @@ -0,0 +1,189 @@ +#pragma once + +#include "constants.h" +#include "debug.h" +#include "types.h" +#include "core.h" + +namespace vortex { + +class Processor { +private: + ArchDef arch_; + Decoder decoder_; + MemoryUnit mu_; + RAM ram_; + std::vector cores_; + std::vector l2caches_; + std::vector::Ptr> l2_mem_switches_; + Cache::Ptr l3cache_; + Switch::Ptr l3_mem_switch_; + MemSim::Ptr memsim_; + +public: + Processor(const ArchDef& arch) + : arch_(arch) + , decoder_(arch) + , mu_(0, arch.wsize(), true) + , ram_((1<<12), (1<<20)) + , cores_(arch.num_cores()) + , l2caches_(NUM_CLUSTERS) + , l2_mem_switches_(NUM_CLUSTERS) + { + uint32_t num_cores = arch.num_cores(); + uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; + + // bind RAM to memory unit + mu_.attach(ram_, 0, 0xFFFFFFFF); + + // create cores + for (uint32_t i = 0; i < num_cores; ++i) { + cores_.at(i) = Core::Create(arch, decoder_, mu_, i); + } + + // connect memory sub-systen + memsim_ = MemSim::Create(1, MEM_LATENCY); + std::vector*> mem_req_ports(1); + std::vector*> mem_rsp_ports(1); + mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); + mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); + + if (L3_ENABLE) { + l3cache_ = Cache::Create("l3cache", CacheConfig{ + log2ceil(L3_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L3_NUM_BANKS, // number of banks + L3_NUM_PORTS, // number of ports + NUM_CLUSTERS, // request size + true, // write-throught + 0, // victim size + L3_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); + l3cache_->MemReqPort.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); + mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); 
+ } + } else if (NUM_CLUSTERS > 1) { + l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); + mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); + l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); + mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); + } + } + + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + if (L2_ENABLE) { + auto& l2cache = l2caches_.at(i); + l2cache = Cache::Create("l2cache", CacheConfig{ + log2ceil(L2_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L2_NUM_BANKS, // number of banks + L2_NUM_PORTS, // number of ports + NUM_CORES, // request size + true, // write-throught + 0, // victim size + L2_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); + l2cache->MemReqPort.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + } + } else if (cores_per_cluster > 1) { + auto& l2_mem_switch = l2_mem_switches_.at(i); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + } + } + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + auto& core = cores_.at((i * NUM_CLUSTERS) + j); + mem_rsp_ports.at(i)->bind(&core->MemRspPort); + 
core->MemReqPort.bind(mem_req_ports.at(j)); + } + } + } + + ~Processor() {} + + int run(const std::string& program, bool riscv_test, bool /*showStats*/) { + { + std::string program_ext(fileExtension(program.c_str())); + if (program_ext == "bin") { + ram_.loadBinImage(program.c_str(), STARTUP_ADDR); + } else if (program_ext == "hex") { + ram_.loadHexImage(program.c_str()); + } else { + std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; + return -1; + } + } + + bool running; + int exitcode = 0; + do { + SimPlatform::instance().step(); + + running = false; + for (auto& core : cores_) { + if (core->running()) { + running = true; + } + if (core->check_ebreak()) { + exitcode = core->getIRegValue(3); + running = false; + break; + } + } + } while (running); + + // get error status + + if (riscv_test) { + if (1 == exitcode) { + std::cout << "Passed." << std::endl; + exitcode = 0; + } else { + std::cout << "Failed." << std::endl; + } + } else { + if (exitcode != 0) { + std::cout << "*** error: exitcode=" << exitcode << std::endl; + } + } + + return exitcode; + } + +}; + +} \ No newline at end of file diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h new file mode 100644 index 00000000..0e0e0577 --- /dev/null +++ b/sim/simX/scoreboard.h @@ -0,0 +1,71 @@ +#pragma once + +#include "pipeline.h" +#include + +namespace vortex { + +class Scoreboard { +private: + std::vector in_use_iregs_; + std::vector in_use_fregs_; + std::vector in_use_vregs_; + +public: + Scoreboard(const ArchDef &arch) + : in_use_iregs_(arch.num_warps()) + , in_use_fregs_(arch.num_warps()) + , in_use_vregs_(arch.num_warps()) + { + for (int w = 0; w < arch.num_warps(); ++w) { + in_use_iregs_.at(w).reset(); + in_use_fregs_.at(w).reset(); + in_use_vregs_.at(w).reset(); + } + } + + bool in_use(const pipeline_state_t& state) const { + return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0 + || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0 + || (state.used_vregs & 
// Minimal FIFO wrapper around std::queue used for pipeline stages and
// execution-unit input/output buffers.
template <typename T>
class Queue {
protected:
  std::queue<T> queue_;

public:
  Queue() {}

  // True when no element is stored.
  bool empty() const {
    return queue_.empty();
  }

  // Oldest element (the next one to leave). The queue must not be empty.
  const T& top() const {
    return queue_.front();
  }

  // Appends a copy of `value` at the back.
  void push(const T& value) {
    queue_.push(value);
  }

  // Drops the oldest element. The queue must not be empty.
  void pop() {
    queue_.pop();
  }

  // Non-blocking pop: copies the oldest element into *value and removes it.
  // Returns false, leaving *value untouched, when the queue is empty.
  bool try_pop(T* value) {
    if (queue_.empty())
      return false;
    *value = queue_.front();
    queue_.pop();
    return true;
  }
};
// Fixed-size slot table. allocate() stores a value in the first free slot
// and returns the slot index, which callers use as a tag to find the value
// again (at/contains) and to free the slot (release/remove).
// T must be default-constructible: all slots are created up front.
template <typename T>
class HashTable {
private:
  std::vector<std::pair<bool, T>> entries_; // {occupied, value} per slot
  uint32_t size_;                           // number of occupied slots

public:
  // Creates a table with `size` free slots.
  HashTable(uint32_t size)
    : entries_(size)
    , size_(0)
  {}

  // True when no slot is occupied.
  bool empty() const {
    return (0 == size_);
  }

  // True when every slot is occupied.
  bool full() const {
    return (size_ == entries_.size());
  }

  // True when slot `index` is occupied. Throws std::out_of_range for an
  // index beyond the table (std::vector::at).
  bool contains(uint32_t index) const {
    return entries_.at(index).first;
  }

  // Value stored in slot `index`; the slot must be occupied.
  const T& at(uint32_t index) const {
    auto& entry = entries_.at(index);
    assert(entry.first);
    return entry.second;
  }

  // Mutable access to the value stored in slot `index`; the slot must be
  // occupied. The reference stays valid after release() because slots are
  // never moved or destroyed.
  T& at(uint32_t index) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    return entry.second;
  }

  // Stores `value` in the lowest free slot and returns its index.
  // Returns (uint32_t)-1 when the table is full.
  uint32_t allocate(const T& value) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& entry = entries_.at(i);
      if (!entry.first) {
        entry.first = true;
        entry.second = value;
        ++size_;
        return i;
      }
    }
    return -1;
  }

  // Frees slot `index`; the slot must be occupied.
  void release(uint32_t index) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    entry.first = false;
    --size_; // keep empty()/full() in sync with the slot flags
  }

  // Copies the value of slot `index` into *value and frees the slot; the
  // slot must be occupied.
  void remove(uint32_t index, T* value) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    *value = entry.second;
    entry.first = false;
    --size_; // keep empty()/full() in sync with the slot flags
  }
};
SimObject>(ctx, name) + , type_(type) + , cur_req_(num_inputs) + , delay_(delay) + , cursor_(0) + , ReqIn(num_inputs, {this, this, &Switch::handleIncomingRequest}) + , ReqOut(this) + , RspIn(this, this, &Switch::handleIncomingResponse) + , RspOut(num_inputs, this) + { + assert(delay_ != 0); + assert(num_inputs <= MaxInputs); + } + + void step(uint64_t /*cycle*/) { + if (cur_req_.valid.any()) { + reqs_.push(cur_req_); + cur_req_.valid.reset(); + } + + while (!reqs_.empty()) { + auto& entry = reqs_.front(); + bool found = false; + for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) { + auto j = (cursor_ + i) % n; + if (entry.valid.test(j)) { + auto& req = entry.data.at(j); + addr_table_[req.tag] = j; + ReqOut.send(req, delay_); + entry.valid.reset(j); + this->update_cursor(j); + found = true; + break; + } + } + if (found) + break; + reqs_.pop(); + } + + if (!rsps_.empty()) { + auto& rsp = rsps_.front(); + auto port_id = addr_table_.at(rsp.tag); + RspOut.at(port_id).send(rsp, 1); + rsps_.pop(); + } + } + + void update_cursor(uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + cursor_ = grant + 1; + } + } + + std::vector> ReqIn; + MasterPort ReqOut; + SlavePort RspIn; + std::vector> RspOut; +}; + } \ No newline at end of file diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index a505fe5c..0c989d0c 100644 --- a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -12,25 +12,21 @@ using namespace vortex; Warp::Warp(Core *core, Word id) : id_(id) - , core_(core) { + , core_(core) + , active_(false) + , PC_(STARTUP_ADDR) + , tmask_(0) { iRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); fRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); - this->clear(); } -void Warp::clear() { - PC_ = STARTUP_ADDR; - tmask_.reset(); - active_ = false; -} - -void Warp::step(Pipeline *pipeline) { +void 
Warp::eval(pipeline_state_t *pipeline_state) { assert(tmask_.any()); DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask="); for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) - DPN(2, tmask_[n-i-1]); + DPN(2, tmask_.test(n-i-1)); DPN(2, "\n"); /* Fetch and decode. */ @@ -38,55 +34,24 @@ void Warp::step(Pipeline *pipeline) { Word fetched = core_->icache_fetch(PC_); auto instr = core_->decoder().decode(fetched, PC_); - // Update pipeline - pipeline->valid = true; - pipeline->PC = PC_; - pipeline->rdest = instr->getRDest(); - pipeline->rdest_type = instr->getRDType(); - pipeline->used_iregs.reset(); - pipeline->used_fregs.reset(); - pipeline->used_vregs.reset(); - - switch (pipeline->rdest_type) { - case 1: - pipeline->used_iregs[pipeline->rdest] = 1; - break; - case 2: - pipeline->used_fregs[pipeline->rdest] = 1; - break; - case 3: - pipeline->used_vregs[pipeline->rdest] = 1; - break; - default: - break; - } - - for (int i = 0; i < instr->getNRSrc(); ++i) { - int type = instr->getRSType(i); - int reg = instr->getRSrc(i); - switch (type) { - case 1: - pipeline->used_iregs[reg] = 1; - break; - case 2: - pipeline->used_fregs[reg] = 1; - break; - case 3: - pipeline->used_vregs[reg] = 1; - break; - default: - break; - } - } + // Update state + pipeline_state->wid = id_; + pipeline_state->PC = PC_; + pipeline_state->tmask = tmask_; + pipeline_state->rdest = instr->getRDest(); + pipeline_state->rdest_type = instr->getRDType(); + pipeline_state->used_iregs.reset(); + pipeline_state->used_fregs.reset(); + pipeline_state->used_vregs.reset(); // Execute - this->execute(*instr, pipeline); + this->execute(*instr, pipeline_state); D(4, "Register state:"); for (int i = 0; i < core_->arch().num_regs(); ++i) { DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); for (int j = 0; j < core_->arch().num_threads(); ++j) { - DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_[j][i] << std::setfill(' ') << ' '); + 
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' '); } DPN(4, std::endl); } diff --git a/sim/simX/warp.h b/sim/simX/warp.h index 7473d858..99b372ca 100644 --- a/sim/simX/warp.h +++ b/sim/simX/warp.h @@ -9,7 +9,7 @@ namespace vortex { class Core; class Instr; -class Pipeline; +class pipeline_state_t; struct DomStackEntry { DomStackEntry(const ThreadMask &tmask, Word PC) : tmask(tmask) @@ -41,8 +41,6 @@ struct vtype { class Warp { public: Warp(Core *core, Word id); - - void clear(); bool active() const { return active_; @@ -71,7 +69,7 @@ public: } void setTmask(size_t index, bool value) { - tmask_[index] = value; + tmask_.set(index, value); active_ = tmask_.any(); } @@ -82,18 +80,18 @@ public: } Word getIRegValue(int reg) const { - return iRegFile_[0][reg]; + return iRegFile_.at(0).at(reg); } - void step(Pipeline *); + void eval(pipeline_state_t *); private: - void execute(const Instr &instr, Pipeline *); + void execute(const Instr &instr, pipeline_state_t *pipeline_state); Word id_; - bool active_; Core *core_; + bool active_; Word PC_; ThreadMask tmask_;