cumulative fixes, RTL uuid trace, texture unit fixes, simx timing fixes

This commit is contained in:
Blaise Tine
2021-11-30 07:08:15 -05:00
parent b995843a5b
commit 41d7e6c63a
79 changed files with 2148 additions and 1372 deletions

47
sim/common/mempool.h Normal file
View File

@@ -0,0 +1,47 @@
#pragma once
#include <cstdint>
#include <new>
#include <stack>
#include <utility>
/// Bounded free-list allocator for raw storage blocks of sizeof(T) bytes.
///
/// allocate() returns a previously released block when one is cached and
/// otherwise obtains fresh storage from the global ::operator new.
/// deallocate() keeps the block for reuse while fewer than max_size blocks
/// are cached; once the cache is full the block goes back to the global heap.
/// The pool manages raw memory only: it never constructs or destroys a T.
template <typename T>
class MemoryPool {
public:
  /// @param max_size maximum number of released blocks kept for reuse.
  MemoryPool(uint32_t max_size) : max_size_(max_size) {}

  // The free list holds owning raw pointers; a copy would free them twice.
  // (These were already implicitly deleted by the user-declared move
  // constructor; spelling them out documents the intent.)
  MemoryPool(const MemoryPool&) = delete;
  MemoryPool& operator=(const MemoryPool&) = delete;

  /// Takes over the cached blocks and the capacity limit of `other`.
  MemoryPool(MemoryPool&& other)
    : free_list_(std::move(other.free_list_))
    , max_size_(other.max_size_)  // previously left uninitialized
  {
    // A moved-from container is only "valid but unspecified"; make sure the
    // source holds no pointers so its destructor cannot release a block
    // that this pool now owns.
    while (!other.free_list_.empty()) {
      other.free_list_.pop();
    }
  }

  /// Returns every cached block to the global heap.
  ~MemoryPool() {
    this->flush();
  }

  /// @return uninitialized storage of at least sizeof(T) bytes; a cached
  ///         block is preferred over a new heap allocation.
  void* allocate() {
    if (free_list_.empty()) {
      return ::operator new(sizeof(T));
    }
    void* mem = free_list_.top();
    free_list_.pop();
    return mem;
  }

  /// Releases a block previously obtained from allocate().
  /// A null pointer is ignored, mirroring ::operator delete, so that a null
  /// can never be cached and later handed out as storage.
  void deallocate(void* object) {
    if (object == nullptr) {
      return;
    }
    if (free_list_.size() < max_size_) {
      free_list_.push(object);
    } else {
      ::operator delete(object);
    }
  }

  /// Frees all cached blocks; the pool remains usable afterwards.
  void flush() {
    while (!free_list_.empty()) {
      ::operator delete(free_list_.top());
      free_list_.pop();
    }
  }

private:
  std::stack<void*> free_list_;  // released blocks awaiting reuse
  uint32_t          max_size_;   // upper bound on free_list_.size()
};

View File

@@ -7,6 +7,7 @@
#include <list>
#include <queue>
#include <assert.h>
#include "mempool.h"
class SimObjectBase;
@@ -20,37 +21,14 @@ public:
return module_;
}
SimPortBase* peer() const {
return peer_;
}
bool connected() const {
return (peer_ != nullptr);
}
protected:
SimPortBase(SimObjectBase* module)
: module_(module)
, peer_(nullptr)
{}
void connect(SimPortBase* peer) {
assert(peer_ == nullptr);
peer_ = peer;
}
void disconnect() {
assert(peer_ == nullptr);
peer_ = nullptr;
}
SimPortBase& operator=(const SimPortBase&) = delete;
SimObjectBase* module_;
SimPortBase* peer_;
template <typename U> friend class SlavePort;
template <typename U> friend class MasterPort;
};
///////////////////////////////////////////////////////////////////////////////
@@ -58,72 +36,92 @@ protected:
template <typename Pkt>
class SimPort : public SimPortBase {
public:
void send(const Pkt& pkt, uint64_t delay) const;
typedef std::function<void (const Pkt&, uint64_t)> TxCallback;
SimPort(SimObjectBase* module)
: SimPortBase(module)
, peer_(nullptr)
, tx_cb_(nullptr)
{}
void send(const Pkt& pkt, uint64_t delay = 1) const;
void bind(SimPort<Pkt>* peer) {
this->connect(peer);
assert(peer_ == nullptr);
peer_ = peer;
}
void unbind() {
this->disconnect();
assert(peer_ == nullptr);
peer_ = nullptr;
}
bool connected() const {
return (peer_ != nullptr);
}
SimPort* peer() const {
return peer_;
}
bool empty() const {
return queue_.empty();
}
const Pkt& top() const {
const Pkt& front() const {
return queue_.front();
}
Pkt& top() {
return queue_.front();
Pkt& front() {
return queue_.front().pkt;
}
void pop() {
const Pkt& back() const {
return queue_.back();
}
Pkt& back() {
return queue_.back().pkt;
}
uint64_t pop() {
auto cycle = queue_.front().cycle;
queue_.pop();
}
return cycle;
}
void tx_callback(const TxCallback& callback) {
tx_cb_ = callback;
}
protected:
SimPort(SimObjectBase* module)
: SimPortBase(module)
{}
struct timed_pkt_t {
Pkt pkt;
uint64_t cycle;
};
void push(const Pkt& data) {
queue_.push(data);
std::queue<timed_pkt_t> queue_;
SimPort* peer_;
TxCallback tx_cb_;
void push(const Pkt& data, uint64_t cycle) {
if (tx_cb_) {
tx_cb_(data, cycle);
}
if (peer_) {
peer_->push(data, cycle);
} else {
queue_.push({data, cycle});
}
}
SimPort& operator=(const SimPort&) = delete;
std::queue<Pkt> queue_;
template <typename U> friend class SimPortEvent;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Pkt>
class SlavePort : public SimPort<Pkt> {
public:
SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}
protected:
SlavePort& operator=(const SlavePort&) = delete;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Pkt>
class MasterPort : public SimPort<Pkt> {
public:
MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {}
protected:
MasterPort& operator=(const MasterPort&) = delete;
};
///////////////////////////////////////////////////////////////////////////////
class SimEventBase {
public:
typedef std::shared_ptr<SimEventBase> Ptr;
@@ -132,14 +130,14 @@ public:
virtual void fire() const = 0;
bool step() {
return (0 == --delay_);
uint64_t time() const {
return time_;
}
protected:
SimEventBase(uint64_t delay) : delay_(delay) {}
SimEventBase(uint64_t time) : time_(time) {}
uint64_t delay_;
uint64_t time_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -147,26 +145,34 @@ protected:
template <typename Pkt>
class SimCallEvent : public SimEventBase {
public:
typedef std::function<void (const Pkt&)> Func;
template <typename... Args>
static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
return std::make_shared<SimCallEvent>(func, pkt, delay);
}
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay)
: SimEventBase(delay)
, func_(func)
, pkt_(pkt)
{}
void fire() const override {
func_(pkt_);
}
protected:
typedef std::function<void (const Pkt&)> Func;
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time)
: SimEventBase(time)
, func_(func)
, pkt_(pkt)
{}
void* operator new(size_t /*size*/) {
return allocator().allocate();
}
void operator delete(void* ptr) {
allocator().deallocate(ptr);
}
protected:
Func func_;
Pkt pkt_;
Pkt pkt_;
static MemoryPool<SimCallEvent<Pkt>>& allocator() {
static MemoryPool<SimCallEvent<Pkt>> instance(64);
return instance;
}
};
///////////////////////////////////////////////////////////////////////////////
@@ -174,23 +180,32 @@ protected:
template <typename Pkt>
class SimPortEvent : public SimEventBase {
public:
static Ptr Create(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
return std::make_shared<SimPortEvent>(port, pkt, delay);
void fire() const override {
const_cast<SimPort<Pkt>*>(port_)->push(pkt_, time_);
}
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay)
: SimEventBase(delay)
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t time)
: SimEventBase(time)
, port_(port)
, pkt_(pkt)
{}
void fire() const override {
const_cast<SimPort<Pkt>*>(port_)->push(pkt_);
void* operator new(size_t /*size*/) {
return allocator().allocate();
}
private:
void operator delete(void* ptr) {
allocator().deallocate(ptr);
}
protected:
const SimPort<Pkt>* port_;
Pkt pkt_;
static MemoryPool<SimPortEvent<Pkt>>& allocator() {
static MemoryPool<SimPortEvent<Pkt>> instance(64);
return instance;
}
};
///////////////////////////////////////////////////////////////////////////////
@@ -203,24 +218,17 @@ public:
virtual ~SimObjectBase() {}
template <typename T, typename Pkt>
void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);
const std::string& name() const {
return name_;
}
protected:
virtual void step(uint64_t cycle) = 0;
SimObjectBase(const SimContext& ctx, const char* name);
protected:
SimObjectBase(const SimContext& ctx, const char* name);
private:
std::string name_;
friend class SimPlatform;
friend class SimPortBase;
};
///////////////////////////////////////////////////////////////////////////////
@@ -228,14 +236,16 @@ private:
template <typename Impl>
class SimObject : public SimObjectBase {
public:
typedef std::shared_ptr<Impl> Ptr;
typedef std::shared_ptr<Impl> Ptr;
template <typename... Args>
static Ptr Create(Args&&... args);
protected:
SimObject(const SimContext& ctx, const char* name) : SimObjectBase(ctx, name) {}
SimObject(const SimContext& ctx, const char* name)
: SimObjectBase(ctx, name)
{}
void step(uint64_t cycle) override {
this->impl().step(cycle);
@@ -255,8 +265,8 @@ private:
class SimContext {
private:
SimContext() {}
template <typename Impl> template <typename... Args>
friend typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args);
friend class SimPlatform;
};
///////////////////////////////////////////////////////////////////////////////
@@ -281,25 +291,19 @@ public:
instance().clear();
}
void register_object(const SimObjectBase::Ptr& obj) {
template <typename Impl, typename... Args>
typename SimObject<Impl>::Ptr CreateObject(Args&&... args) {
auto obj = std::make_shared<Impl>(SimContext{}, std::forward<Args>(args)...);
objects_.push_back(obj);
return obj;
}
template <typename Pkt>
void schedule(const typename SimCallEvent<Pkt>::Func& callback,
void schedule(const typename SimCallEvent<Pkt>::Func& callback,
const Pkt& pkt,
uint64_t delay) {
auto evt = SimCallEvent<Pkt>::Create(callback, pkt, delay);
assert(delay != 0);
events_.emplace_back(evt);
}
template <typename Pkt>
void schedule(const SimPort<Pkt>* port,
const Pkt& pkt,
uint64_t delay) {
auto evt = SimPortEvent<Pkt>::Create(port, pkt, delay);
assert(delay != 0);
auto evt = std::make_shared<SimCallEvent<Pkt>>(callback, pkt, cycles_ + delay);
events_.emplace_back(evt);
}
@@ -309,7 +313,7 @@ public:
auto evt_it_end = events_.end();
while (evt_it != evt_it_end) {
auto& event = *evt_it;
if (event->step()) {
if (cycles_ >= event->time()) {
event->fire();
evt_it = events_.erase(evt_it);
} else {
@@ -341,9 +345,19 @@ private:
events_.clear();
}
template <typename Pkt>
void schedule(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
assert(delay != 0);
auto evt = SimEventBase::Ptr(new SimPortEvent<Pkt>(port, pkt, cycles_ + delay));
events_.emplace_back(evt);
}
std::vector<SimObjectBase::Ptr> objects_;
std::list<SimEventBase::Ptr> events_;
uint64_t cycles_;
template <typename U> friend class SimPort;
friend class SimObjectBase;
};
///////////////////////////////////////////////////////////////////////////////
@@ -355,22 +369,14 @@ inline SimObjectBase::SimObjectBase(const SimContext&, const char* name)
template <typename Impl>
template <typename... Args>
typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
auto obj = std::make_shared<Impl>(SimContext{}, std::forward<Args>(args)...);
SimPlatform::instance().register_object(obj);
return obj;
return SimPlatform::instance().CreateObject<Impl>(std::forward<Args>(args)...);
}
template <typename Pkt>
void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
if (peer_) {
if (peer_ && !tx_cb_) {
reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);
} else {
SimPlatform::instance().schedule(this, pkt, delay);
}
}
template <typename T, typename Pkt>
void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) {
auto callback = std::bind(entry, obj, std::placeholders::_1);
SimPlatform::instance().schedule(callback, pkt, delay);
}

View File

@@ -11,20 +11,20 @@ namespace vortex {
class ArchDef {
private:
int num_cores_;
int num_warps_;
int num_threads_;
int wsize_;
int vsize_;
int num_regs_;
int num_csrs_;
int num_barriers_;
uint16_t num_cores_;
uint16_t num_warps_;
uint16_t num_threads_;
uint16_t wsize_;
uint16_t vsize_;
uint16_t num_regs_;
uint16_t num_csrs_;
uint16_t num_barriers_;
public:
ArchDef(const std::string& /*arch*/,
int num_cores,
int num_warps,
int num_threads)
uint16_t num_cores,
uint16_t num_warps,
uint16_t num_threads)
: num_cores_(num_cores)
, num_warps_(num_warps)
, num_threads_(num_threads)
@@ -35,35 +35,35 @@ public:
, num_barriers_(NUM_BARRIERS)
{}
int wsize() const {
uint16_t wsize() const {
return wsize_;
}
int vsize() const {
uint16_t vsize() const {
return vsize_;
}
int num_regs() const {
uint16_t num_regs() const {
return num_regs_;
}
int num_csrs() const {
uint16_t num_csrs() const {
return num_csrs_;
}
int num_barriers() const {
uint16_t num_barriers() const {
return num_barriers_;
}
int num_threads() const {
uint16_t num_threads() const {
return num_threads_;
}
int num_warps() const {
uint16_t num_warps() const {
return num_warps_;
}
int num_cores() const {
uint16_t num_cores() const {
return num_cores_;
}
};

View File

@@ -35,7 +35,7 @@ public:
CommandLineArg(l, ht), arg_(x) {}
int read(int argc, char **argv) {
__unused(argc);
__unused (argc);
std::istringstream iss(argv[1]);
iss >> arg_;
return 1;
@@ -53,7 +53,7 @@ public:
CommandLineArg(l, ht), arg_(x) { arg_ = false; }
int read(int argc, char **argv) {
__unused(argc, argv);
__unused (argc, argv);
arg_ = true;
return 0;
}

View File

@@ -27,7 +27,7 @@ struct params_t {
uint32_t tag_select_addr_start;
uint32_t tag_select_addr_end;
params_t(const CacheConfig& config) {
params_t(const Cache::Config& config) {
uint32_t bank_bits = log2ceil(config.num_banks);
uint32_t offset_bits = config.B - config.W;
uint32_t log2_bank_size = config.C - bank_bits;
@@ -214,7 +214,7 @@ struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const CacheConfig& config,
bank_t(const Cache::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size)
@@ -226,22 +226,30 @@ struct bank_t {
class Cache::Impl {
private:
Cache* const simobject_;
CacheConfig config_;
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr mem_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<MasterPort<MemReq>> mem_req_ports_;
std::vector<SlavePort<MemRsp>> mem_rsp_ports_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
uint32_t flush_cycles_;
public:
Impl(Cache* simobject, const CacheConfig& config)
Impl(Cache* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
, pending_read_reqs_(0)
, pending_write_reqs_(0)
, pending_fill_reqs_(0)
{
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
@@ -259,13 +267,29 @@ public:
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
}
// calculate tag flush cycles
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
void step(uint64_t /*cycle*/) {
void step(uint64_t cycle) {
// wait on flush cycles
if (flush_cycles_ != 0) {
--flush_cycles_;
return;
}
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
// handle bypass responses
auto& bypass_port = bypass_switch_->RspOut.at(1);
if (!bypass_port.empty()) {
auto& mem_rsp = bypass_port.top();
auto& mem_rsp = bypass_port.front();
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp(tag);
@@ -287,7 +311,7 @@ public:
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (!mem_rsp_port.empty()) {
auto& mem_rsp = mem_rsp_port.top();
auto& mem_rsp = mem_rsp_port.front();
this->processMemoryFill(bank_id, mem_rsp.tag);
pending_fill_req.at(bank_id) = true;
mem_rsp_port.pop();
@@ -300,7 +324,7 @@ public:
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.top();
auto& core_req = core_req_port.front();
// check cache bypassing
if (core_req.is_io) {
@@ -345,7 +369,7 @@ public:
// check MSHR capacity if read or writeback
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
// stall
++perf_stats_.mshr_stalls;
continue;
}
@@ -356,7 +380,7 @@ public:
|| pipeline_req.set_id != set_id
|| pipeline_req.tag != tag
|| pipeline_req.infos[port_id].valid) {
// stall
++perf_stats_.bank_stalls;
continue;
}
// update pending request infos
@@ -365,8 +389,15 @@ public:
// schedule new request
pipeline_req = bank_req;
}
if (core_req.write)
++perf_stats_.writes;
else
++perf_stats_.reads;
// remove request
core_req_port.pop();
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (cycle - time);
}
// process active request
@@ -393,6 +424,7 @@ public:
auto& block = set.blocks.at(entry.block_id);
block.valid = true;
block.tag = entry.tag;
--pending_fill_reqs_;
}
void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
@@ -438,7 +470,7 @@ public:
if (hit) {
//
// MISS handling
// Hit handling
//
if (pipeline_req.write) {
// handle write hit
@@ -462,8 +494,13 @@ public:
}
} else {
//
// MISS handling
//
// Miss handling
//
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_block && !config_.write_through) {
// write back dirty block
auto& repl_block = set.blocks.at(repl_block_id);
@@ -472,6 +509,7 @@ public:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
mem_req.write = true;
mem_req_ports_.at(bank_id).send(mem_req, 1);
++perf_stats_.evictions;
}
}
@@ -500,9 +538,10 @@ public:
if (pending == -1) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = pipeline_req.write;
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
++pending_fill_reqs_;
}
}
}
@@ -513,7 +552,7 @@ public:
///////////////////////////////////////////////////////////////////////////////
Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config)
Cache::Cache(const SimContext& ctx, const char* name, const Config& config)
: SimObject<Cache>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
@@ -528,4 +567,8 @@ Cache::~Cache() {
void Cache::step(uint64_t cycle) {
impl_->step(cycle);
}
const Cache::PerfStats& Cache::perf_stats() const {
return impl_->perf_stats();
}

View File

@@ -5,33 +5,58 @@
namespace vortex {
struct CacheConfig {
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t W; // log2 word size
uint8_t A; // log2 associativity
uint8_t addr_width; // word address bits
uint8_t num_banks; // number of banks
uint8_t ports_per_bank; // number of ports per bank
uint8_t num_inputs; // number of inputs
bool write_through; // is write-through
bool write_reponse; // enable write response
uint16_t victim_size; // victim cache size
uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency
};
class Cache : public SimObject<Cache> {
class Cache : public SimObject<Cache> {
public:
Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
struct Config {
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t W; // log2 word size
uint8_t A; // log2 associativity
uint8_t addr_width; // word address bits
uint8_t num_banks; // number of banks
uint8_t ports_per_bank; // number of ports per bank
uint8_t num_inputs; // number of inputs
bool write_through; // is write-through
bool write_reponse; // enable write response
uint16_t victim_size; // victim cache size
uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency
};
struct PerfStats {
uint64_t reads;
uint64_t writes;
uint64_t read_misses;
uint64_t write_misses;
uint64_t evictions;
uint64_t pipeline_stalls;
uint64_t bank_stalls;
uint64_t mshr_stalls;
uint64_t mem_latency;
PerfStats()
: reads(0)
, writes(0)
, read_misses(0)
, write_misses(0)
, evictions(0)
, pipeline_stalls(0)
, bank_stalls(0)
, mshr_stalls(0)
, mem_latency(0)
{}
};
std::vector<SimPort<MemReq>> CoreReqPorts;
std::vector<SimPort<MemRsp>> CoreRspPorts;
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
Cache(const SimContext& ctx, const char* name, const Config& config);
~Cache();
void step(uint64_t cycle);
std::vector<SlavePort<MemReq>> CoreReqPorts;
std::vector<MasterPort<MemRsp>> CoreRspPorts;
MasterPort<MemReq> MemReqPort;
SlavePort<MemRsp> MemRspPort;
const PerfStats& perf_stats() const;
private:
class Impl;

View File

@@ -3,14 +3,14 @@
#include "types.h"
#ifndef MEM_LATENCY
#define MEM_LATENCY 18
#define MEM_LATENCY 24
#endif
namespace vortex {
struct Constants {
enum Constants {
static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE;
SMEM_BANK_OFFSET = log2ceil(sizeof(Word)) + log2ceil(STACK_SIZE / sizeof(Word)),
};

View File

@@ -9,16 +9,18 @@
#include "decode.h"
#include "core.h"
#include "debug.h"
#include "constants.h"
using namespace vortex;
Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
: SimObject(ctx, "Core")
, MemRspPort(this)
, MemReqPort(this)
, id_(id)
, arch_(arch)
, decoder_(arch)
, mmu_(0, arch.wsize(), true)
, shared_mem_(4096)
, tex_units_(NUM_TEX_UNITS, this)
, warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0)
@@ -27,7 +29,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, exe_units_((int)ExeType::MAX)
, icache_(Cache::Create("Icache", CacheConfig{
, icache_(Cache::Create("Icache", Cache::Config{
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
@@ -42,7 +44,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
NUM_WARPS, // mshr
2, // pipeline latency
}))
, dcache_(Cache::Create("Dcache", CacheConfig{
, dcache_(Cache::Create("Dcache", Cache::Config{
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
@@ -55,37 +57,41 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
false, // write response
0, // victim size
DCACHE_MSHR_SIZE, // mshr
2, // pipeline latency
4, // pipeline latency
}))
, shared_mem_(SharedMem::Create("sharedmem", SharedMem::Config{
arch.num_threads(),
arch.num_threads(),
Constants::SMEM_BANK_OFFSET,
1,
false
}))
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
, dcache_switch_(arch.num_threads())
, fetch_stage_("fetch")
, decode_stage_("decode")
, issue_stage_("issue")
, execute_stage_("execute")
, commit_stage_("writeback")
, fetch_latch_("fetch")
, decode_latch_("decode")
, pending_icache_(arch_.num_warps())
, active_warps_(1)
, stalled_warps_(0)
, last_schedule_wid_(0)
, issued_instrs_(0)
, committed_instrs_(0)
, csr_tex_unit_(0)
, ecall_(false)
, ebreak_(false)
, stats_insts_(0)
, MemRspPort(this)
, MemReqPort(this)
, perf_mem_pending_reads_(0)
{
for (int i = 0; i < arch_.num_warps(); ++i) {
warps_.at(i) = std::make_shared<Warp>(this, i);
}
// register execute units
exe_units_.at((int)ExeType::NOP) = std::make_shared<NopUnit>(this);
exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);
exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().CreateObject<NopUnit>(this);
exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().CreateObject<AluUnit>(this);
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().CreateObject<LsuUnit>(this);
exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().CreateObject<CsrUnit>(this);
exe_units_.at((int)ExeType::FPU) = SimPlatform::instance().CreateObject<FpuUnit>(this);
exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().CreateObject<GpuUnit>(this);
// connect l1 switch
icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
@@ -109,6 +115,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
// activate warp0
warps_.at(0)->setTmask(0, true);
// memory perf callbacks
MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
__unused (cycle);
perf_stats_.mem_reads += !req.write;
perf_stats_.mem_writes += req.write;
perf_mem_pending_reads_ += !req.write;
});
MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
__unused (cycle);
--perf_mem_pending_reads_;
});
}
Core::~Core() {
@@ -128,23 +146,26 @@ void Core::attach_ram(RAM* ram) {
void Core::step(uint64_t cycle) {
this->commit(cycle);
this->execute(cycle);
this->issue(cycle);
this->decode(cycle);
this->fetch(cycle);
this->schedule(cycle);
// update perf counter
perf_stats_.mem_latency += perf_mem_pending_reads_;
DPN(2, std::flush);
}
void Core::warp_scheduler(uint64_t cycle) {
void Core::schedule(uint64_t cycle) {
__unused (cycle);
bool foundSchedule = false;
int scheduled_warp = last_schedule_wid_;
// round robin scheduling
for (size_t wid = 0; wid < warps_.size(); ++wid) {
scheduled_warp = (scheduled_warp + 1) % warps_.size();
bool warp_active = warps_.at(scheduled_warp)->active();
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
scheduled_warp = (scheduled_warp + 1) % nw;
bool warp_active = active_warps_.test(scheduled_warp);
bool warp_stalled = stalled_warps_.test(scheduled_warp);
if (warp_active && !warp_stalled) {
last_schedule_wid_ = scheduled_warp;
@@ -159,85 +180,91 @@ void Core::warp_scheduler(uint64_t cycle) {
// suspend warp until decode
stalled_warps_.set(scheduled_warp);
auto& warp = warps_.at(scheduled_warp);
stats_insts_ += warp->getActiveThreads();
auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, arch_);
auto& warp = warps_.at(scheduled_warp);
uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_;
auto trace = new pipeline_trace_t(uuid, arch_);
warp->eval(trace);
DT(3, cycle, "pipeline-schedule: " << *trace);
// advance to fetch stage
fetch_stage_.push(trace);
fetch_latch_.push(trace);
}
void Core::fetch(uint64_t cycle) {
__unused (cycle);
// handle icache response
auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
if (!icache_rsp_port.empty()){
auto& mem_rsp = icache_rsp_port.top();
auto& mem_rsp = icache_rsp_port.front();
auto trace = pending_icache_.at(mem_rsp.tag);
auto latency = (SimPlatform::instance().cycles() - trace->icache_latency);
trace->icache_latency = latency;
decode_stage_.push(trace);
decode_latch_.push(trace);
DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
pending_icache_.release(mem_rsp.tag);
icache_rsp_port.pop();
}
// send icache request
if (!fetch_stage_.empty()) {
auto trace = fetch_stage_.top();
trace->icache_latency = SimPlatform::instance().cycles();
if (!fetch_latch_.empty()) {
auto trace = fetch_latch_.front();
MemReq mem_req;
mem_req.addr = trace->PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(trace);
icache_->CoreReqPorts.at(0).send(mem_req, 1);
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_stage_.pop();
}
// schedule next warp
this->warp_scheduler(cycle);
fetch_latch_.pop();
}
}
void Core::decode(uint64_t cycle) {
__unused (cycle);
if (decode_stage_.empty())
if (decode_latch_.empty())
return;
auto trace = decode_stage_.top();
auto trace = decode_latch_.front();
// check ibuffer capacity
auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** ibuffer-stall: " << *trace);
}
++perf_stats_.ibuf_stalls;
return;
} else {
trace->resume();
}
// release warp
if (!trace->fetch_stall) {
stalled_warps_.reset(trace->wid);
}
// update perf counters
uint32_t active_threads = trace->tmask.count();
if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::LOAD)
perf_stats_.loads += active_threads;
if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::STORE)
perf_stats_.stores += active_threads;
if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH)
perf_stats_.branches += active_threads;
DT(3, cycle, "pipeline-decode: " << *trace);
// advance to issue stage
issue_stage_.push(trace);
decode_stage_.pop();
// insert to ibuffer
ibuffer.push(trace);
decode_latch_.pop();
}
void Core::issue(uint64_t cycle) {
__unused (cycle);
if (!issue_stage_.empty()) {
// insert to ibuffer
auto trace = issue_stage_.top();
auto& ibuffer = ibuffers_.at(trace->wid);
if (!trace->check_stalled(ibuffer.full())) {
DT(3, cycle, "*** ibuffer-stall: " << *trace);
}
if (!ibuffer.full()) {
ibuffer.push(trace);
issue_stage_.pop();
}
}
void Core::execute(uint64_t cycle) {
__unused (cycle);
// issue ibuffer instructions
for (auto& ibuffer : ibuffers_) {
@@ -247,180 +274,102 @@ void Core::issue(uint64_t cycle) {
auto trace = ibuffer.top();
// check scoreboard
if (!trace->check_stalled(scoreboard_.in_use(trace))) {
DTH(3, cycle, "*** scoreboard-stall: dependents={");
auto uses = scoreboard_.get_uses(trace);
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
auto& use = uses.at(i);
__unused(use);
if (i) DTN(3, ", ");
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
if (scoreboard_.in_use(trace)) {
if (!trace->suspend()) {
DTH(3, cycle, "*** scoreboard-stall: dependents={");
auto uses = scoreboard_.get_uses(trace);
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
auto& use = uses.at(i);
__unused (use);
if (i) DTN(3, ", ");
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
}
DTN(3, "}, " << *trace << std::endl);
}
DTN(3, "}, " << *trace << std::endl);
}
if (scoreboard_.in_use(trace))
++perf_stats_.scrb_stalls;
continue;
DT(3, cycle, "pipeline-issue: " << *trace);
} else {
trace->resume();
}
// update scoreboard
scoreboard_.reserve(trace);
// advance to execute stage
execute_stage_.push(trace);
DT(3, cycle, "pipeline-issue: " << *trace);
// push to execute units
auto& exe_unit = exe_units_.at((int)trace->exe_type);
exe_unit->Input.send(trace, 1);
ibuffer.pop();
break;
}
}
void Core::execute(uint64_t cycle) {
// process stage inputs
if (!execute_stage_.empty()) {
auto trace = execute_stage_.top();
auto& exe_unit = exe_units_.at((int)trace->exe_type);
exe_unit->push(trace);
DT(3, cycle, "pipeline-execute: " << *trace);
execute_stage_.pop();
}
// advance execute units
for (auto& exe_unit : exe_units_) {
exe_unit->step(cycle);
}
// commit completed instructions
for (auto& exe_unit : exe_units_) {
if (!exe_unit->empty()) {
auto trace = exe_unit->top();
if (trace->fetch_stall) {
stalled_warps_.reset(trace->wid);
}
// advance to commit stage
commit_stage_.push(trace);
exe_unit->pop();
}
}
}
void Core::commit(uint64_t cycle) {
__unused (cycle);
if (commit_stage_.empty())
return;
// commit completed instructions
bool wb = false;
for (auto& exe_unit : exe_units_) {
if (!exe_unit->Output.empty()) {
auto trace = exe_unit->Output.front();
auto trace = commit_stage_.top();
// allow only one commit that updates registers
if (trace->wb && wb)
continue;
wb |= trace->wb;
DT(3, cycle, "pipeline-commit: " << *trace);
// advance to commit stage
DT(3, cycle, "pipeline-commit: " << *trace);
// update scoreboard
scoreboard_.release(trace);
// update scoreboard
scoreboard_.release(trace);
assert(committed_instrs_ <= issued_instrs_);
++committed_instrs_;
assert(committed_instrs_ <= issued_instrs_);
++committed_instrs_;
commit_stage_.pop();
perf_stats_.instrs += trace->tmask.count();
// delete the trace
delete trace;
}
// delete the trace
delete trace;
bool Core::running() const {
bool is_running = (committed_instrs_ != issued_instrs_);
return is_running;
}
// Reads the CSR at `addr` on behalf of thread `tid` of warp `wid`.
// Floating-point, ID, configuration and counter CSRs are computed here;
// texture-state addresses are forwarded to the owning texture unit; every
// other address is looked up in the generic `csrs_` table.
Word Core::get_csr(Addr addr, int tid, int wid) {
if (addr == CSR_FFLAGS) {
// low 5 bits of the per-warp fcsr
return fcsrs_.at(wid) & 0x1F;
} else if (addr == CSR_FRM) {
// per-warp fcsr bits from bit 5 upward
return (fcsrs_.at(wid) >> 5);
} else if (addr == CSR_FCSR) {
return fcsrs_.at(wid);
} else if (addr == CSR_WTID) {
// Warp threadID
return tid;
} else if (addr == CSR_LTID) {
// Core threadID
return tid + (wid * arch_.num_threads());
} else if (addr == CSR_GTID) {
// Processor threadID
return tid + (wid * arch_.num_threads()) +
(arch_.num_threads() * arch_.num_warps() * id_);
} else if (addr == CSR_LWID) {
// Core warpID
return wid;
} else if (addr == CSR_GWID) {
// Processor warpID
return wid + (arch_.num_warps() * id_);
} else if (addr == CSR_GCID) {
// Processor coreID
return id_;
} else if (addr == CSR_TMASK) {
// Thread mask of warp `wid`
return warps_.at(wid)->getTmask();
} else if (addr == CSR_NT) {
// Number of threads per warp
return arch_.num_threads();
} else if (addr == CSR_NW) {
// Number of warps per core
return arch_.num_warps();
} else if (addr == CSR_NC) {
// Number of cores
return arch_.num_cores();
} else if (addr == CSR_MINSTRET) {
// NumInsts (returned through the Word return type)
return stats_insts_;
} else if (addr == CSR_MINSTRET_H) {
// NumInsts, bits 32 and above
return (Word)(stats_insts_ >> 32);
} else if (addr == CSR_MCYCLE) {
// NumCycles (truncated to Word)
return (Word)SimPlatform::instance().cycles();
} else if (addr == CSR_MCYCLE_H) {
// NumCycles, bits 32 and above
return (Word)(SimPlatform::instance().cycles() >> 32);
} else {
// Texture-unit state CSRs: decode unit and state index from the address
if (addr >= CSR_TEX(0,0)
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
uint32_t unit = CSR_TEX_UNIT(addr);
uint32_t state = CSR_TEX_STATE(addr);
return tex_units_.at(unit).get_state(state);
// NOTE(review): the next statement is unreachable (it follows a return) and
// `exe_unit` is not declared in this function - looks like stray residue
// from another function; confirm and remove.
exe_unit->Output.pop();
}
// Fallback: generic CSR storage
return csrs_.at(addr);
}
}
void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
if (addr == CSR_FFLAGS) {
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
} else if (addr == CSR_FRM) {
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5);
} else if (addr == CSR_FCSR) {
fcsrs_.at(wid) = value & 0xff;
} else {
if (addr >= CSR_TEX(0,0)
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
uint32_t unit = CSR_TEX_UNIT(addr);
uint32_t state = CSR_TEX_STATE(addr);
tex_units_.at(unit).set_state(state, value);
return;
}
csrs_.at(addr) = value;
// Spawns warps 1 .. min(num_warps, arch_.num_warps())-1: each one gets its PC
// set to `nextPC` and thread 0 enabled in its thread mask.
// Returns a WarpMask with a bit set for every warp spawned here, on top of the
// mask's initial value of 1 (presumably marking warp 0 as active - confirm
// against WarpMask's constructor).
WarpMask Core::wspawn(int num_warps, int nextPC) {
  WarpMask ret(1);
  // never activate more warps than the architecture provides
  int active_warps = std::min<int>(num_warps, arch_.num_warps());
  DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
  for (int i = 1; i < active_warps; ++i) {
    auto warp = warps_.at(i);
    warp->setPC(nextPC);
    warp->setTmask(0, true);
    ret.set(i);
  }
  // return the local by name: wrapping it in std::move would disable NRVO
  return ret;
}
void Core::barrier(int bar_id, int count, int warp_id) {
WarpMask Core::barrier(int bar_id, int count, int warp_id) {
WarpMask ret(0);
auto& barrier = barriers_.at(bar_id);
barrier.set(warp_id);
if (barrier.count() < (size_t)count)
return;
if (barrier.count() < (size_t)count) {
warps_.at(warp_id)->suspend();
DP(3, "*** Suspend warp #" << warp_id << " at barrier #" << bar_id);
return std::move(ret);
}
for (int i = 0; i < arch_.num_warps(); ++i) {
if (barrier.test(i)) {
DP(3, "*** Resume warp #" << i << " at barrier #" << bar_id);
warps_.at(i)->activate();
ret.set(i);
}
}
barrier.reset();
return std::move(ret);
}
Word Core::icache_read(Addr addr, Size size) {
@@ -430,35 +379,21 @@ Word Core::icache_read(Addr addr, Size size) {
}
Word Core::dcache_read(Addr addr, Size size) {
Word data = 0;
if (SM_ENABLE) {
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
&& ((addr + 3) < SMEM_BASE_ADDR)) {
shared_mem_.read(&data, addr & (SMEM_SIZE-1), size);
return data;
}
}
Word data;
mmu_.read(&data, addr, size, 0);
return data;
}
void Core::dcache_write(Addr addr, Word data, Size size) {
if (SM_ENABLE) {
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
&& ((addr + 3) < SMEM_BASE_ADDR)) {
shared_mem_.write(&data, addr & (SMEM_SIZE-1), size);
return;
}
}
if (addr >= IO_COUT_ADDR
&& addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
this->writeToStdOut(addr, data);
return;
} else {
mmu_.write(&data, addr, size, 0);
}
mmu_.write(&data, addr, size, 0);
}
Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<uint64_t>* mem_addrs) {
Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<mem_addr_size_t>* mem_addrs) {
return tex_units_.at(unit).read(u, v, lod, mem_addrs);
}
@@ -473,6 +408,228 @@ void Core::writeToStdOut(Addr addr, Word data) {
}
}
// Reads the CSR at `addr` on behalf of thread `tid` of warp `wid`.
//
// Returns 0 for the listed machine-mode CSRs and for addresses inside the
// user-defined MPM ranges. 64-bit performance counters are exposed as a pair
// of CSRs: the base address yields the low 32 bits, the `_H` address yields
// the counter shifted right by 32. Any address that matches nothing here
// prints an error to std::cout and aborts the process.
Word Core::get_csr(Addr addr, int tid, int wid) {
switch (addr) {
// Machine-mode CSRs that always read as zero here
case CSR_SATP:
case CSR_PMPCFG0:
case CSR_PMPADDR0:
case CSR_MSTATUS:
case CSR_MISA:
case CSR_MEDELEG:
case CSR_MIDELEG:
case CSR_MIE:
case CSR_MTVEC:
case CSR_MEPC:
return 0;
// Floating-point control/status, stored per warp in fcsrs_
case CSR_FFLAGS:
// low 5 bits of the per-warp fcsr
return fcsrs_.at(wid) & 0x1F;
case CSR_FRM:
// per-warp fcsr bits from bit 5 upward
return (fcsrs_.at(wid) >> 5);
case CSR_FCSR:
return fcsrs_.at(wid);
case CSR_WTID:
// Warp threadID
return tid;
case CSR_LTID:
// Core threadID
return tid + (wid * arch_.num_threads());
case CSR_GTID:
// Processor threadID
return tid + (wid * arch_.num_threads()) +
(arch_.num_threads() * arch_.num_warps() * id_);
case CSR_LWID:
// Core warpID
return wid;
case CSR_GWID:
// Processor warpID
return wid + (arch_.num_warps() * id_);
case CSR_GCID:
// Processor coreID
return id_;
case CSR_TMASK:
// Thread mask of warp `wid`
return warps_.at(wid)->getTmask();
case CSR_NT:
// Number of threads per warp
return arch_.num_threads();
case CSR_NW:
// Number of warps per core
return arch_.num_warps();
case CSR_NC:
// Number of cores
return arch_.num_cores();
case CSR_MINSTRET:
// NumInsts (low 32 bits)
return perf_stats_.instrs & 0xffffffff;
case CSR_MINSTRET_H:
// NumInsts (bits 32 and above)
return (Word)(perf_stats_.instrs >> 32);
case CSR_MCYCLE:
// NumCycles (truncated to Word)
return (Word)SimPlatform::instance().cycles();
case CSR_MCYCLE_H:
// NumCycles (bits 32 and above)
return (Word)(SimPlatform::instance().cycles() >> 32);
// Core pipeline stall counters (from perf_stats_)
case CSR_MPM_IBUF_ST:
return perf_stats_.ibuf_stalls & 0xffffffff;
case CSR_MPM_IBUF_ST_H:
return perf_stats_.ibuf_stalls >> 32;
case CSR_MPM_SCRB_ST:
return perf_stats_.scrb_stalls & 0xffffffff;
case CSR_MPM_SCRB_ST_H:
return perf_stats_.scrb_stalls >> 32;
case CSR_MPM_ALU_ST:
return perf_stats_.alu_stalls & 0xffffffff;
case CSR_MPM_ALU_ST_H:
return perf_stats_.alu_stalls >> 32;
case CSR_MPM_LSU_ST:
return perf_stats_.lsu_stalls & 0xffffffff;
case CSR_MPM_LSU_ST_H:
return perf_stats_.lsu_stalls >> 32;
case CSR_MPM_CSR_ST:
return perf_stats_.csr_stalls & 0xffffffff;
case CSR_MPM_CSR_ST_H:
return perf_stats_.csr_stalls >> 32;
case CSR_MPM_FPU_ST:
return perf_stats_.fpu_stalls & 0xffffffff;
case CSR_MPM_FPU_ST_H:
return perf_stats_.fpu_stalls >> 32;
case CSR_MPM_GPU_ST:
return perf_stats_.gpu_stalls & 0xffffffff;
case CSR_MPM_GPU_ST_H:
return perf_stats_.gpu_stalls >> 32;
// Load / store / branch counters (from perf_stats_)
case CSR_MPM_LOADS:
return perf_stats_.loads & 0xffffffff;
case CSR_MPM_LOADS_H:
return perf_stats_.loads >> 32;
case CSR_MPM_STORES:
return perf_stats_.stores & 0xffffffff;
case CSR_MPM_STORES_H:
return perf_stats_.stores >> 32;
case CSR_MPM_BRANCHES:
return perf_stats_.branches & 0xffffffff;
case CSR_MPM_BRANCHES_H:
return perf_stats_.branches >> 32;
// Instruction cache counters (from icache_->perf_stats())
case CSR_MPM_ICACHE_READS:
return icache_->perf_stats().reads & 0xffffffff;
case CSR_MPM_ICACHE_READS_H:
return icache_->perf_stats().reads >> 32;
case CSR_MPM_ICACHE_MISS_R:
return icache_->perf_stats().read_misses & 0xffffffff;
case CSR_MPM_ICACHE_MISS_R_H:
return icache_->perf_stats().read_misses >> 32;
// Data cache counters (from dcache_->perf_stats())
case CSR_MPM_DCACHE_READS:
return dcache_->perf_stats().reads & 0xffffffff;
case CSR_MPM_DCACHE_READS_H:
return dcache_->perf_stats().reads >> 32;
case CSR_MPM_DCACHE_WRITES:
return dcache_->perf_stats().writes & 0xffffffff;
case CSR_MPM_DCACHE_WRITES_H:
return dcache_->perf_stats().writes >> 32;
case CSR_MPM_DCACHE_MISS_R:
return dcache_->perf_stats().read_misses & 0xffffffff;
case CSR_MPM_DCACHE_MISS_R_H:
return dcache_->perf_stats().read_misses >> 32;
case CSR_MPM_DCACHE_MISS_W:
return dcache_->perf_stats().write_misses & 0xffffffff;
case CSR_MPM_DCACHE_MISS_W_H:
return dcache_->perf_stats().write_misses >> 32;
case CSR_MPM_DCACHE_BANK_ST:
return dcache_->perf_stats().bank_stalls & 0xffffffff;
case CSR_MPM_DCACHE_BANK_ST_H:
return dcache_->perf_stats().bank_stalls >> 32;
case CSR_MPM_DCACHE_MSHR_ST:
return dcache_->perf_stats().mshr_stalls & 0xffffffff;
case CSR_MPM_DCACHE_MSHR_ST_H:
return dcache_->perf_stats().mshr_stalls >> 32;
// Shared memory counters (from shared_mem_->perf_stats())
case CSR_MPM_SMEM_READS:
return shared_mem_->perf_stats().reads & 0xffffffff;
case CSR_MPM_SMEM_READS_H:
return shared_mem_->perf_stats().reads >> 32;
case CSR_MPM_SMEM_WRITES:
return shared_mem_->perf_stats().writes & 0xffffffff;
case CSR_MPM_SMEM_WRITES_H:
return shared_mem_->perf_stats().writes >> 32;
case CSR_MPM_SMEM_BANK_ST:
return shared_mem_->perf_stats().bank_stalls & 0xffffffff;
case CSR_MPM_SMEM_BANK_ST_H:
return shared_mem_->perf_stats().bank_stalls >> 32;
// Memory traffic counters (from perf_stats_)
case CSR_MPM_MEM_READS:
return perf_stats_.mem_reads & 0xffffffff;
case CSR_MPM_MEM_READS_H:
return perf_stats_.mem_reads >> 32;
case CSR_MPM_MEM_WRITES:
return perf_stats_.mem_writes & 0xffffffff;
case CSR_MPM_MEM_WRITES_H:
return perf_stats_.mem_writes >> 32;
case CSR_MPM_MEM_LAT:
return perf_stats_.mem_latency & 0xffffffff;
case CSR_MPM_MEM_LAT_H:
return perf_stats_.mem_latency >> 32;
#ifdef EXT_TEX_ENABLE
// Texture counters, only compiled in with the texture extension
case CSR_MPM_TEX_READS:
return perf_stats_.tex_reads & 0xffffffff;
case CSR_MPM_TEX_READS_H:
return perf_stats_.tex_reads >> 32;
case CSR_MPM_TEX_LAT:
return perf_stats_.tex_latency & 0xffffffff;
case CSR_MPM_TEX_LAT_H:
return perf_stats_.tex_latency >> 32;
#endif
default:
// The branches below form one if / else-if chain; the texture branches are
// spliced into it by the preprocessor, so the trailing `else` of each branch
// binds to whichever statement follows after preprocessing.
if ((addr >= CSR_MPM_BASE && addr < (CSR_MPM_BASE + 32))
|| (addr >= CSR_MPM_BASE_H && addr < (CSR_MPM_BASE_H + 32))) {
// user-defined MPM CSRs
// (nothing to do: falls out of the switch and reads as 0)
} else
#ifdef EXT_TEX_ENABLE
if (addr == CSR_TEX_UNIT) {
// currently selected texture unit index
return csr_tex_unit_;
} else
if (addr >= CSR_TEX_STATE_BEGIN
&& addr < CSR_TEX_STATE_END) {
// state register of the currently selected texture unit
uint32_t state = CSR_TEX_STATE(addr);
return tex_units_.at(csr_tex_unit_).get_state(state);
} else
#endif
{
// unknown CSR address: report it and terminate
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
std::abort();
}
}
// reached only from the user-defined MPM branch of the default case
return 0;
}
// Writes `value` to the CSR at `addr` for warp `wid` (the thread id is unused).
//
// The floating-point CSRs are views onto the per-warp fcsr word in fcsrs_:
//   CSR_FFLAGS -> bits 4:0, CSR_FRM -> bits 7:5, CSR_FCSR -> bits 7:0.
// With EXT_TEX_ENABLE, CSR_TEX_UNIT selects the texture unit that subsequent
// texture-state writes are routed to. Every other address is stored in the
// generic csrs_ table.
void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
  if (addr == CSR_FFLAGS) {
    fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
  } else if (addr == CSR_FRM) {
    // keep only the 3-bit rounding-mode field so stray upper bits of `value`
    // cannot spill past bit 7 of the stored fcsr
    fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | ((value & 0x7) << 5);
  } else if (addr == CSR_FCSR) {
    fcsrs_.at(wid) = value & 0xff;
  } else
#ifdef EXT_TEX_ENABLE
  // these branches are spliced into the else-if chain by the preprocessor
  if (addr == CSR_TEX_UNIT) {
    csr_tex_unit_ = value;
  } else
  if (addr >= CSR_TEX_STATE_BEGIN
   && addr < CSR_TEX_STATE_END) {
    uint32_t state = CSR_TEX_STATE(addr);
    tex_units_.at(csr_tex_unit_).set_state(state, value);
  } else
#endif
  {
    csrs_.at(addr) = value;
  }
}
// Latches the ecall flag; check_exit() reports true once it is set.
void Core::trigger_ecall() {
ecall_ = true;
}
@@ -483,4 +640,9 @@ void Core::trigger_ebreak() {
// True once either exit flag (ecall or ebreak) has been set on this core.
bool Core::check_exit() const {
  const bool exit_requested = ecall_ || ebreak_;
  return exit_requested;
}
// Reports whether the core still has instructions in flight: true while the
// committed-instruction count differs from the issued-instruction count.
bool Core::running() const {
  return committed_instrs_ != issued_instrs_;
}

View File

@@ -17,6 +17,7 @@
#include "warp.h"
#include "pipeline.h"
#include "cache.h"
#include "sharedmem.h"
#include "ibuffer.h"
#include "scoreboard.h"
#include "exeunit.h"
@@ -26,6 +27,47 @@ namespace vortex {
class Core : public SimObject<Core> {
public:
// Per-core performance counters. Every counter starts at zero.
struct PerfStats {
  // instruction and pipeline stall counters
  uint64_t instrs      = 0;
  uint64_t ibuf_stalls = 0;
  uint64_t scrb_stalls = 0;
  uint64_t alu_stalls  = 0;
  uint64_t lsu_stalls  = 0;
  uint64_t csr_stalls  = 0;
  uint64_t fpu_stalls  = 0;
  uint64_t gpu_stalls  = 0;
  // instruction mix counters
  uint64_t loads       = 0;
  uint64_t stores      = 0;
  uint64_t branches    = 0;
  // memory counters
  uint64_t mem_reads   = 0;
  uint64_t mem_writes  = 0;
  uint64_t mem_latency = 0;
  // texture counters
  uint64_t tex_reads   = 0;
  uint64_t tex_latency = 0;
};
SimPort<MemRsp> MemRspPort;
SimPort<MemReq> MemReqPort;
Core(const SimContext& ctx, const ArchDef &arch, Word id);
~Core();
@@ -51,8 +93,8 @@ public:
return arch_;
}
unsigned long stats_insts() const {
return stats_insts_;
const PerfStats& perf_stats() const {
return perf_stats_;
}
Word getIRegValue(int reg) const {
@@ -63,7 +105,9 @@ public:
void set_csr(Addr addr, Word value, int tid, int wid);
void barrier(int bar_id, int count, int warp_id);
WarpMask wspawn(int num_warps, int nextPC);
WarpMask barrier(int bar_id, int count, int warp_id);
Word icache_read(Addr, Size);
@@ -71,7 +115,7 @@ public:
void dcache_write(Addr, Word, Size);
Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector<uint64_t>* mem_addrs);
Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector<mem_addr_size_t>* mem_addrs);
void trigger_ecall();
@@ -81,21 +125,18 @@ public:
private:
void schedule(uint64_t cycle);
void fetch(uint64_t cycle);
void decode(uint64_t cycle);
void issue(uint64_t cycle);
void execute(uint64_t cycle);
void commit(uint64_t cycle);
void warp_scheduler(uint64_t cycle);
void writeToStdOut(Addr addr, Word data);
Word id_;
const ArchDef arch_;
const Decoder decoder_;
MemoryUnit mmu_;
RAM shared_mem_;
std::vector<TexUnit> tex_units_;
std::vector<std::shared_ptr<Warp>> warps_;
@@ -107,33 +148,33 @@ private:
std::vector<ExeUnit::Ptr> exe_units_;
Cache::Ptr icache_;
Cache::Ptr dcache_;
SharedMem::Ptr shared_mem_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
PipelineStage fetch_stage_;
PipelineStage decode_stage_;
PipelineStage issue_stage_;
PipelineStage execute_stage_;
PipelineStage commit_stage_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_;
WarpMask stalled_warps_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint32_t last_schedule_wid_;
uint32_t issued_instrs_;
uint32_t committed_instrs_;
uint64_t issued_instrs_;
uint64_t committed_instrs_;
uint32_t csr_tex_unit_;
bool ecall_;
bool ebreak_;
std::unordered_map<int, std::stringstream> print_bufs_;
uint64_t stats_insts_;
PerfStats perf_stats_;
uint64_t perf_mem_pending_reads_;
friend class LsuUnit;
friend class AluUnit;
friend class CsrUnit;
friend class FpuUnit;
friend class GpuUnit;
public:
SlavePort<MemRsp> MemRspPort;
MasterPort<MemReq> MemReqPort;
};
} // namespace vortex

View File

@@ -359,14 +359,28 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
instr->setDestReg(rd);
}
instr->setFunc3(func3);
instr->setFunc7(func7);
if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) {
instr->setImm(sext32(rs2, 5));
} else {
instr->setFunc7(func7);
switch (op) {
case Opcode::SYS_INST:
case Opcode::FENCE:
// uint12
instr->setImm(code >> shift_rs2_);
break;
case Opcode::I_INST:
if (func3 == 0x1 || func3 == 0x5) {
// int5
instr->setImm(sext32(rs2, 5));
} else {
// int12
instr->setImm(sext32(code >> shift_rs2_, 12));
}
break;
default:
// int12
instr->setImm(sext32(code >> shift_rs2_, 12));
break;
}
} break;
case InstType::S_TYPE: {
instr->setSrcReg(rs1);
if (op == Opcode::FS) {
@@ -375,8 +389,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
instr->setSrcReg(rs2);
}
instr->setFunc3(func3);
Word imeed = (func7 << reg_s_) | rd;
instr->setImm(sext32(imeed, 12));
Word imm = (func7 << reg_s_) | rd;
instr->setImm(sext32(imm, 12));
} break;
case InstType::B_TYPE: {
@@ -387,8 +401,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
Word bits_4_1 = rd >> 1;
Word bit_10_5 = func7 & 0x3f;
Word bit_12 = func7 >> 6;
Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
instr->setImm(sext32(imeed, 13));
Word imm = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
instr->setImm(sext32(imm, 13));
} break;
case InstType::U_TYPE:
@@ -403,11 +417,11 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
Word bit_11 = (unordered >> 8) & 0x1;
Word bits_10_1 = (unordered >> 9) & 0x3ff;
Word bit_20 = (unordered >> 19) & 0x1;
Word imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
Word imm = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
if (bit_20) {
imeed |= ~j_imm_mask_;
imm |= ~j_imm_mask_;
}
instr->setImm(imeed);
instr->setImm(imm);
} break;
case InstType::V_TYPE:

View File

@@ -428,7 +428,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
Word data_read = core_->dcache_read(memAddr, 4);
trace->mem_addrs.at(t).push_back(memAddr);
trace->mem_addrs.at(t).push_back({memAddr, 4});
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
switch (func3) {
case 0:
@@ -491,7 +491,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
trace->mem_addrs.at(t).push_back(memAddr);
trace->mem_addrs.at(t).push_back({memAddr, (1u << func3)});
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (func3) {
case 0:
@@ -528,14 +528,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
break;
case SYS_INST:
trace->exe_type = ExeType::CSR;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word csr_addr = immsrc & 0x00000FFF;
Word csr_value = core_->get_csr(csr_addr, t, id_);
switch (func3) {
case 0:
Word csr_addr = immsrc;
Word csr_value;
if (func3 == 0) {
trace->exe_type = ExeType::ALU;
trace->fetch_stall = true;
switch (csr_addr) {
case 0: // ECALL
core_->trigger_ecall();
@@ -549,56 +549,59 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
break;
default:
std::abort();
}
break;
case 1:
// CSRRW
rddata[t] = csr_value;
core_->set_csr(csr_addr, rsdata[t][0], t, id_);
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 2:
// CSRRS
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 3:
// CSRRC
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 5:
// CSRRWI
rddata[t] = csr_value;
core_->set_csr(csr_addr, rsrc0, t, id_);
rd_write = true;
break;
case 6:
// CSRRSI
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value | rsrc0, t, id_);
rd_write = true;
break;
case 7:
// CSRRCI
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_);
rd_write = true;
break;
default:
break;
}
} else {
trace->exe_type = ExeType::CSR;
csr_value = core_->get_csr(csr_addr, t, id_);
switch (func3) {
case 1:
// CSRRW
rddata[t] = csr_value;
core_->set_csr(csr_addr, rsdata[t][0], t, id_);
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 2:
// CSRRS
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 3:
// CSRRC
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 5:
// CSRRWI
rddata[t] = csr_value;
core_->set_csr(csr_addr, rsrc0, t, id_);
rd_write = true;
break;
case 6:
// CSRRSI;
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value | rsrc0, t, id_);
rd_write = true;
break;
case 7:
// CSRRCI
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_);
rd_write = true;
break;
default:
break;
}
}
}
break;
case FENCE:
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::FENCE;
trace->fetch_stall = true;
break;
case FCI:
trace->exe_type = ExeType::FPU;
@@ -797,6 +800,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(3, std::endl);
active_ = tmask_.any();
trace->gpu.active_warps.reset();
trace->gpu.active_warps.set(id_, active_);
} break;
case 1: {
// WSPAWN
@@ -805,13 +810,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
for (int i = 1; i < active_warps; ++i) {
Warp &newWarp = core_->warp(i);
newWarp.setPC(rsdata[ts][1]);
newWarp.setTmask(0, true);
}
trace->gpu.active_warps = core_->wspawn(rsdata.at(ts)[0], rsdata.at(ts)[1]);
} break;
case 2: {
// SPLIT
@@ -877,9 +876,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->gpu.type = GpuType::BAR;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
active_ = false;
core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
trace->fetch_stall = true;
trace->gpu.active_warps = core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
} break;
case 5: {
// PREFETCH

View File

@@ -10,64 +10,78 @@
using namespace vortex;
NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
void NopUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
if (Input.empty())
return;
auto trace = inputs_.top();
this->schedule_output(trace, 1);
inputs_.pop();
auto trace = Input.front();
Output.send(trace, 1);
Input.pop();
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(Core* core)
: ExeUnit("LSU")
, core_(core)
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, num_threads_(core->arch().num_threads())
, pending_dcache_(LSUQ_SIZE)
, fence_lock_(false)
{}
void LsuUnit::step(uint64_t cycle) {
__unused (cycle);
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.top();
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
trace->dcache_latency = latency;
this->schedule_output(trace, 1);
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
// handle shared memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_dcache_.empty())
return;
this->schedule_output(fence_state_, 1);
Output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, cycle, "fence-unlock: " << fence_state_);
}
// check input queue
if (inputs_.empty())
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
if (trace->lsu.type == LsuType::FENCE) {
// schedule fence lock
@@ -75,179 +89,188 @@ void LsuUnit::step(uint64_t cycle) {
fence_lock_ = true;
DT(3, cycle, "fence-lock: " << *trace);
// remove input
inputs_.pop();
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
return;
}
// check pending queue capacity
if (!trace->check_stalled(pending_dcache_.full())) {
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
}
if (pending_dcache_.full())
// check pending queue capacity
if (pending_dcache_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
}
return;
// send memory request
bool has_shared_memory = false;
bool mem_rsp_pending = false;
} else {
trace->resume();
}
bool is_write = (trace->lsu.type == LsuType::STORE);
uint32_t valid_addrs = 0;
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(0)) {
uint64_t addr_mask = sizeof(Word)-1;
Word addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
uint32_t valid_addrs = 0;
if (is_dup) {
valid_addrs = 1;
} else {
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
}
trace->dcache_latency = SimPlatform::instance().cycles();
auto tag = pending_dcache_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
auto mem_addr = trace->mem_addrs.at(t).at(0);
auto type = get_addr_type(mem_addr.addr, mem_addr.size);
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
for (auto mem_addr : trace->mem_addrs.at(t)) {
// check shared memory address
if (SM_ENABLE) {
if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE))
&& (mem_addr < SMEM_BASE_ADDR)) {
DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
has_shared_memory = true;
continue;
}
}
bool is_io = (mem_addr >= IO_BASE_ADDR);
MemReq mem_req;
mem_req.addr = mem_addr;
mem_req.write = is_write;
mem_req.tag = tag;
mem_req.is_io = is_io;
dcache_req_port.send(mem_req, 1);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace);
// do not wait on writes
mem_rsp_pending = !is_write;
}
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.tag = tag;
mem_req.is_io = (type == AddrType::IO);
if (type == AddrType::Shared) {
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
} else {
dcache_req_port.send(mem_req, 2);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace);
}
if (is_dup)
break;
}
// do not wait
if (!mem_rsp_pending) {
// do not wait on writes
if (is_write) {
pending_dcache_.release(tag);
uint32_t delay = 1;
if (has_shared_memory) {
// all threads accessed shared memory
delay += Constants::SMEM_DELAY;
}
this->schedule_output(trace, delay);
Output.send(trace, 1);
}
// remove input
inputs_.pop();
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
void AluUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
void AluUnit::step(uint64_t cycle) {
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
switch (trace->alu.type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::CMOV:
this->schedule_output(trace, 1);
inputs_.pop();
Output.send(trace, 1);
break;
case AluType::IMUL:
this->schedule_output(trace, LATENCY_IMUL);
inputs_.pop();
Output.send(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
this->schedule_output(trace, XLEN);
inputs_.pop();
Output.send(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.alu_stalls += (cycle - time);
}
///////////////////////////////////////////////////////////////////////////////
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
void CsrUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
void CsrUnit::step(uint64_t cycle) {
if (Input.empty())
return;
auto trace = inputs_.top();
this->schedule_output(trace, 1);
inputs_.pop();
auto trace = Input.front();
Output.send(trace, 1);
auto time = Input.pop();
core_->perf_stats_.csr_stalls += (cycle - time);
DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
void FpuUnit::step(uint64_t cycle) {
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
switch (trace->fpu.type) {
case FpuType::FNCP:
this->schedule_output(trace, 1);
inputs_.pop();
Output.send(trace, 2);
break;
case FpuType::FMA:
this->schedule_output(trace, LATENCY_FMA);
inputs_.pop();
Output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
this->schedule_output(trace, LATENCY_FDIV);
inputs_.pop();
Output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
this->schedule_output(trace, LATENCY_FSQRT);
inputs_.pop();
Output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
this->schedule_output(trace, LATENCY_FCVT);
inputs_.pop();
Output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
}
DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (cycle - time);
}
///////////////////////////////////////////////////////////////////////////////
GpuUnit::GpuUnit(Core* core)
: ExeUnit("GPU")
, core_(core)
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "GPU")
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
void GpuUnit::step(uint64_t cycle) {
__unused (cycle);
#ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.top();
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
trace->dcache_latency = latency;
this->schedule_output(trace, 1);
if (0 == entry.second) {
Output.send(trace, 1);
pending_tex_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
@@ -255,38 +278,67 @@ void GpuUnit::step(uint64_t cycle) {
#endif
// check input queue
if (inputs_.empty())
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
bool issued = false;
switch (trace->gpu.type) {
case GpuType::TMC:
Output.send(trace, 1);
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
issued = true;
break;
case GpuType::WSPAWN:
Output.send(trace, 1);
core_->active_warps_ = trace->gpu.active_warps;
issued = true;
break;
case GpuType::SPLIT:
case GpuType::JOIN:
case GpuType::BAR:
this->schedule_output(trace, 1);
inputs_.pop();
Output.send(trace, 1);
issued = true;
break;
case GpuType::TEX: {
case GpuType::BAR:
Output.send(trace, 1);
if (trace->gpu.active_warps != 0)
core_->active_warps_ |= trace->gpu.active_warps;
else
core_->active_warps_.reset(trace->wid);
issued = true;
break;
case GpuType::TEX:
if (this->processTexRequest(cycle, trace))
inputs_.pop();
} break;
issued = true;
break;
default:
std::abort();
}
if (issued) {
  DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
  // the instruction has executed: lift the fetch stall it placed on its warp
  if (trace->fetch_stall) {
    core_->stalled_warps_.reset(trace->wid);
  }
  // Input.pop() yields a cycle value (presumably when the trace was queued -
  // confirm against SimPort); the elapsed time is charged to the GPU unit's
  // own stall counter rather than the FPU's
  auto time = Input.pop();
  core_->perf_stats_.gpu_stalls += (cycle - time);
}
}
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
__unused (cycle);
// check pending queue capacity
if (!trace->check_stalled(pending_tex_reqs_.full())) {
DT(3, cycle, "*** tex-queue-stall: " << *trace);
}
if (pending_tex_reqs_.full())
// check pending queue capacity
if (pending_tex_reqs_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** tex-queue-stall: " << *trace);
}
return false;
} else {
trace->resume();
}
// send memory request
@@ -295,7 +347,6 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
valid_addrs += mem_addr.size();
}
trace->tex_latency = SimPlatform::instance().cycles();
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
@@ -305,12 +356,14 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
for (auto mem_addr : trace->mem_addrs.at(t)) {
MemReq mem_req;
mem_req.addr = mem_addr;
mem_req.addr = mem_addr.addr;
mem_req.write = (trace->lsu.type == LsuType::STORE);
mem_req.tag = tag;
dcache_req_port.send(mem_req, 1);
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag
dcache_req_port.send(mem_req, 3);
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", tid=" << t << ", "<< trace);
++ core_->perf_stats_.tex_reads;
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
}
}

View File

@@ -8,56 +8,29 @@ namespace vortex {
class Core;
class ExeUnit {
protected:
const char* name_;
Queue<pipeline_trace_t*> inputs_;
Queue<pipeline_trace_t*> outputs_;
class ExeUnit : public SimObject<ExeUnit> {
public:
SimPort<pipeline_trace_t*> Input;
SimPort<pipeline_trace_t*> Output;
void schedule_output(pipeline_trace_t* trace, uint32_t delay) {
if (delay > 1) {
SimPlatform::instance().schedule(
[&](pipeline_trace_t* req) {
outputs_.push(req);
},
trace,
(delay - 1)
);
} else {
outputs_.push(trace);
}
}
public:
typedef std::shared_ptr<ExeUnit> Ptr;
ExeUnit(const char* name) : name_(name) {}
ExeUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<ExeUnit>(ctx, name)
, Input(this)
, Output(this)
, core_(core)
{}
virtual ~ExeUnit() {}
void push(pipeline_trace_t* trace) {
inputs_.push(trace);
}
bool empty() const {
return outputs_.empty();
}
pipeline_trace_t* top() const {
return outputs_.top();
}
void pop() {
outputs_.pop();
}
virtual void step(uint64_t cycle) = 0;
protected:
Core* core_;
};
///////////////////////////////////////////////////////////////////////////////
class NopUnit : public ExeUnit {
public:
NopUnit(Core*);
NopUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
};
@@ -65,15 +38,14 @@ public:
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public ExeUnit {
private:
Core* core_;
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
pipeline_trace_t* fence_state_;
bool fence_lock_;
public:
LsuUnit(Core*);
LsuUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
};
@@ -82,7 +54,7 @@ public:
class AluUnit : public ExeUnit {
public:
AluUnit(Core*);
AluUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
};
@@ -91,7 +63,7 @@ public:
class CsrUnit : public ExeUnit {
public:
CsrUnit(Core*);
CsrUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
};
@@ -100,7 +72,7 @@ public:
class FpuUnit : public ExeUnit {
public:
FpuUnit(Core*);
FpuUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
};
@@ -109,14 +81,13 @@ public:
class GpuUnit : public ExeUnit {
private:
Core* core_;
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
public:
GpuUnit(Core*);
GpuUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
};

View File

@@ -10,6 +10,7 @@ private:
MemSim* simobject_;
uint32_t num_banks_;
uint32_t latency_;
PerfStats perf_stats_;
public:
Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency)
@@ -18,16 +19,23 @@ public:
, latency_(latency)
{}
const PerfStats& perf_stats() const {
return perf_stats_;
}
void step(uint64_t /*cycle*/) {
for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
auto& mem_req_port = simobject_->MemReqPorts.at(i);
if (mem_req_port.empty())
continue;
auto& mem_req = mem_req_port.top();
auto& mem_req = mem_req_port.front();
if (!mem_req.write) {
MemRsp mem_rsp;
mem_rsp.tag = mem_req.tag;
simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
++perf_stats_.reads;
} else {
++perf_stats_.writes;
}
mem_req_port.pop();
}
@@ -40,9 +48,9 @@ MemSim::MemSim(const SimContext& ctx,
uint32_t num_banks,
uint32_t latency)
: SimObject<MemSim>(ctx, "MemSim")
, impl_(new Impl(this, num_banks, latency))
, MemReqPorts(num_banks, this)
, MemRspPorts(num_banks, this)
, impl_(new Impl(this, num_banks, latency))
{}
MemSim::~MemSim() {

View File

@@ -1,47 +1,36 @@
#pragma once
#include <simobject.h>
#include "types.h"
#include <vector>
#include <list>
namespace vortex {
struct MemReq {
uint64_t addr;
uint32_t tag;
bool write;
bool is_io;
MemReq(uint64_t _addr = 0,
uint64_t _tag = 0,
bool _write = false,
bool _is_io = false
) : addr(_addr)
, tag(_tag)
, write(_write)
, is_io(_is_io)
{}
};
struct MemRsp {
uint64_t tag;
MemRsp(uint64_t _tag = 0) : tag (_tag) {}
};
class MemSim : public SimObject<MemSim>{
private:
class Impl;
Impl* impl_;
public:
struct PerfStats {
uint64_t reads;
uint64_t writes;
MemSim(const SimContext& ctx, uint32_t num_inputs, uint32_t latency);
PerfStats()
: reads(0)
, writes(0)
{}
};
std::vector<SimPort<MemReq>> MemReqPorts;
std::vector<SimPort<MemRsp>> MemRspPorts;
MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency);
~MemSim();
void step(uint64_t cycle);
std::vector<SlavePort<MemReq>> MemReqPorts;
std::vector<MasterPort<MemRsp>> MemRspPorts;
const PerfStats& perf_stats() const;
private:
class Impl;
Impl* impl_;
};
};

View File

@@ -12,7 +12,7 @@ namespace vortex {
struct pipeline_trace_t {
//--
uint64_t id;
uint64_t uuid;
//--
int cid;
@@ -22,7 +22,6 @@ struct pipeline_trace_t {
//--
bool fetch_stall;
bool pipeline_stall;
//--
bool wb;
@@ -38,7 +37,7 @@ struct pipeline_trace_t {
ExeType exe_type;
//--
std::vector<std::vector<uint64_t>> mem_addrs;
std::vector<std::vector<mem_addr_size_t>> mem_addrs;
//--
union {
@@ -53,22 +52,19 @@ struct pipeline_trace_t {
} fpu;
struct {
GpuType type;
WarpMask active_warps;
} gpu;
};
// stats
uint64_t icache_latency;
uint64_t dcache_latency;
uint64_t tex_latency;
bool stalled;
pipeline_trace_t(uint64_t id_, const ArchDef& arch) {
id = id_;
pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
uuid = uuid_;
cid = 0;
wid = 0;
tmask.reset();
PC = 0;
PC = 0;
fetch_stall = false;
pipeline_stall = false;
wb = false;
rdest = 0;
rdest_type = RegType::None;
@@ -76,16 +72,18 @@ struct pipeline_trace_t {
used_fregs.reset();
used_vregs.reset();
exe_type = ExeType::NOP;
mem_addrs.resize(arch.num_threads());
icache_latency = 0;
dcache_latency = 0;
tex_latency = 0;
mem_addrs.resize(arch.num_threads());
stalled = false;
}
bool check_stalled(bool stall) {
bool old = pipeline_stall;
pipeline_stall = stall;
return stall ? old : true;
bool suspend() {
bool old = stalled;
stalled = true;
return old;
}
void resume() {
stalled = false;
}
};
@@ -96,16 +94,16 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state)
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
}
os << ", ex=" << state.exe_type;
os << " (#" << std::dec << state.id << ")";
os << " (#" << std::dec << state.uuid << ")";
return os;
}
class PipelineStage : public Queue<pipeline_trace_t*> {
class PipelineLatch : public Queue<pipeline_trace_t*> {
protected:
const char* name_;
public:
PipelineStage(const char* name = nullptr)
PipelineLatch(const char* name = nullptr)
: name_(name)
{}
};

View File

@@ -18,13 +18,13 @@ Processor::Processor(const ArchDef& arch)
// connect memory sub-system
memsim_ = MemSim::Create(1, MEM_LATENCY);
std::vector<SlavePort<MemReq>*> mem_req_ports(1);
std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
std::vector<SimPort<MemReq>*> mem_req_ports(1);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", CacheConfig{
l3cache_ = Cache::Create("l3cache", Cache::Config{
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
@@ -66,7 +66,7 @@ Processor::Processor(const ArchDef& arch)
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
if (L2_ENABLE) {
auto& l2cache = l2caches_.at(i);
l2cache = Cache::Create("l2cache", CacheConfig{
l2cache = Cache::Create("l2cache", Cache::Config{
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W

View File

@@ -96,7 +96,7 @@ public:
}
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
assert(owners_.count(tag) == 0);
owners_[tag] = state->id;
owners_[tag] = state->uuid;
}
void release(pipeline_trace_t* state) {

93
sim/simX/sharedmem.h Normal file
View File

@@ -0,0 +1,93 @@
#pragma once
#include <simobject.h>
#include <bitmanip.h>
#include <vector>
#include "types.h"
namespace vortex {
class Core;
// Banked shared-memory model.
//
// Every cycle each request port is examined in index order. The bank is
// selected from a bit-field of the request address; at most one request per
// bank is serviced per cycle, and any further request that maps to an
// already-claimed bank stays queued and is counted as a bank stall.
class SharedMem : public SimObject<SharedMem> {
public:
  struct Config {
    uint32_t num_reqs;      // number of request/response port pairs
    uint32_t num_banks;     // number of banks arbitrated in step()
    uint32_t bank_offset;   // lowest address bit of the bank-select field
    uint32_t latency;       // NOTE(review): not read by this class; responses
                            // are sent with a fixed delay of 1 - confirm intent
    bool     write_reponse; // when true, writes also generate a response
  };

  struct PerfStats {
    uint64_t reads;       // serviced read requests
    uint64_t writes;      // serviced write requests
    uint64_t bank_stalls; // requests deferred because their bank was taken

    PerfStats()
      : reads(0)
      , writes(0)
      , bank_stalls(0)
    {}
  };

  std::vector<SimPort<MemReq>> Inputs;
  std::vector<SimPort<MemRsp>> Outputs;

  SharedMem(const SimContext& ctx, const char* name, const Config& config)
    : SimObject<SharedMem>(ctx, name)
    , Inputs(config.num_reqs, this)
    , Outputs(config.num_reqs, this)
    , config_(config)
    , bank_sel_addr_start_(config.bank_offset)
    , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
  {}

  virtual ~SharedMem() {}

  // Services at most one pending request per bank for the current cycle.
  void step(uint64_t /*cycle*/) {
    // banks already claimed during this cycle
    std::vector<bool> in_used_banks(config_.num_banks);

    for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
      auto& core_req_port = this->Inputs.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      // presumably extracts address bits [start, end] - see bit_getw()
      uint32_t bank_id = (uint32_t)bit_getw(
        core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);

      // bank conflict check: the request stays queued for a later cycle
      if (in_used_banks.at(bank_id)) {
        // account for the stall; previously this counter was never updated
        ++perf_stats_.bank_stalls;
        continue;
      }

      in_used_banks.at(bank_id) = true;

      if (!core_req.write || config_.write_reponse) {
        // send response
        MemRsp core_rsp;
        core_rsp.tag = core_req.tag;
        this->Outputs.at(req_id).send(core_rsp, 1);
      }

      // update perf counters
      perf_stats_.reads += !core_req.write;
      perf_stats_.writes += core_req.write;

      // remove input
      core_req_port.pop();
    }
  }

  // Read-only access to the accumulated counters.
  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

protected:
  Config    config_;
  uint32_t  bank_sel_addr_start_; // first bit of the bank-select field
  uint32_t  bank_sel_addr_end_;   // last bit of the bank-select field
  PerfStats perf_stats_;
};
}

View File

@@ -27,7 +27,7 @@ void TexUnit::set_state(uint32_t state, uint32_t value) {
uint32_t TexUnit::read(int32_t u,
int32_t v,
int32_t lod,
std::vector<uint64_t>* mem_addrs) {
std::vector<mem_addr_size_t>* mem_addrs) {
//--
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
@@ -60,10 +60,10 @@ uint32_t TexUnit::read(int32_t u,
uint32_t texel10 = core_->dcache_read(addr10, stride);
uint32_t texel11 = core_->dcache_read(addr11, stride);
mem_addrs->push_back(addr00);
mem_addrs->push_back(addr01);
mem_addrs->push_back(addr10);
mem_addrs->push_back(addr11);
mem_addrs->push_back({addr00, stride});
mem_addrs->push_back({addr01, stride});
mem_addrs->push_back({addr10, stride});
mem_addrs->push_back({addr11, stride});
// filtering
auto color = TexFilterLinear(
@@ -79,7 +79,7 @@ uint32_t TexUnit::read(int32_t u,
// memory lookup
uint32_t texel = core_->dcache_read(addr, stride);
mem_addrs->push_back(addr);
mem_addrs->push_back({addr, stride});
// filtering
auto color = TexFilterPoint(format, texel);

View File

@@ -15,7 +15,7 @@ public:
void set_state(uint32_t state, uint32_t value);
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<uint64_t>* mem_addrs);
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
private:

View File

@@ -21,6 +21,8 @@ typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
///////////////////////////////////////////////////////////////////////////////
enum class RegType {
None,
Integer,
@@ -38,6 +40,8 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
return os;
}
///////////////////////////////////////////////////////////////////////////////
enum class ExeType {
NOP,
ALU,
@@ -61,6 +65,8 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
return os;
}
///////////////////////////////////////////////////////////////////////////////
enum class AluType {
ARITH,
BRANCH,
@@ -80,6 +86,8 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
return os;
}
///////////////////////////////////////////////////////////////////////////////
enum class LsuType {
LOAD,
STORE,
@@ -97,6 +105,47 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
return os;
}
///////////////////////////////////////////////////////////////////////////////
// Address-space classification of a memory access.
enum class AddrType {
  Global,
  Shared,
  IO,
};

// Writes the enumerator name to the stream; a value that matches no
// enumerator writes nothing.
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
  const char* label = nullptr;
  switch (type) {
  case AddrType::Global:
    label = "Global";
    break;
  case AddrType::Shared:
    label = "Shared";
    break;
  case AddrType::IO:
    label = "IO";
    break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
///////////////////////////////////////////////////////////////////////////////
// Pairs a memory address with an access size. The texture unit records one
// entry per texel fetch, brace-initialised as {address, stride}, so this must
// remain a plain aggregate.
struct mem_addr_size_t {
uint64_t addr; // address of the access
uint32_t size; // size of the access (presumably in bytes - TODO confirm)
};
// Classifies an access by its start address:
//   - Shared when shared memory is enabled and the address lies in
//     [SMEM_BASE_ADDR - SMEM_SIZE, SMEM_BASE_ADDR); debug builds also assert
//     that the access does not run past SMEM_BASE_ADDR,
//   - IO when the address is at or above IO_BASE_ADDR,
//   - Global otherwise.
inline AddrType get_addr_type(Word addr, uint32_t size) {
  // size is only consumed by the assert below, which may be compiled out
  __unused (size);

  const bool in_shared_window = (SM_ENABLE)
                             && (addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
                             && (addr < SMEM_BASE_ADDR);
  if (in_shared_window) {
    assert((addr + size) <= SMEM_BASE_ADDR);
    return AddrType::Shared;
  }

  return (addr >= IO_BASE_ADDR) ? AddrType::IO : AddrType::Global;
}
///////////////////////////////////////////////////////////////////////////////
enum class FpuType {
FNCP,
FMA,
@@ -116,6 +165,8 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
return os;
}
///////////////////////////////////////////////////////////////////////////////
enum class GpuType {
TMC,
WSPAWN,
@@ -137,6 +188,8 @@ inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
return os;
}
///////////////////////////////////////////////////////////////////////////////
enum class ArbiterType {
Priority,
RoundRobin
@@ -152,6 +205,30 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
///////////////////////////////////////////////////////////////////////////////
// Memory request packet.
//
// The tag is carried as 64 bits so that it matches both the constructor
// parameter and the 64-bit tag of the response packet; with a 32-bit member
// the upper half of a wide tag was silently dropped on construction.
struct MemReq {
  uint64_t addr;  // target address
  uint64_t tag;   // request identifier echoed back in the response
  bool     write; // true for a write, false for a read
  bool     is_io; // true when the request targets the IO address space

  MemReq(uint64_t _addr = 0,
         uint64_t _tag = 0,
         bool _write = false,
         bool _is_io = false
  ) : addr(_addr)
    , tag(_tag)
    , write(_write)
    , is_io(_is_io)
  {}
};
// Memory response packet; carries only the tag of the request it answers.
struct MemRsp {
  uint64_t tag;

  MemRsp(uint64_t _tag = 0)
    : tag{_tag}
  {}
};
///////////////////////////////////////////////////////////////////////////////
template <typename T>
class Queue {
protected:
@@ -164,21 +241,29 @@ public:
return queue_.empty();
}
const T& top() const {
const T& front() const {
return queue_.front();
}
T& top() {
T& front() {
return queue_.front();
}
void pop() {
queue_.pop();
const T& back() const {
return queue_.back();
}
T& back() {
return queue_.back();
}
void push(const T& value) {
queue_.push(value);
}
void pop() {
queue_.pop();
}
};
///////////////////////////////////////////////////////////////////////////////
@@ -187,20 +272,24 @@ template <typename T>
class HashTable {
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t capacity_;
uint32_t size_;
public:
HashTable(uint32_t size)
: entries_(size)
, capacity_(0)
HashTable(uint32_t capacity)
: entries_(capacity)
, size_(0)
{}
bool empty() const {
return (0 == capacity_);
return (0 == size_);
}
bool full() const {
return (capacity_ == entries_.size());
return (size_ == entries_.size());
}
uint32_t size() const {
return size_;
}
bool contains(uint32_t index) const {
@@ -225,7 +314,7 @@ public:
if (!entry.first) {
entry.first = true;
entry.second = value;
++capacity_;
++size_;
return i;
}
}
@@ -237,7 +326,7 @@ public:
auto& entry = entries_.at(index);
assert(entry.first);
entry.first = false;
--capacity_;
--size_;
}
};
@@ -287,7 +376,7 @@ public:
uint32_t j = (cursor_ + i) % n;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.top();
auto& req = req_in.front();
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
}
@@ -300,7 +389,7 @@ public:
// process incoming responses
if (!RspIn.empty()) {
auto& rsp = RspIn.top();
auto& rsp = RspIn.front();
uint32_t port_id = 0;
if (tag_shift_) {
port_id = rsp.tag & ((1 << tag_shift_)-1);
@@ -317,10 +406,10 @@ public:
}
}
std::vector<SlavePort<Req>> ReqIn;
MasterPort<Req> ReqOut;
SlavePort<Rsp> RspIn;
std::vector<MasterPort<Rsp>> RspOut;
std::vector<SimPort<Req>> ReqIn;
SimPort<Req> ReqOut;
SimPort<Rsp> RspIn;
std::vector<SimPort<Rsp>> RspOut;
};
}

View File

@@ -27,7 +27,7 @@ void Warp::eval(pipeline_trace_t *trace) {
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
DPN(2, tmask_.test(n-i-1));
DPN(2, ", PC=0x" << std::hex << PC_ << std::endl);
DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
/* Fetch and decode. */
@@ -38,7 +38,7 @@ void Warp::eval(pipeline_trace_t *trace) {
std::abort();
}
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")");
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Update trace
trace->cid = core_->id();

View File

@@ -46,6 +46,10 @@ public:
return active_;
}
// Clears the warp's active flag.
void suspend() {
active_ = false;
}
void activate() {
active_ = true;
}