simx directory name fix
This commit is contained in:
52
sim/simx/Makefile
Normal file
52
sim/simx/Makefile
Normal file
@@ -0,0 +1,52 @@
|
||||
# Build script for the simx simulator.
#   make            builds the `simx` executable from all sources in one step
#   make static     builds libsimx.a from per-file objects in obj_dir/
#   make DEBUG=<n>  builds without optimization and with -DDEBUG_LEVEL=<n>

RTL_DIR = ../hw/rtl
THIRD_PARTY_DIR = ../../third_party

CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)/cocogfx/include
CXXFLAGS += $(CONFIGS)

LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx

TOP = vx_cache_sim

SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp

# Objects are flattened into obj_dir/; VPATH lets the pattern rule below find
# sources that live outside this directory.
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))

#$(info OBJS is $(OBJS))
#$(info VPATH is $(VPATH))

# Debugging
ifdef DEBUG
	CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
else
	CXXFLAGS += -O2 -DNDEBUG
endif

PROJECT = simx

# These targets do not name files; declaring them phony keeps a stray file
# called e.g. `clean` or `all` from silently disabling the rule.
.PHONY: all static clean clean-static

all: $(PROJECT)

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

obj_dir/%.o: %.cpp
	mkdir -p obj_dir
	$(CXX) $(CXXFLAGS) -c $< -o $@

static: $(OBJS)
	$(AR) rcs lib$(PROJECT).a $(OBJS) $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

clean-static:
	rm -rf lib$(PROJECT).a obj_dir .depend

clean: clean-static
	rm -rf $(PROJECT)
|
||||
71
sim/simx/archdef.h
Normal file
71
sim/simx/archdef.h
Normal file
@@ -0,0 +1,71 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include <cstdlib>
|
||||
#include <stdio.h>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class ArchDef {
|
||||
private:
|
||||
uint16_t num_cores_;
|
||||
uint16_t num_warps_;
|
||||
uint16_t num_threads_;
|
||||
uint16_t wsize_;
|
||||
uint16_t vsize_;
|
||||
uint16_t num_regs_;
|
||||
uint16_t num_csrs_;
|
||||
uint16_t num_barriers_;
|
||||
|
||||
public:
|
||||
ArchDef(const std::string& /*arch*/,
|
||||
uint16_t num_cores,
|
||||
uint16_t num_warps,
|
||||
uint16_t num_threads)
|
||||
: num_cores_(num_cores)
|
||||
, num_warps_(num_warps)
|
||||
, num_threads_(num_threads)
|
||||
, wsize_(4)
|
||||
, vsize_(16)
|
||||
, num_regs_(32)
|
||||
, num_csrs_(4096)
|
||||
, num_barriers_(NUM_BARRIERS)
|
||||
{}
|
||||
|
||||
uint16_t wsize() const {
|
||||
return wsize_;
|
||||
}
|
||||
|
||||
uint16_t vsize() const {
|
||||
return vsize_;
|
||||
}
|
||||
|
||||
uint16_t num_regs() const {
|
||||
return num_regs_;
|
||||
}
|
||||
|
||||
uint16_t num_csrs() const {
|
||||
return num_csrs_;
|
||||
}
|
||||
|
||||
uint16_t num_barriers() const {
|
||||
return num_barriers_;
|
||||
}
|
||||
|
||||
uint16_t num_threads() const {
|
||||
return num_threads_;
|
||||
}
|
||||
|
||||
uint16_t num_warps() const {
|
||||
return num_warps_;
|
||||
}
|
||||
|
||||
uint16_t num_cores() const {
|
||||
return num_cores_;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
47
sim/simx/args.cpp
Normal file
47
sim/simx/args.cpp
Normal file
@@ -0,0 +1,47 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include "args.h"
|
||||
|
||||
using namespace vortex;
|
||||
using std::string;
|
||||
|
||||
std::string CommandLineArg::helpString_;
|
||||
std::unordered_map<string, CommandLineArg *> CommandLineArg::longArgs_;
|
||||
std::unordered_map<string, CommandLineArg *> CommandLineArg::shortArgs_;
|
||||
|
||||
// Registers this argument under both its short spelling `s` and its long
// spelling `l`, and appends `helpText` to the shared help string.
// A later registration with the same spelling replaces the earlier one.
CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
  shortArgs_[s] = this;
  longArgs_[l] = this;
  helpString_ += helpText;
}
|
||||
|
||||
// Registers this argument under its long spelling `l` only, and appends
// `helpText` to the shared help string.
CommandLineArg::CommandLineArg(string l, const char *helpText) {
  longArgs_[l] = this;
  helpString_ += helpText;
}
|
||||
|
||||
void CommandLineArg::readArgs(int argc, char **argv) {
|
||||
for (int i = 0; i < argc; i++) {
|
||||
std::unordered_map<string, CommandLineArg *>::iterator
|
||||
s = shortArgs_.find(std::string(argv[i])),
|
||||
l = longArgs_.find(std::string(argv[i]));
|
||||
|
||||
if (s != shortArgs_.end()) {
|
||||
i += s->second->read(argc - i, &argv[i]);
|
||||
} else if (l != longArgs_.end()) {
|
||||
i += l->second->read(argc - i, &argv[i]);
|
||||
} else {
|
||||
throw BadArg(string(argv[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Forgets every registered argument and empties the accumulated help text.
void CommandLineArg::clearArgs() {
  helpString_.clear();
  longArgs_.clear();
  shortArgs_.clear();
}
|
||||
|
||||
// Writes the help text accumulated from all registered arguments to `os`.
void CommandLineArg::showHelp(std::ostream &os) {
  os << helpString_;
}
|
||||
64
sim/simx/args.h
Normal file
64
sim/simx/args.h
Normal file
@@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
#include <util.h>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Exception object thrown for a command-line token that cannot be handled;
// `arg` holds the offending token.
struct BadArg {
  std::string arg;

  BadArg(std::string s) : arg(s) {}
};
|
||||
|
||||
// Base class of all command-line arguments. Constructing an instance
// registers it in the class-wide lookup tables; readArgs() later routes
// matching tokens to the instance's read().
class CommandLineArg {
public:
  // Register under a short and a long spelling.
  CommandLineArg(std::string s, std::string l, const char *helpText);

  // Register under a long spelling only.
  CommandLineArg(std::string l, const char *helpText);

  // Called with argv[0] being the matched token and argc the number of
  // tokens from there to the end. Returns how many extra tokens (beyond
  // argv[0]) were consumed.
  virtual int read(int argc, char** argv) = 0;

  // Class-wide operations on the registry.
  static void readArgs(int argc, char **argv);
  static void clearArgs();
  static void showHelp(std::ostream &os);

private:
  // Shared state; the definitions live in args.cpp.
  static std::string helpString_;
  static std::unordered_map<std::string, CommandLineArg *> longArgs_;
  static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
};
|
||||
|
||||
template <typename T> class CommandLineArgSetter : public CommandLineArg {
|
||||
public:
|
||||
CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
|
||||
CommandLineArg(s, l, ht), arg_(x) {}
|
||||
|
||||
CommandLineArgSetter(std::string l, const char *ht, T &x) :
|
||||
CommandLineArg(l, ht), arg_(x) {}
|
||||
|
||||
int read(int argc, char **argv) {
|
||||
__unused (argc);
|
||||
std::istringstream iss(argv[1]);
|
||||
iss >> arg_;
|
||||
return 1;
|
||||
}
|
||||
private:
|
||||
T &arg_;
|
||||
};
|
||||
|
||||
class CommandLineArgFlag : public CommandLineArg {
|
||||
public:
|
||||
CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x) :
|
||||
CommandLineArg(s, l, ht), arg_(x) { arg_ = false; }
|
||||
|
||||
CommandLineArgFlag(std::string l, const char *ht, bool &x) :
|
||||
CommandLineArg(l, ht), arg_(x) { arg_ = false; }
|
||||
|
||||
int read(int argc, char **argv) {
|
||||
__unused (argc, argv);
|
||||
arg_ = true;
|
||||
return 0;
|
||||
}
|
||||
private:
|
||||
bool &arg_;
|
||||
};
|
||||
|
||||
}
|
||||
574
sim/simx/cache.cpp
Normal file
574
sim/simx/cache.cpp
Normal file
@@ -0,0 +1,574 @@
|
||||
#include "cache.h"
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include <util.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <queue>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
struct params_t {
|
||||
uint32_t sets_per_bank;
|
||||
uint32_t blocks_per_set;
|
||||
uint32_t words_per_block;
|
||||
uint32_t log2_num_inputs;
|
||||
|
||||
uint32_t word_select_addr_start;
|
||||
uint32_t word_select_addr_end;
|
||||
|
||||
uint32_t bank_select_addr_start;
|
||||
uint32_t bank_select_addr_end;
|
||||
|
||||
uint32_t set_select_addr_start;
|
||||
uint32_t set_select_addr_end;
|
||||
|
||||
uint32_t tag_select_addr_start;
|
||||
uint32_t tag_select_addr_end;
|
||||
|
||||
params_t(const Cache::Config& config) {
|
||||
uint32_t bank_bits = log2ceil(config.num_banks);
|
||||
uint32_t offset_bits = config.B - config.W;
|
||||
uint32_t log2_bank_size = config.C - bank_bits;
|
||||
uint32_t index_bits = log2_bank_size - (config.B << config.A);
|
||||
assert(log2_bank_size >= config.B);
|
||||
|
||||
this->log2_num_inputs = log2ceil(config.num_inputs);
|
||||
|
||||
this->words_per_block = 1 << offset_bits;
|
||||
this->blocks_per_set = 1 << config.A;
|
||||
this->sets_per_bank = 1 << index_bits;
|
||||
|
||||
assert(config.ports_per_bank <= this->words_per_block);
|
||||
|
||||
// Word select
|
||||
this->word_select_addr_start = config.W;
|
||||
this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
|
||||
|
||||
// Bank select
|
||||
this->bank_select_addr_start = (1+this->word_select_addr_end);
|
||||
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
|
||||
|
||||
// Set select
|
||||
this->set_select_addr_start = (1+this->bank_select_addr_end);
|
||||
this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
|
||||
|
||||
// Tag select
|
||||
this->tag_select_addr_start = (1+this->set_select_addr_end);
|
||||
this->tag_select_addr_end = (config.addr_width-1);
|
||||
}
|
||||
|
||||
uint32_t addr_bank_id(uint64_t word_addr) const {
|
||||
if (bank_select_addr_end >= bank_select_addr_start)
|
||||
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t addr_set_id(uint64_t word_addr) const {
|
||||
if (set_select_addr_end >= set_select_addr_start)
|
||||
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t addr_tag(uint64_t word_addr) const {
|
||||
if (tag_select_addr_end >= tag_select_addr_start)
|
||||
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
|
||||
uint64_t addr(0);
|
||||
if (bank_select_addr_end >= bank_select_addr_start)
|
||||
addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
|
||||
if (set_select_addr_end >= set_select_addr_start)
|
||||
addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
|
||||
if (tag_select_addr_end >= tag_select_addr_start)
|
||||
addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
|
||||
return addr;
|
||||
}
|
||||
};
|
||||
|
||||
// Bookkeeping for one cache block (no data payload is modelled).
struct block_t {
  bool     valid;    // block currently holds a line
  bool     dirty;    // set on a write hit when the cache is not write-through
  uint64_t tag;      // tag of the line held
  uint32_t lru_ctr;  // reset on hit, incremented when another tag is looked up in the set
};
|
||||
|
||||
struct set_t {
|
||||
std::vector<block_t> blocks;
|
||||
set_t(uint32_t size) : blocks(size) {}
|
||||
};
|
||||
|
||||
// Per-port requester record attached to a bank request.
struct bank_req_info_t {
  bool     valid;    // this port slot is occupied
  uint32_t req_id;   // index of the core request/response port
  uint64_t req_tag;  // tag to hand back in the core response
};
|
||||
|
||||
struct bank_req_t {
|
||||
bool valid;
|
||||
bool write;
|
||||
bool mshr_replay;
|
||||
uint64_t tag;
|
||||
uint32_t set_id;
|
||||
std::vector<bank_req_info_t> infos;
|
||||
|
||||
bank_req_t(uint32_t size)
|
||||
: valid(false)
|
||||
, write(false)
|
||||
, mshr_replay(false)
|
||||
, tag(0)
|
||||
, set_id(0)
|
||||
, infos(size)
|
||||
{}
|
||||
};
|
||||
|
||||
struct mshr_entry_t : public bank_req_t {
|
||||
uint32_t block_id;
|
||||
|
||||
mshr_entry_t(uint32_t size = 0)
|
||||
: bank_req_t(size)
|
||||
, block_id(0)
|
||||
{}
|
||||
};
|
||||
|
||||
class MSHR {
|
||||
private:
|
||||
std::vector<mshr_entry_t> entries_;
|
||||
uint32_t capacity_;
|
||||
|
||||
public:
|
||||
MSHR(uint32_t size)
|
||||
: entries_(size)
|
||||
, capacity_(0)
|
||||
{}
|
||||
|
||||
bool empty() const {
|
||||
return (0 == capacity_);
|
||||
}
|
||||
|
||||
bool full() const {
|
||||
return (capacity_ == entries_.size());
|
||||
}
|
||||
|
||||
int lookup(const bank_req_t& bank_req) {
|
||||
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
|
||||
auto& entry = entries_.at(i);
|
||||
if (entry.valid
|
||||
&& entry.set_id == bank_req.set_id
|
||||
&& entry.tag == bank_req.tag) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int allocate(const bank_req_t& bank_req, uint32_t block_id) {
|
||||
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
|
||||
auto& entry = entries_.at(i);
|
||||
if (!entry.valid) {
|
||||
*(bank_req_t*)&entry = bank_req;
|
||||
entry.valid = true;
|
||||
entry.mshr_replay = false;
|
||||
entry.block_id = block_id;
|
||||
++capacity_;
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
mshr_entry_t& replay(uint32_t id) {
|
||||
auto& root_entry = entries_.at(id);
|
||||
assert(root_entry.valid);
|
||||
// make all related mshr entries for replay
|
||||
for (auto& entry : entries_) {
|
||||
if (entry.valid
|
||||
&& entry.set_id == root_entry.set_id
|
||||
&& entry.tag == root_entry.tag) {
|
||||
entry.mshr_replay = true;
|
||||
}
|
||||
}
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
bool pop(bank_req_t* out) {
|
||||
for (auto& entry : entries_) {
|
||||
if (entry.valid && entry.mshr_replay) {
|
||||
*out = entry;
|
||||
entry.valid = false;
|
||||
--capacity_;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
struct bank_t {
|
||||
std::vector<set_t> sets;
|
||||
MSHR mshr;
|
||||
|
||||
bank_t(const Cache::Config& config,
|
||||
const params_t& params)
|
||||
: sets(params.sets_per_bank, params.blocks_per_set)
|
||||
, mshr(config.mshr_size)
|
||||
{}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class Cache::Impl {
|
||||
private:
|
||||
Cache* const simobject_;
|
||||
Config config_;
|
||||
params_t params_;
|
||||
std::vector<bank_t> banks_;
|
||||
Switch<MemReq, MemRsp>::Ptr mem_switch_;
|
||||
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
|
||||
std::vector<SimPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
|
||||
PerfStats perf_stats_;
|
||||
uint64_t pending_read_reqs_;
|
||||
uint64_t pending_write_reqs_;
|
||||
uint64_t pending_fill_reqs_;
|
||||
uint32_t flush_cycles_;
|
||||
|
||||
public:
|
||||
// Builds the banks and wires the memory side of the cache:
//   bank ports -> [mem_arb, only when num_banks > 1] -> bypass_arb input 0
//   IO/bypass traffic                                -> bypass_arb input 1
//   bypass_arb output                                -> Cache::MemReqPort
// Responses travel the same path in the opposite direction.
Impl(Cache* simobject, const Config& config)
  : simobject_(simobject)
  , config_(config)
  , params_(config)
  , banks_(config.num_banks, {config, params_})
  , mem_req_ports_(config.num_banks, simobject)
  , mem_rsp_ports_(config.num_banks, simobject)
  , pending_read_reqs_(0)
  , pending_write_reqs_(0)
  , pending_fill_reqs_(0)
{
  bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
  bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
  simobject->MemRspPort.bind(&bypass_switch_->RspIn);

  if (config.num_banks <= 1) {
    // single bank: connect it straight to the bypass switch
    mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
    bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
  } else {
    // several banks: arbitrate between them first
    mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
    const uint32_t num_banks = config.num_banks;
    for (uint32_t b = 0; b < num_banks; ++b) {
      mem_req_ports_.at(b).bind(&mem_switch_->ReqIn.at(b));
      mem_switch_->RspOut.at(b).bind(&mem_rsp_ports_.at(b));
    }
    mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
    bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
  }

  // calculate tag flush cycles: one per block of a bank
  flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
}
|
||||
|
||||
// Read-only view of the counters collected so far.
const PerfStats& perf_stats() const { return perf_stats_; }
|
||||
|
||||
void step(uint64_t cycle) {
|
||||
// wait on flush cycles
|
||||
if (flush_cycles_ != 0) {
|
||||
--flush_cycles_;
|
||||
return;
|
||||
}
|
||||
|
||||
// calculate memory latency
|
||||
perf_stats_.mem_latency += pending_fill_reqs_;
|
||||
|
||||
// handle bypasss responses
|
||||
auto& bypass_port = bypass_switch_->RspOut.at(1);
|
||||
if (!bypass_port.empty()) {
|
||||
auto& mem_rsp = bypass_port.front();
|
||||
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
|
||||
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
|
||||
MemRsp core_rsp(tag);
|
||||
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
|
||||
bypass_port.pop();
|
||||
}
|
||||
|
||||
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
|
||||
|
||||
// handle MSHR replay
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs.at(bank_id);
|
||||
bank.mshr.pop(&pipeline_req);
|
||||
}
|
||||
|
||||
// handle memory fills
|
||||
std::vector<bool> pending_fill_req(config_.num_banks, false);
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
|
||||
if (!mem_rsp_port.empty()) {
|
||||
auto& mem_rsp = mem_rsp_port.front();
|
||||
this->processMemoryFill(bank_id, mem_rsp.tag);
|
||||
pending_fill_req.at(bank_id) = true;
|
||||
mem_rsp_port.pop();
|
||||
}
|
||||
}
|
||||
|
||||
// handle incoming core requests
|
||||
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
|
||||
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
|
||||
if (core_req_port.empty())
|
||||
continue;
|
||||
|
||||
auto& core_req = core_req_port.front();
|
||||
|
||||
// check cache bypassing
|
||||
if (core_req.is_io) {
|
||||
// send IO request
|
||||
this->processIORequest(core_req, req_id);
|
||||
|
||||
// remove request
|
||||
core_req_port.pop();
|
||||
continue;
|
||||
}
|
||||
|
||||
auto bank_id = params_.addr_bank_id(core_req.addr);
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
auto port_id = req_id % config_.ports_per_bank;
|
||||
|
||||
// create bank request
|
||||
bank_req_t bank_req(config_.ports_per_bank);
|
||||
bank_req.valid = true;
|
||||
bank_req.write = core_req.write;
|
||||
bank_req.mshr_replay = false;
|
||||
bank_req.tag = tag;
|
||||
bank_req.set_id = set_id;
|
||||
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
|
||||
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs.at(bank_id);
|
||||
|
||||
// check pending MSHR replay
|
||||
if (pipeline_req.valid
|
||||
&& pipeline_req.mshr_replay) {
|
||||
// stall
|
||||
continue;
|
||||
}
|
||||
|
||||
// check pending fill request
|
||||
if (pending_fill_req.at(bank_id)) {
|
||||
// stall
|
||||
continue;
|
||||
}
|
||||
|
||||
// check MSHR capacity if read or writeback
|
||||
if ((!core_req.write || !config_.write_through)
|
||||
&& bank.mshr.full()) {
|
||||
++perf_stats_.mshr_stalls;
|
||||
continue;
|
||||
}
|
||||
|
||||
// check bank conflicts
|
||||
if (pipeline_req.valid) {
|
||||
// check port conflict
|
||||
if (pipeline_req.write != core_req.write
|
||||
|| pipeline_req.set_id != set_id
|
||||
|| pipeline_req.tag != tag
|
||||
|| pipeline_req.infos[port_id].valid) {
|
||||
++perf_stats_.bank_stalls;
|
||||
continue;
|
||||
}
|
||||
// update pending request infos
|
||||
pipeline_req.infos[port_id] = bank_req.infos[port_id];
|
||||
} else {
|
||||
// schedule new request
|
||||
pipeline_req = bank_req;
|
||||
}
|
||||
|
||||
if (core_req.write)
|
||||
++perf_stats_.writes;
|
||||
else
|
||||
++perf_stats_.reads;
|
||||
|
||||
// remove request
|
||||
auto time = core_req_port.pop();
|
||||
perf_stats_.pipeline_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
// process active request
|
||||
this->processBankRequest(pipeline_reqs);
|
||||
}
|
||||
|
||||
void processIORequest(const MemReq& core_req, uint32_t req_id) {
|
||||
{
|
||||
MemReq mem_req(core_req);
|
||||
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
|
||||
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
|
||||
}
|
||||
|
||||
if (core_req.write && config_.write_reponse) {
|
||||
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
|
||||
// update block
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& entry = bank.mshr.replay(mshr_id);
|
||||
auto& set = bank.sets.at(entry.set_id);
|
||||
auto& block = set.blocks.at(entry.block_id);
|
||||
block.valid = true;
|
||||
block.tag = entry.tag;
|
||||
--pending_fill_reqs_;
|
||||
}
|
||||
|
||||
void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
auto& pipeline_req = pipeline_reqs.at(bank_id);
|
||||
if (!pipeline_req.valid)
|
||||
continue;
|
||||
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& set = bank.sets.at(pipeline_req.set_id);
|
||||
|
||||
if (pipeline_req.mshr_replay) {
|
||||
// send core response
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
|
||||
}
|
||||
} else {
|
||||
bool hit = false;
|
||||
bool found_free_block = false;
|
||||
int hit_block_id = 0;
|
||||
int repl_block_id = 0;
|
||||
uint32_t max_cnt = 0;
|
||||
|
||||
for (int i = 0, n = set.blocks.size(); i < n; ++i) {
|
||||
auto& block = set.blocks.at(i);
|
||||
if (block.valid) {
|
||||
if (block.tag == pipeline_req.tag) {
|
||||
block.lru_ctr = 0;
|
||||
hit_block_id = i;
|
||||
hit = true;
|
||||
} else {
|
||||
++block.lru_ctr;
|
||||
}
|
||||
if (max_cnt < block.lru_ctr) {
|
||||
max_cnt = block.lru_ctr;
|
||||
repl_block_id = i;
|
||||
}
|
||||
} else {
|
||||
found_free_block = true;
|
||||
repl_block_id = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (hit) {
|
||||
//
|
||||
// Hit handling
|
||||
//
|
||||
if (pipeline_req.write) {
|
||||
// handle write hit
|
||||
auto& hit_block = set.blocks.at(hit_block_id);
|
||||
if (config_.write_through) {
|
||||
// forward write request to memory
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
} else {
|
||||
// mark block as dirty
|
||||
hit_block.dirty = true;
|
||||
}
|
||||
}
|
||||
// send core response
|
||||
if (!pipeline_req.write || config_.write_reponse) {
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// Miss handling
|
||||
//
|
||||
if (pipeline_req.write)
|
||||
++perf_stats_.write_misses;
|
||||
else
|
||||
++perf_stats_.read_misses;
|
||||
|
||||
if (!found_free_block && !config_.write_through) {
|
||||
// write back dirty block
|
||||
auto& repl_block = set.blocks.at(repl_block_id);
|
||||
if (repl_block.dirty) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
++perf_stats_.evictions;
|
||||
}
|
||||
}
|
||||
|
||||
if (pipeline_req.write && config_.write_through) {
|
||||
// forward write request to memory
|
||||
{
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
// send core response
|
||||
if (config_.write_reponse) {
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// MSHR lookup
|
||||
int pending = bank.mshr.lookup(pipeline_req);
|
||||
|
||||
// allocate MSHR
|
||||
int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
|
||||
|
||||
// send fill request
|
||||
if (pending == -1) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = false;
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
++pending_fill_reqs_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Creates `config.num_inputs` core-side request/response port pairs, the
// single memory-side port pair, and the private implementation object.
Cache::Cache(const SimContext& ctx, const char* name, const Config& config)
  : SimObject<Cache>(ctx, name)
  , CoreReqPorts(config.num_inputs, this)
  , CoreRspPorts(config.num_inputs, this)
  , MemReqPort(this)
  , MemRspPort(this)
  , impl_(new Impl(this, config))
{
}
|
||||
|
||||
// Releases the implementation object allocated by the constructor.
Cache::~Cache() {
  delete impl_;
}
|
||||
|
||||
// Advances the cache by one cycle; all work happens in the implementation.
void Cache::step(uint64_t cycle) {
  impl_->step(cycle);
}
|
||||
|
||||
// Counters collected by the implementation so far.
const Cache::PerfStats& Cache::perf_stats() const {
  return impl_->perf_stats();
}
|
||||
66
sim/simx/cache.h
Normal file
66
sim/simx/cache.h
Normal file
@@ -0,0 +1,66 @@
|
||||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include "memsim.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Cache simulation object. Cores talk to CoreReqPorts/CoreRspPorts (one pair
// per input); the next memory level is reached through MemReqPort/MemRspPort.
// The behavior lives in the private Impl class (see cache.cpp).
class Cache : public SimObject<Cache> {
public:
  // Static configuration. Kept as a plain aggregate: callers brace-initialize
  // it positionally, so the member order is part of the interface.
  struct Config {
    uint8_t  C;              // log2 cache size
    uint8_t  B;              // log2 block size
    uint8_t  W;              // log2 word size
    uint8_t  A;              // log2 associativity
    uint8_t  addr_width;     // word address bits
    uint8_t  num_banks;      // number of banks
    uint8_t  ports_per_bank; // number of ports per bank
    uint8_t  num_inputs;     // number of inputs
    bool     write_through;  // is write-through
    bool     write_reponse;  // enable write response
    uint16_t victim_size;    // victim cache size
    uint16_t mshr_size;      // MSHR buffer size
    uint8_t  latency;        // pipeline latency
  };

  // Performance counters; all start at zero.
  struct PerfStats {
    uint64_t reads;
    uint64_t writes;
    uint64_t read_misses;
    uint64_t write_misses;
    uint64_t evictions;
    uint64_t pipeline_stalls;
    uint64_t bank_stalls;
    uint64_t mshr_stalls;
    uint64_t mem_latency;

    PerfStats()
      : reads(0), writes(0)
      , read_misses(0), write_misses(0)
      , evictions(0)
      , pipeline_stalls(0), bank_stalls(0), mshr_stalls(0)
      , mem_latency(0)
    {}
  };

  // Core-facing ports, indexed by input.
  std::vector<SimPort<MemReq>> CoreReqPorts;
  std::vector<SimPort<MemRsp>> CoreRspPorts;

  // Memory-facing ports.
  SimPort<MemReq> MemReqPort;
  SimPort<MemRsp> MemRspPort;

  Cache(const SimContext& ctx, const char* name, const Config& config);
  ~Cache();

  // Advances the cache by one simulation cycle.
  void step(uint64_t cycle);

  const PerfStats& perf_stats() const;

private:
  class Impl;
  Impl* impl_;  // allocated in the constructor, deleted in the destructor
};
|
||||
|
||||
}
|
||||
17
sim/simx/constants.h
Normal file
17
sim/simx/constants.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
|
||||
#ifndef MEM_LATENCY
|
||||
#define MEM_LATENCY 24
|
||||
#endif
|
||||
|
||||
namespace vortex {
|
||||
|
||||
enum Constants {
|
||||
|
||||
SMEM_BANK_OFFSET = log2ceil(sizeof(Word)) + log2ceil(STACK_SIZE / sizeof(Word)),
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
648
sim/simx/core.cpp
Normal file
648
sim/simx/core.cpp
Normal file
@@ -0,0 +1,648 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <util.h>
|
||||
#include "types.h"
|
||||
#include "archdef.h"
|
||||
#include "mem.h"
|
||||
#include "decode.h"
|
||||
#include "core.h"
|
||||
#include "debug.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Builds one core: per-warp state, the L1 instruction/data caches, shared
// memory, the execute units and the switches that connect them, then
// activates thread 0 of warp 0 and installs the memory perf callbacks.
Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
  : SimObject(ctx, "Core")
  , MemRspPort(this)
  , MemReqPort(this)
  , id_(id)
  , arch_(arch)
  , decoder_(arch)
  , mmu_(0, arch.wsize(), true)
  , tex_units_(NUM_TEX_UNITS, this)
  , warps_(arch.num_warps())
  , barriers_(arch.num_barriers(), 0)
  , csrs_(arch.num_csrs(), 0)
  , fcsrs_(arch.num_warps(), 0)
  , ibuffers_(arch.num_warps(), IBUF_SIZE)
  , scoreboard_(arch_)
  , exe_units_(static_cast<int>(ExeType::MAX))
  , icache_(Cache::Create("Icache", Cache::Config{
      log2ceil(ICACHE_SIZE),   // C
      log2ceil(L1_BLOCK_SIZE), // B
      2,                       // W
      0,                       // A
      32,                      // address bits
      1,                       // number of banks
      1,                       // number of ports
      1,                       // request size
      true,                    // write-through
      false,                   // write response
      0,                       // victim size
      NUM_WARPS,               // mshr
      2,                       // pipeline latency
    }))
  , dcache_(Cache::Create("Dcache", Cache::Config{
      log2ceil(DCACHE_SIZE),   // C
      log2ceil(L1_BLOCK_SIZE), // B
      2,                       // W
      0,                       // A
      32,                      // address bits
      DCACHE_NUM_BANKS,        // number of banks
      DCACHE_NUM_PORTS,        // number of ports
      (uint8_t)arch.num_threads(), // request size
      true,                    // write-through
      false,                   // write response
      0,                       // victim size
      DCACHE_MSHR_SIZE,        // mshr
      4,                       // pipeline latency
    }))
  , shared_mem_(SharedMem::Create("sharedmem", SharedMem::Config{
      arch.num_threads(),
      arch.num_threads(),
      Constants::SMEM_BANK_OFFSET,
      1,
      false
    }))
  , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
  , dcache_switch_(arch.num_threads())
  , fetch_latch_("fetch")
  , decode_latch_("decode")
  , pending_icache_(arch_.num_warps())
  , active_warps_(1)
  , stalled_warps_(0)
  , last_schedule_wid_(0)
  , issued_instrs_(0)
  , committed_instrs_(0)
  , csr_tex_unit_(0)
  , ecall_(false)
  , ebreak_(false)
  , perf_mem_pending_reads_(0)
{
  // create the warps
  for (int wid = 0; wid < arch_.num_warps(); ++wid) {
    warps_.at(wid) = std::make_shared<Warp>(this, wid);
  }

  // register execute units
  exe_units_.at(static_cast<int>(ExeType::NOP)) = SimPlatform::instance().CreateObject<NopUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::ALU)) = SimPlatform::instance().CreateObject<AluUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::LSU)) = SimPlatform::instance().CreateObject<LsuUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::CSR)) = SimPlatform::instance().CreateObject<CsrUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::FPU)) = SimPlatform::instance().CreateObject<FpuUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::GPU)) = SimPlatform::instance().CreateObject<GpuUnit>(this);

  // connect l1 switch: icache on input 0, dcache on input 1
  icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn.at(0));
  dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn.at(1));
  l1_mem_switch_->RspOut.at(0).bind(&icache_->MemRspPort);
  l1_mem_switch_->RspOut.at(1).bind(&dcache_->MemRspPort);
  this->MemRspPort.bind(&l1_mem_switch_->RspIn);
  l1_mem_switch_->ReqOut.bind(&this->MemReqPort);

  // lsu/tex switch: one per thread, with a second input only when the
  // texture extension is compiled in
  const uint32_t num_threads = arch.num_threads();
  for (uint32_t tid = 0; tid < num_threads; ++tid) {
    auto& sw = dcache_switch_.at(tid);
#ifdef EXT_TEX_ENABLE
    sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 2);
#else
    sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 1);
#endif
    sw->ReqOut.bind(&dcache_->CoreReqPorts.at(tid));
    dcache_->CoreRspPorts.at(tid).bind(&sw->RspIn);
  }

  // activate warp0
  warps_.at(0)->setTmask(0, true);

  // memory perf callbacks: count requests by kind and track outstanding reads
  MemReqPort.tx_callback([this](const MemReq& req, uint64_t /*cycle*/) {
    perf_stats_.mem_reads   += !req.write;
    perf_stats_.mem_writes  += req.write;
    perf_mem_pending_reads_ += !req.write;
  });
  MemRspPort.tx_callback([this](const MemRsp&, uint64_t /*cycle*/) {
    --perf_mem_pending_reads_;
  });
}
|
||||
|
||||
// Prints whatever is still sitting in the print buffers, one line per
// non-empty buffer, prefixed with "#<key>: ".
Core::~Core() {
  for (auto& entry : print_bufs_) {
    auto text = entry.second.str();
    if (text.empty())
      continue;
    std::cout << "#" << entry.first << ": " << text << std::endl;
  }
}
|
||||
|
||||
// Binds the given RAM to this core's memory unit, passing 0 and 0xFFFFFFFF
// as the range arguments of MemoryUnit::attach.
void Core::attach_ram(RAM* ram) {
  auto& backing_store = *ram;
  mmu_.attach(backing_store, 0, 0xFFFFFFFF);
}
|
||||
|
||||
void Core::step(uint64_t cycle) {
|
||||
this->commit(cycle);
|
||||
this->execute(cycle);
|
||||
this->decode(cycle);
|
||||
this->fetch(cycle);
|
||||
this->schedule(cycle);
|
||||
|
||||
// update perf counter
|
||||
perf_stats_.mem_latency += perf_mem_pending_reads_;
|
||||
|
||||
DPN(2, std::flush);
|
||||
}
|
||||
|
||||
void Core::schedule(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
bool foundSchedule = false;
|
||||
int scheduled_warp = last_schedule_wid_;
|
||||
|
||||
// round robin scheduling
|
||||
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
|
||||
scheduled_warp = (scheduled_warp + 1) % nw;
|
||||
bool warp_active = active_warps_.test(scheduled_warp);
|
||||
bool warp_stalled = stalled_warps_.test(scheduled_warp);
|
||||
if (warp_active && !warp_stalled) {
|
||||
last_schedule_wid_ = scheduled_warp;
|
||||
foundSchedule = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundSchedule)
|
||||
return;
|
||||
|
||||
// suspend warp until decode
|
||||
stalled_warps_.set(scheduled_warp);
|
||||
|
||||
auto& warp = warps_.at(scheduled_warp);
|
||||
|
||||
uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_;
|
||||
|
||||
auto trace = new pipeline_trace_t(uuid, arch_);
|
||||
|
||||
warp->eval(trace);
|
||||
|
||||
DT(3, cycle, "pipeline-schedule: " << *trace);
|
||||
|
||||
// advance to fetch stage
|
||||
fetch_latch_.push(trace);
|
||||
}
|
||||
|
||||
void Core::fetch(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
// handle icache reponse
|
||||
auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
|
||||
if (!icache_rsp_port.empty()){
|
||||
auto& mem_rsp = icache_rsp_port.front();
|
||||
auto trace = pending_icache_.at(mem_rsp.tag);
|
||||
decode_latch_.push(trace);
|
||||
DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
||||
pending_icache_.release(mem_rsp.tag);
|
||||
icache_rsp_port.pop();
|
||||
}
|
||||
|
||||
// send icache request
|
||||
if (!fetch_latch_.empty()) {
|
||||
auto trace = fetch_latch_.front();
|
||||
MemReq mem_req;
|
||||
mem_req.addr = trace->PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
fetch_latch_.pop();
|
||||
}
|
||||
}
|
||||
|
||||
void Core::decode(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
if (decode_latch_.empty())
|
||||
return;
|
||||
|
||||
auto trace = decode_latch_.front();
|
||||
|
||||
// check ibuffer capacity
|
||||
auto& ibuffer = ibuffers_.at(trace->wid);
|
||||
if (ibuffer.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** ibuffer-stall: " << *trace);
|
||||
}
|
||||
++perf_stats_.ibuf_stalls;
|
||||
return;
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
// release warp
|
||||
if (!trace->fetch_stall) {
|
||||
stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
|
||||
// update perf counters
|
||||
uint32_t active_threads = trace->tmask.count();
|
||||
if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::LOAD)
|
||||
perf_stats_.loads += active_threads;
|
||||
if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::STORE)
|
||||
perf_stats_.stores += active_threads;
|
||||
if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH)
|
||||
perf_stats_.branches += active_threads;
|
||||
|
||||
DT(3, cycle, "pipeline-decode: " << *trace);
|
||||
|
||||
// insert to ibuffer
|
||||
ibuffer.push(trace);
|
||||
|
||||
decode_latch_.pop();
|
||||
}
|
||||
|
||||
void Core::execute(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
// issue ibuffer instructions
|
||||
for (auto& ibuffer : ibuffers_) {
|
||||
if (ibuffer.empty())
|
||||
continue;
|
||||
|
||||
auto trace = ibuffer.top();
|
||||
|
||||
// check scoreboard
|
||||
if (scoreboard_.in_use(trace)) {
|
||||
if (!trace->suspend()) {
|
||||
DTH(3, cycle, "*** scoreboard-stall: dependents={");
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
|
||||
auto& use = uses.at(i);
|
||||
__unused (use);
|
||||
if (i) DTN(3, ", ");
|
||||
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
|
||||
}
|
||||
DTN(3, "}, " << *trace << std::endl);
|
||||
}
|
||||
++perf_stats_.scrb_stalls;
|
||||
continue;
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.reserve(trace);
|
||||
|
||||
DT(3, cycle, "pipeline-issue: " << *trace);
|
||||
|
||||
// push to execute units
|
||||
auto& exe_unit = exe_units_.at((int)trace->exe_type);
|
||||
exe_unit->Input.send(trace, 1);
|
||||
|
||||
ibuffer.pop();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void Core::commit(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
// commit completed instructions
|
||||
bool wb = false;
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
if (!exe_unit->Output.empty()) {
|
||||
auto trace = exe_unit->Output.front();
|
||||
|
||||
// allow only one commit that updates registers
|
||||
if (trace->wb && wb)
|
||||
continue;
|
||||
wb |= trace->wb;
|
||||
|
||||
// advance to commit stage
|
||||
DT(3, cycle, "pipeline-commit: " << *trace);
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.release(trace);
|
||||
|
||||
assert(committed_instrs_ <= issued_instrs_);
|
||||
++committed_instrs_;
|
||||
|
||||
perf_stats_.instrs += trace->tmask.count();
|
||||
|
||||
// delete the trace
|
||||
delete trace;
|
||||
|
||||
exe_unit->Output.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Spawns warps 1 .. min(num_warps, arch warps) - 1 at the given PC.
//
// Each spawned warp gets its PC set to nextPC and thread 0 enabled.
// Returns a mask constructed from the value 1 (i.e. warp 0) with one
// additional bit set for every warp that was spawned.
WarpMask Core::wspawn(int num_warps, int nextPC) {
  WarpMask ret(1);
  int active_warps = std::min<int>(num_warps, arch_.num_warps());
  DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
  for (int i = 1; i < active_warps; ++i) {
    // bind by reference: no need to copy the shared_ptr (atomic refcount)
    auto& warp = warps_.at(i);
    warp->setPC(nextPC);
    warp->setTmask(0, true);
    ret.set(i);
  }
  // return the local by name so copy elision / implicit move can apply;
  // wrapping it in std::move would inhibit NRVO
  return ret;
}
|
||||
|
||||
// Registers warp_id at barrier bar_id.
//
// While fewer than `count` warps have arrived, the calling warp is suspended
// and an empty mask is returned. Once the count is reached, every warp
// recorded at the barrier is activated, the barrier is cleared, and the mask
// of released warps is returned.
WarpMask Core::barrier(int bar_id, int count, int warp_id) {
  WarpMask ret(0);
  auto& barrier = barriers_.at(bar_id);
  barrier.set(warp_id);
  if (barrier.count() < (size_t)count) {
    warps_.at(warp_id)->suspend();
    DP(3, "*** Suspend warp #" << warp_id << " at barrier #" << bar_id);
    // return the local by name (std::move here would inhibit NRVO)
    return ret;
  }
  for (int i = 0; i < arch_.num_warps(); ++i) {
    if (barrier.test(i)) {
      DP(3, "*** Resume warp #" << i << " at barrier #" << bar_id);
      warps_.at(i)->activate();
      ret.set(i);
    }
  }
  barrier.reset();
  return ret;
}
|
||||
|
||||
// Reads `size` bytes at `addr` through the memory unit and returns them in
// a Word.
Word Core::icache_read(Addr addr, Size size) {
  // zero-initialize: mmu_.read() is only asked to fill `size` bytes, so a
  // read narrower than Word would otherwise return indeterminate upper bytes
  Word data = 0;
  mmu_.read(&data, addr, size, 0);
  return data;
}
|
||||
|
||||
// Reads `size` bytes at `addr` through the memory unit and returns them in
// a Word.
Word Core::dcache_read(Addr addr, Size size) {
  // zero-initialize: mmu_.read() is only asked to fill `size` bytes, so a
  // read narrower than Word would otherwise return indeterminate upper bytes
  Word data = 0;
  mmu_.read(&data, addr, size, 0);
  return data;
}
|
||||
|
||||
// Writes `data` to `addr`. Addresses inside the console window
// [IO_COUT_ADDR, IO_COUT_ADDR + IO_COUT_SIZE - 1] are redirected to the
// console buffer; everything else goes through the memory unit.
void Core::dcache_write(Addr addr, Word data, Size size) {
  const bool is_console = (addr >= IO_COUT_ADDR)
                       && (addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1));
  if (is_console) {
    this->writeToStdOut(addr, data);
    return;
  }
  mmu_.write(&data, addr, size, 0);
}
|
||||
|
||||
// Forwards a texture read (u, v, lod) to texture unit `unit` and returns
// its result; `mem_addrs` is passed through to the unit unchanged.
Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<mem_addr_size_t>* mem_addrs) {
  auto& tex_unit = tex_units_.at(unit);
  return tex_unit.read(u, v, lod, mem_addrs);
}
|
||||
|
||||
// Appends one character to the console buffer selected by the offset of
// `addr` inside the console window. When the character is a newline the
// buffered line is printed, prefixed by the buffer id, and the buffer is
// cleared.
void Core::writeToStdOut(Addr addr, Word data) {
  const uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
  const char ch = (char)data;

  auto& line = print_bufs_[tid];
  line << ch;

  if (ch != '\n')
    return;

  std::cout << std::dec << "#" << tid << ": " << line.str() << std::flush;
  line.str("");
}
|
||||
|
||||
// Returns the value of CSR `addr` as seen by thread `tid` of warp `wid`.
//
// 64-bit performance counters are exposed as a low half (`& 0xffffffff`)
// and a high half (`>> 32`, the *_H addresses). Addresses inside the two
// user-defined MPM windows read as 0. Any address that is not recognized
// prints an error and aborts the process.
Word Core::get_csr(Addr addr, int tid, int wid) {
  switch (addr) {
  // machine-mode CSRs that always read as zero here
  case CSR_SATP:
  case CSR_PMPCFG0:
  case CSR_PMPADDR0:
  case CSR_MSTATUS:
  case CSR_MISA:
  case CSR_MEDELEG:
  case CSR_MIDELEG:
  case CSR_MIE:
  case CSR_MTVEC:
  case CSR_MEPC:
    return 0;

  // per-warp floating-point control/status: flags in bits [4:0], rounding mode above
  case CSR_FFLAGS: return fcsrs_.at(wid) & 0x1F;
  case CSR_FRM:    return (fcsrs_.at(wid) >> 5);
  case CSR_FCSR:   return fcsrs_.at(wid);

  // thread / warp / core identification
  case CSR_WTID: return tid;                                  // thread id within the warp
  case CSR_LTID: return tid + (wid * arch_.num_threads());    // thread id within the core
  case CSR_GTID:                                              // thread id within the processor
    return tid + (wid * arch_.num_threads()) +
           (arch_.num_threads() * arch_.num_warps() * id_);
  case CSR_LWID: return wid;                                  // warp id within the core
  case CSR_GWID: return wid + (arch_.num_warps() * id_);      // warp id within the processor
  case CSR_GCID: return id_;                                  // core id
  case CSR_TMASK: return warps_.at(wid)->getTmask();          // thread mask of warp `wid`

  // architecture dimensions
  case CSR_NT: return arch_.num_threads();                    // threads per warp
  case CSR_NW: return arch_.num_warps();                      // warps per core
  case CSR_NC: return arch_.num_cores();                      // number of cores

  // retired-instruction and cycle counters
  case CSR_MINSTRET:   return perf_stats_.instrs & 0xffffffff;
  case CSR_MINSTRET_H: return (Word)(perf_stats_.instrs >> 32);
  case CSR_MCYCLE:     return (Word)SimPlatform::instance().cycles();
  case CSR_MCYCLE_H:   return (Word)(SimPlatform::instance().cycles() >> 32);

  // pipeline stall counters
  case CSR_MPM_IBUF_ST:   return perf_stats_.ibuf_stalls & 0xffffffff;
  case CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
  case CSR_MPM_SCRB_ST:   return perf_stats_.scrb_stalls & 0xffffffff;
  case CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
  case CSR_MPM_ALU_ST:    return perf_stats_.alu_stalls & 0xffffffff;
  case CSR_MPM_ALU_ST_H:  return perf_stats_.alu_stalls >> 32;
  case CSR_MPM_LSU_ST:    return perf_stats_.lsu_stalls & 0xffffffff;
  case CSR_MPM_LSU_ST_H:  return perf_stats_.lsu_stalls >> 32;
  case CSR_MPM_CSR_ST:    return perf_stats_.csr_stalls & 0xffffffff;
  case CSR_MPM_CSR_ST_H:  return perf_stats_.csr_stalls >> 32;
  case CSR_MPM_FPU_ST:    return perf_stats_.fpu_stalls & 0xffffffff;
  case CSR_MPM_FPU_ST_H:  return perf_stats_.fpu_stalls >> 32;
  case CSR_MPM_GPU_ST:    return perf_stats_.gpu_stalls & 0xffffffff;
  case CSR_MPM_GPU_ST_H:  return perf_stats_.gpu_stalls >> 32;

  // instruction mix counters
  case CSR_MPM_LOADS:      return perf_stats_.loads & 0xffffffff;
  case CSR_MPM_LOADS_H:    return perf_stats_.loads >> 32;
  case CSR_MPM_STORES:     return perf_stats_.stores & 0xffffffff;
  case CSR_MPM_STORES_H:   return perf_stats_.stores >> 32;
  case CSR_MPM_BRANCHES:   return perf_stats_.branches & 0xffffffff;
  case CSR_MPM_BRANCHES_H: return perf_stats_.branches >> 32;

  // instruction cache counters
  case CSR_MPM_ICACHE_READS:    return icache_->perf_stats().reads & 0xffffffff;
  case CSR_MPM_ICACHE_READS_H:  return icache_->perf_stats().reads >> 32;
  case CSR_MPM_ICACHE_MISS_R:   return icache_->perf_stats().read_misses & 0xffffffff;
  case CSR_MPM_ICACHE_MISS_R_H: return icache_->perf_stats().read_misses >> 32;

  // data cache counters
  case CSR_MPM_DCACHE_READS:     return dcache_->perf_stats().reads & 0xffffffff;
  case CSR_MPM_DCACHE_READS_H:   return dcache_->perf_stats().reads >> 32;
  case CSR_MPM_DCACHE_WRITES:    return dcache_->perf_stats().writes & 0xffffffff;
  case CSR_MPM_DCACHE_WRITES_H:  return dcache_->perf_stats().writes >> 32;
  case CSR_MPM_DCACHE_MISS_R:    return dcache_->perf_stats().read_misses & 0xffffffff;
  case CSR_MPM_DCACHE_MISS_R_H:  return dcache_->perf_stats().read_misses >> 32;
  case CSR_MPM_DCACHE_MISS_W:    return dcache_->perf_stats().write_misses & 0xffffffff;
  case CSR_MPM_DCACHE_MISS_W_H:  return dcache_->perf_stats().write_misses >> 32;
  case CSR_MPM_DCACHE_BANK_ST:   return dcache_->perf_stats().bank_stalls & 0xffffffff;
  case CSR_MPM_DCACHE_BANK_ST_H: return dcache_->perf_stats().bank_stalls >> 32;
  case CSR_MPM_DCACHE_MSHR_ST:   return dcache_->perf_stats().mshr_stalls & 0xffffffff;
  case CSR_MPM_DCACHE_MSHR_ST_H: return dcache_->perf_stats().mshr_stalls >> 32;

  // shared memory counters
  case CSR_MPM_SMEM_READS:     return shared_mem_->perf_stats().reads & 0xffffffff;
  case CSR_MPM_SMEM_READS_H:   return shared_mem_->perf_stats().reads >> 32;
  case CSR_MPM_SMEM_WRITES:    return shared_mem_->perf_stats().writes & 0xffffffff;
  case CSR_MPM_SMEM_WRITES_H:  return shared_mem_->perf_stats().writes >> 32;
  case CSR_MPM_SMEM_BANK_ST:   return shared_mem_->perf_stats().bank_stalls & 0xffffffff;
  case CSR_MPM_SMEM_BANK_ST_H: return shared_mem_->perf_stats().bank_stalls >> 32;

  // external memory counters
  case CSR_MPM_MEM_READS:    return perf_stats_.mem_reads & 0xffffffff;
  case CSR_MPM_MEM_READS_H:  return perf_stats_.mem_reads >> 32;
  case CSR_MPM_MEM_WRITES:   return perf_stats_.mem_writes & 0xffffffff;
  case CSR_MPM_MEM_WRITES_H: return perf_stats_.mem_writes >> 32;
  case CSR_MPM_MEM_LAT:      return perf_stats_.mem_latency & 0xffffffff;
  case CSR_MPM_MEM_LAT_H:    return perf_stats_.mem_latency >> 32;

#ifdef EXT_TEX_ENABLE
  // texture unit counters
  case CSR_MPM_TEX_READS:   return perf_stats_.tex_reads & 0xffffffff;
  case CSR_MPM_TEX_READS_H: return perf_stats_.tex_reads >> 32;
  case CSR_MPM_TEX_LAT:     return perf_stats_.tex_latency & 0xffffffff;
  case CSR_MPM_TEX_LAT_H:   return perf_stats_.tex_latency >> 32;
#endif

  default:
    break;
  }

  // user-defined MPM CSRs: accepted, read as zero
  const bool is_user_mpm =
       (addr >= CSR_MPM_BASE && addr < (CSR_MPM_BASE + 32))
    || (addr >= CSR_MPM_BASE_H && addr < (CSR_MPM_BASE_H + 32));
  if (is_user_mpm)
    return 0;

#ifdef EXT_TEX_ENABLE
  // currently selected texture unit
  if (addr == CSR_TEX_UNIT)
    return csr_tex_unit_;

  // state register of the currently selected texture unit
  if (addr >= CSR_TEX_STATE_BEGIN
   && addr < CSR_TEX_STATE_END) {
    uint32_t state = CSR_TEX_STATE(addr);
    return tex_units_.at(csr_tex_unit_).get_state(state);
  }
#endif

  std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
  std::abort();

  return 0;
}
|
||||
|
||||
// Writes `value` to CSR `addr` for warp `wid` (the thread id is unused).
//
// The floating-point CSRs update the per-warp fcsr byte; with
// EXT_TEX_ENABLE the texture CSRs select a unit or set its state; every
// other address is stored in the generic CSR table.
void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
  if (addr == CSR_FFLAGS) {
    // replace the flag bits [4:0], keep the rest
    fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
    return;
  }
  if (addr == CSR_FRM) {
    // replace the rounding-mode bits [7:5], keep the rest
    fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5);
    return;
  }
  if (addr == CSR_FCSR) {
    fcsrs_.at(wid) = value & 0xff;
    return;
  }

#ifdef EXT_TEX_ENABLE
  if (addr == CSR_TEX_UNIT) {
    csr_tex_unit_ = value;
    return;
  }
  if (addr >= CSR_TEX_STATE_BEGIN
   && addr < CSR_TEX_STATE_END) {
    uint32_t state = CSR_TEX_STATE(addr);
    tex_units_.at(csr_tex_unit_).set_state(state, value);
    return;
  }
#endif

  csrs_.at(addr) = value;
}
|
||||
|
||||
void Core::trigger_ecall() {
|
||||
ecall_ = true;
|
||||
}
|
||||
|
||||
void Core::trigger_ebreak() {
|
||||
ebreak_ = true;
|
||||
}
|
||||
|
||||
bool Core::check_exit() const {
|
||||
return ebreak_ || ecall_;
|
||||
}
|
||||
|
||||
bool Core::running() const {
|
||||
bool is_running = (committed_instrs_ != issued_instrs_);
|
||||
return is_running;
|
||||
}
|
||||
180
sim/simx/core.h
Normal file
180
sim/simx/core.h
Normal file
@@ -0,0 +1,180 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <stack>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <simobject.h>
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include "archdef.h"
|
||||
#include "decode.h"
|
||||
#include "mem.h"
|
||||
#include "warp.h"
|
||||
#include "pipeline.h"
|
||||
#include "cache.h"
|
||||
#include "sharedmem.h"
|
||||
#include "ibuffer.h"
|
||||
#include "scoreboard.h"
|
||||
#include "exeunit.h"
|
||||
#include "tex_unit.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Simulation model of a single core: owns the warps, the pipeline latches,
// the execute units and the L1 memory hierarchy, and exposes the CSR,
// memory and warp-control services used by the execute units.
class Core : public SimObject<Core> {
public:
  // Per-core performance counters; all start at zero.
  struct PerfStats {
    uint64_t instrs = 0;       // incremented by the active-thread count of each committed trace
    uint64_t ibuf_stalls = 0;  // decode attempts blocked by a full ibuffer
    uint64_t scrb_stalls = 0;  // issue attempts blocked by the scoreboard
    uint64_t alu_stalls = 0;
    uint64_t lsu_stalls = 0;
    uint64_t csr_stalls = 0;
    uint64_t fpu_stalls = 0;
    uint64_t gpu_stalls = 0;
    uint64_t loads = 0;        // active threads of decoded load instructions
    uint64_t stores = 0;       // active threads of decoded store instructions
    uint64_t branches = 0;     // active threads of decoded branch instructions
    uint64_t mem_reads = 0;    // read requests sent on MemReqPort
    uint64_t mem_writes = 0;   // write requests sent on MemReqPort
    uint64_t mem_latency = 0;  // sum over cycles of reads still pending
    uint64_t tex_reads = 0;
    uint64_t tex_latency = 0;
  };

  // Ports connecting the core to the external memory system.
  SimPort<MemRsp> MemRspPort;
  SimPort<MemReq> MemReqPort;

  Core(const SimContext& ctx, const ArchDef &arch, Word id);
  ~Core();

  // Binds the given RAM to the core's memory unit.
  void attach_ram(RAM* ram);

  // True while issued instructions are still waiting to commit.
  bool running() const;

  // Advances all pipeline stages by one cycle.
  void step(uint64_t cycle);

  Word id() const {
    return id_;
  }

  Warp& warp(int i) {
    return *warps_.at(i);
  }

  const Decoder& decoder() {
    return decoder_;
  }

  const ArchDef& arch() const {
    return arch_;
  }

  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

  // Reads integer register `reg` through warp 0.
  Word getIRegValue(int reg) const {
    return warps_.at(0)->getIRegValue(reg);
  }

  Word get_csr(Addr addr, int tid, int wid);

  void set_csr(Addr addr, Word value, int tid, int wid);

  WarpMask wspawn(int num_warps, int nextPC);

  WarpMask barrier(int bar_id, int count, int warp_id);

  Word icache_read(Addr, Size);

  Word dcache_read(Addr, Size);

  void dcache_write(Addr, Word, Size);

  // Parameter names follow the definition in core.cpp: coordinates (u, v)
  // first, level of detail last. All three are Word, so a mismatch between
  // declaration and definition would not be caught by the compiler.
  Word tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<mem_addr_size_t>* mem_addrs);

  void trigger_ecall();

  void trigger_ebreak();

  // True once an ECALL or EBREAK has been triggered.
  bool check_exit() const;

private:

  // Pipeline stages, invoked by step().
  void schedule(uint64_t cycle);
  void fetch(uint64_t cycle);
  void decode(uint64_t cycle);
  void execute(uint64_t cycle);
  void commit(uint64_t cycle);

  // Buffers console output and prints it line by line.
  void writeToStdOut(Addr addr, Word data);

  Word id_;
  const ArchDef arch_;
  const Decoder decoder_;
  MemoryUnit mmu_;
  std::vector<TexUnit> tex_units_;

  std::vector<std::shared_ptr<Warp>> warps_;
  std::vector<WarpMask> barriers_;
  std::vector<Word> csrs_;
  std::vector<Byte> fcsrs_;            // indexed by warp id
  std::vector<IBuffer> ibuffers_;      // indexed by warp id
  Scoreboard scoreboard_;
  std::vector<ExeUnit::Ptr> exe_units_; // indexed by ExeType
  Cache::Ptr icache_;
  Cache::Ptr dcache_;
  SharedMem::Ptr shared_mem_;
  Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
  std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;

  PipelineLatch fetch_latch_;
  PipelineLatch decode_latch_;

  HashTable<pipeline_trace_t*> pending_icache_; // traces waiting for an icache response, by tag
  WarpMask active_warps_;
  WarpMask stalled_warps_;
  uint32_t last_schedule_wid_;
  uint64_t issued_instrs_;
  uint64_t committed_instrs_;
  uint32_t csr_tex_unit_;
  bool ecall_;
  bool ebreak_;

  std::unordered_map<int, std::stringstream> print_bufs_; // pending console text per buffer id

  PerfStats perf_stats_;
  uint64_t perf_mem_pending_reads_;

  friend class LsuUnit;
  friend class AluUnit;
  friend class CsrUnit;
  friend class FpuUnit;
  friend class GpuUnit;
};
|
||||
|
||||
} // namespace vortex
|
||||
65
sim/simx/debug.h
Normal file
65
sim/simx/debug.h
Normal file
@@ -0,0 +1,65 @@
|
||||
#pragma once

// Messages whose level is above DEBUG_LEVEL are suppressed; the level can
// be overridden from the build command line.
#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 3
#endif

// Prefix streamed in front of DP/DPH messages.
#define DEBUG_HEADER << "DEBUG "
//#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "

// Prefix streamed in front of DT/DTH messages.
#define TRACE_HEADER << "TRACE "
//#define TRACE_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "

#ifndef NDEBUG

#include <iostream>
#include <iomanip>

// Internal helper: streams the given `<< ...` chain to std::cout when
// `lvl` does not exceed DEBUG_LEVEL.
#define DEBUG_EMIT_(lvl, ...) do { \
  if ((lvl) <= DEBUG_LEVEL) { \
    std::cout __VA_ARGS__; \
  } \
} while(0)

// DP:  header + message + newline.
// DPH: header + message, no newline (start of a multi-part line).
// DPN: message only (continuation of a multi-part line).
#define DP(lvl, x)  DEBUG_EMIT_(lvl, DEBUG_HEADER << x << std::endl)
#define DPH(lvl, x) DEBUG_EMIT_(lvl, DEBUG_HEADER << x)
#define DPN(lvl, x) DEBUG_EMIT_(lvl, << x)

// DT/DTH/DTN mirror DP/DPH/DPN for trace output; `t` is printed in decimal,
// right-aligned in a 10-character field, after the trace header.
#define DT(lvl, t, x)  DEBUG_EMIT_(lvl, TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl)
#define DTH(lvl, t, x) DEBUG_EMIT_(lvl, TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x)
#define DTN(lvl, x)    DEBUG_EMIT_(lvl, << x)

#else

// Release builds: every macro expands to an empty statement, so the
// arguments are not evaluated.
#define DP(lvl, x) do {} while(0)
#define DPH(lvl, x) do {} while(0)
#define DPN(lvl, x) do {} while(0)

#define DT(lvl, t, x) do {} while(0)
#define DTH(lvl, t, x) do {} while(0)
#define DTN(lvl, x) do {} while(0)

#endif
|
||||
495
sim/simx/decode.cpp
Normal file
495
sim/simx/decode.cpp
Normal file
@@ -0,0 +1,495 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <util.h>
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include "decode.h"
|
||||
#include "archdef.h"
|
||||
#include "instr.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
struct InstTableEntry_t {
|
||||
bool controlFlow;
|
||||
InstType iType;
|
||||
};
|
||||
|
||||
static const std::unordered_map<int, struct InstTableEntry_t> sc_instTable = {
|
||||
{Opcode::NOP, {false, InstType::N_TYPE}},
|
||||
{Opcode::R_INST, {false, InstType::R_TYPE}},
|
||||
{Opcode::L_INST, {false, InstType::I_TYPE}},
|
||||
{Opcode::I_INST, {false, InstType::I_TYPE}},
|
||||
{Opcode::S_INST, {false, InstType::S_TYPE}},
|
||||
{Opcode::B_INST, {true , InstType::B_TYPE}},
|
||||
{Opcode::LUI_INST, {false, InstType::U_TYPE}},
|
||||
{Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
|
||||
{Opcode::JAL_INST, {true , InstType::J_TYPE}},
|
||||
{Opcode::JALR_INST, {true , InstType::I_TYPE}},
|
||||
{Opcode::SYS_INST, {true , InstType::I_TYPE}},
|
||||
{Opcode::FENCE, {true , InstType::I_TYPE}},
|
||||
{Opcode::FL, {false, InstType::I_TYPE}},
|
||||
{Opcode::FS, {false, InstType::S_TYPE}},
|
||||
{Opcode::FCI, {false, InstType::R_TYPE}},
|
||||
{Opcode::FMADD, {false, InstType::R4_TYPE}},
|
||||
{Opcode::FMSUB, {false, InstType::R4_TYPE}},
|
||||
{Opcode::FMNMADD, {false, InstType::R4_TYPE}},
|
||||
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
|
||||
{Opcode::VSET, {false, InstType::V_TYPE}},
|
||||
{Opcode::GPGPU, {false, InstType::R_TYPE}},
|
||||
{Opcode::GPU, {false, InstType::R4_TYPE}},
|
||||
};
|
||||
|
||||
static const char* op_string(const Instr &instr) {
|
||||
auto opcode = instr.getOpcode();
|
||||
Word func2 = instr.getFunc2();
|
||||
Word func3 = instr.getFunc3();
|
||||
Word func7 = instr.getFunc7();
|
||||
Word rs2 = instr.getRSrc(1);
|
||||
Word imm = instr.getImm();
|
||||
|
||||
switch (opcode) {
|
||||
case Opcode::NOP: return "NOP";
|
||||
case Opcode::LUI_INST: return "LUI";
|
||||
case Opcode::AUIPC_INST: return "AUIPC";
|
||||
case Opcode::R_INST:
|
||||
if (func7 & 0x1) {
|
||||
switch (func3) {
|
||||
case 0: return "MUL";
|
||||
case 1: return "MULH";
|
||||
case 2: return "MULHSU";
|
||||
case 3: return "MULHU";
|
||||
case 4: return "DIV";
|
||||
case 5: return "DIVU";
|
||||
case 6: return "REM";
|
||||
case 7: return "REMU";
|
||||
}
|
||||
} else {
|
||||
switch (func3) {
|
||||
case 0: return func7 ? "SUB" : "ADD";
|
||||
case 1: return "SLL";
|
||||
case 2: return "SLT";
|
||||
case 3: return "SLTU";
|
||||
case 4: return "XOR";
|
||||
case 5: return func7 ? "SRA" : "SRL";
|
||||
case 6: return "OR";
|
||||
case 7: return "AND";
|
||||
}
|
||||
}
|
||||
case Opcode::I_INST:
|
||||
switch (func3) {
|
||||
case 0: return "ADDI";
|
||||
case 1: return "SLLI";
|
||||
case 2: return "SLTI";
|
||||
case 3: return "SLTIU";
|
||||
case 4: return "XORI";
|
||||
case 5: return func7 ? "SRAI" : "SRLI";
|
||||
case 6: return "ORI";
|
||||
case 7: return "ANDI";
|
||||
}
|
||||
case Opcode::B_INST:
|
||||
switch (func3) {
|
||||
case 0: return "BEQ";
|
||||
case 1: return "BNE";
|
||||
case 4: return "BLT";
|
||||
case 5: return "BGE";
|
||||
case 6: return "BLTU";
|
||||
case 7: return "BGEU";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case Opcode::JAL_INST: return "JAL";
|
||||
case Opcode::JALR_INST: return "JALR";
|
||||
case Opcode::L_INST:
|
||||
switch (func3) {
|
||||
case 0: return "LBI";
|
||||
case 1: return "LHI";
|
||||
case 2: return "LW";
|
||||
case 4: return "LBU";
|
||||
case 5: return "LHU";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case Opcode::S_INST:
|
||||
switch (func3) {
|
||||
case 0: return "SB";
|
||||
case 1: return "SH";
|
||||
case 2: return "SW";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case Opcode::SYS_INST:
|
||||
switch (func3) {
|
||||
case 0:
|
||||
switch (imm) {
|
||||
case 0x000: return "ECALL";
|
||||
case 0x001: return "EBREAK";
|
||||
case 0x002: return "URET";
|
||||
case 0x102: return "SRET";
|
||||
case 0x302: return "MRET";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case 1: return "CSRRW";
|
||||
case 2: return "CSRRS";
|
||||
case 3: return "CSRRC";
|
||||
case 5: return "CSRRWI";
|
||||
case 6: return "CSRRSI";
|
||||
case 7: return "CSRRCI";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case Opcode::FENCE: return "FENCE";
|
||||
case Opcode::FL: return (func3 == 0x2) ? "FL" : "VL";
|
||||
case Opcode::FS: return (func3 == 0x2) ? "FS" : "VS";
|
||||
case Opcode::FCI:
|
||||
switch (func7) {
|
||||
case 0x00: return "FADD";
|
||||
case 0x04: return "FSUB";
|
||||
case 0x08: return "FMUL";
|
||||
case 0x0c: return "FDIV";
|
||||
case 0x2c: return "FSQRT";
|
||||
case 0x10:
|
||||
switch (func3) {
|
||||
case 0: return "FSGNJ";
|
||||
case 1: return "FSGNJN";
|
||||
case 2: return "FSGNJX";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case 0x14:
|
||||
switch (func3) {
|
||||
case 0: return "FMIM";
|
||||
case 1: return "FMAX";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case 0x50:
|
||||
switch (func3) {
|
||||
case 0: return "FLE";
|
||||
case 1: return "FLT";
|
||||
case 2: return "FEQ";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case 0x60: return rs2 ? "FCVT.WU.S" : "FCVT.W.S";
|
||||
case 0x68: return rs2 ? "FCVT.S.WU" : "FCVT.S.W";
|
||||
case 0x70: return func3 ? "FLASS" : "FMV.X.W";
|
||||
case 0x78: return "FMV.W.X";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case Opcode::FMADD: return "FMADD";
|
||||
case Opcode::FMSUB: return "FMSUB";
|
||||
case Opcode::FMNMADD: return "FMNMADD";
|
||||
case Opcode::FMNMSUB: return "FMNMSUB";
|
||||
case Opcode::VSET: return "VSET";
|
||||
case Opcode::GPGPU:
|
||||
switch (func3) {
|
||||
case 0: return "TMC";
|
||||
case 1: return "WSPAWN";
|
||||
case 2: return "SPLIT";
|
||||
case 3: return "JOIN";
|
||||
case 4: return "BAR";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case Opcode::GPU:
|
||||
switch (func3) {
|
||||
case 0: return "TEX";
|
||||
case 1: {
|
||||
switch (func2) {
|
||||
case 0: return "CMOV";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
namespace vortex {
|
||||
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
|
||||
auto opcode = instr.getOpcode();
|
||||
Word func2 = instr.getFunc2();
|
||||
Word func3 = instr.getFunc3();
|
||||
|
||||
os << op_string(instr) << ": ";
|
||||
|
||||
if (opcode == S_INST
|
||||
|| opcode == FS) {
|
||||
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
|
||||
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
|
||||
} else
|
||||
if (opcode == L_INST
|
||||
|| opcode == FL) {
|
||||
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
|
||||
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
|
||||
} else {
|
||||
if (instr.getRDType() != RegType::None) {
|
||||
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
|
||||
}
|
||||
int i = 0;
|
||||
for (; i < instr.getNRSrc(); ++i) {
|
||||
if (i) os << ", ";
|
||||
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
|
||||
}
|
||||
if (instr.hasImm()) {
|
||||
if (i) os << ", ";
|
||||
os << "imm=0x" << std::hex << instr.getImm();
|
||||
}
|
||||
if (opcode == GPU && func3 == 0) {
|
||||
os << ", unit=" << std::dec << func2;
|
||||
}
|
||||
}
|
||||
return os;
|
||||
}
|
||||
}
|
||||
|
||||
// Sets up the field widths, bit offsets and masks used by decode() to
// slice an instruction word. The instruction width is taken from the
// architecture word size; all other values are fixed.
Decoder::Decoder(const ArchDef &arch) {
  // masks applied after shifting a field down to bit 0
  opcode_mask_ = 0x7f;
  reg_mask_    = 0x1f;
  func2_mask_  = 0x3;
  func3_mask_  = 0x7;
  func6_mask_  = 0x3f;
  func7_mask_  = 0x7f;
  i_imm_mask_  = 0xfff;
  s_imm_mask_  = 0xfff;
  b_imm_mask_  = 0x1fff;
  u_imm_mask_  = 0xfffff;
  j_imm_mask_  = 0xfffff;
  v_imm_mask_  = 0x7ff;

  // field widths, in bits
  inst_s_   = arch.wsize() * 8;
  opcode_s_ = 7;
  reg_s_    = 5;
  func2_s_  = 2;
  func3_s_  = 3;
  mop_s_    = 3;
  vmask_s_  = 1;

  // bit offsets: opcode | rd | func3 | rs1 | rs2 | func7, each field
  // starting where the previous one ends
  shift_opcode_ = 0;
  shift_rd_     = opcode_s_;
  shift_func3_  = shift_rd_ + reg_s_;
  shift_rs1_    = shift_func3_ + func3_s_;
  shift_rs2_    = shift_rs1_ + reg_s_;
  // func2 and func7 start at the same bit, right after rs2
  shift_func2_  = shift_rs2_ + reg_s_;
  shift_func7_  = shift_rs2_ + reg_s_;
  // offsets derived from the func7 position
  shift_rs3_    = shift_func7_ + func2_s_;
  shift_vmop_   = shift_func7_ + vmask_s_;
  shift_vnf_    = shift_vmop_ + mop_s_;
  shift_func6_  = shift_func7_ + 1;
  shift_vset_   = shift_func7_ + 6;
}
|
||||
|
||||
// Decodes one raw instruction word into a newly allocated Instr.
//
// The opcode selects an entry of sc_instTable; that entry's instruction type
// decides which register fields, function codes and immediate are copied into
// the result. FL/FS words whose func3 is not 0x2 are decoded as vector memory
// accesses (V_TYPE) instead of scalar float load/store.
//
// Returns nullptr, after printing an error to std::cout, when the opcode is
// not present in sc_instTable. Aborts on an instruction type (or a V_TYPE
// opcode) that this decoder does not handle.
std::shared_ptr<Instr> Decoder::decode(Word code) const {
  auto instr = std::make_shared<Instr>();
  Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
  instr->setOpcode(op);

  Word func2 = (code >> shift_func2_) & func2_mask_;
  Word func3 = (code >> shift_func3_) & func3_mask_;
  Word func6 = (code >> shift_func6_) & func6_mask_;
  Word func7 = (code >> shift_func7_) & func7_mask_;

  int rd  = (code >> shift_rd_)  & reg_mask_;
  int rs1 = (code >> shift_rs1_) & reg_mask_;
  int rs2 = (code >> shift_rs2_) & reg_mask_;
  int rs3 = (code >> shift_rs3_) & reg_mask_;

  auto op_it = sc_instTable.find(op);
  if (op_it == sc_instTable.end()) {
    // std::hex is sticky on the stream: switch back to decimal so that later
    // std::cout output is not silently printed in hexadecimal.
    std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::dec << std::endl;
    return nullptr;
  }

  auto iType = op_it->second.iType;
  if (op == Opcode::FL || op == Opcode::FS) {
    // FL/FS share their opcode with vector loads/stores; only func3 == 0x2
    // is kept as the scalar float access.
    if (func3 != 0x2) {
      iType = InstType::V_TYPE;
    }
  }

  switch (iType) {
  case InstType::N_TYPE:
    break;

  case InstType::R_TYPE:
    if (op == Opcode::FCI) {
      // first source is an integer register for int->float conversions/moves
      switch (func7) {
      case 0x68: // FCVT.S.W, FCVT.S.WU
      case 0x78: // FMV.W.X
        instr->setSrcReg(rs1);
        break;
      default:
        instr->setSrcFReg(rs1);
      }
      instr->setSrcFReg(rs2);
      // destination is an integer register for compares and float->int ops
      switch (func7) {
      case 0x50: // FLE, FLT, FEQ
      case 0x60: // FCVT.WU.S, FCVT.W.S
      case 0x70: // FCLASS, FMV.X.W
        instr->setDestReg(rd);
        break;
      default:
        instr->setDestFReg(rd);
      }
    } else {
      instr->setDestReg(rd);
      instr->setSrcReg(rs1);
      instr->setSrcReg(rs2);
    }
    instr->setFunc3(func3);
    instr->setFunc7(func7);
    break;

  case InstType::I_TYPE: {
    instr->setSrcReg(rs1);
    if (op == Opcode::FL) {
      instr->setDestFReg(rd);
    } else {
      instr->setDestReg(rd);
    }
    instr->setFunc3(func3);
    instr->setFunc7(func7);
    switch (op) {
    case Opcode::SYS_INST:
    case Opcode::FENCE:
      // uint12
      instr->setImm(code >> shift_rs2_);
      break;
    case Opcode::I_INST:
      if (func3 == 0x1 || func3 == 0x5) {
        // int5
        instr->setImm(sext32(rs2, 5));
      } else {
        // int12
        instr->setImm(sext32(code >> shift_rs2_, 12));
      }
      break;
    default:
      // int12
      instr->setImm(sext32(code >> shift_rs2_, 12));
      break;
    }
  } break;

  case InstType::S_TYPE: {
    instr->setSrcReg(rs1);
    if (op == Opcode::FS) {
      instr->setSrcFReg(rs2);
    } else {
      instr->setSrcReg(rs2);
    }
    instr->setFunc3(func3);
    // the 12-bit immediate is split across the func7 and rd fields
    Word imm = (func7 << reg_s_) | rd;
    instr->setImm(sext32(imm, 12));
  } break;

  case InstType::B_TYPE: {
    instr->setSrcReg(rs1);
    instr->setSrcReg(rs2);
    instr->setFunc3(func3);
    // reassemble the scattered branch offset bits from the rd and func7 fields
    Word bit_11   = rd & 0x1;
    Word bits_4_1 = rd >> 1;
    Word bit_10_5 = func7 & 0x3f;
    Word bit_12   = func7 >> 6;
    Word imm = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
    instr->setImm(sext32(imm, 13));
  } break;

  case InstType::U_TYPE:
    instr->setDestReg(rd);
    instr->setImm(sext32(code >> shift_func3_, 20));
    break;

  case InstType::J_TYPE: {
    instr->setDestReg(rd);
    // reorder the 20 immediate bits that follow rd into offset bit order
    Word unordered  = code >> shift_func3_;
    Word bits_19_12 = unordered & 0xff;
    Word bit_11     = (unordered >> 8) & 0x1;
    Word bits_10_1  = (unordered >> 9) & 0x3ff;
    Word bit_20     = (unordered >> 19) & 0x1;
    Word imm = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
    if (bit_20) {
      // manual sign extension of the upper bits
      imm |= ~j_imm_mask_;
    }
    instr->setImm(imm);
  } break;

  case InstType::V_TYPE:
    switch (op) {
    case Opcode::VSET: {
      instr->setDestVReg(rd);
      instr->setSrcVReg(rs1);
      instr->setFunc3(func3);
      if (func3 == 7) {
        // imm is first used as a flag (no bits set at/above shift_vset_) and
        // then overwritten with the actual immediate when that flag is set
        instr->setImm(!(code >> shift_vset_));
        if (instr->getImm()) {
          Word immed = (code >> shift_rs2_) & v_imm_mask_;
          instr->setImm(immed);
          instr->setVlmul(immed & 0x3);
          instr->setVediv((immed >> 4) & 0x3);
          instr->setVsew((immed >> 2) & 0x3);
        } else {
          instr->setSrcVReg(rs2);
        }
      } else {
        instr->setSrcVReg(rs2);
        instr->setVmask((code >> shift_func7_) & 0x1);
        instr->setFunc6(func6);
      }
    } break;

    case Opcode::FL:
      instr->setDestVReg(rd);
      instr->setSrcVReg(rs1);
      instr->setVlsWidth(func3);
      instr->setSrcVReg(rs2);
      // the mask is a single bit; without the 0x1 mask the mop/nf bits above
      // it leaked into the value (the VSET path above already masks it)
      instr->setVmask((code >> shift_func7_) & 0x1);
      instr->setVmop((code >> shift_vmop_) & func3_mask_);
      instr->setVnf((code >> shift_vnf_) & func3_mask_);
      break;

    case Opcode::FS:
      instr->setVs3(rd);
      instr->setSrcVReg(rs1);
      instr->setVlsWidth(func3);
      instr->setSrcVReg(rs2);
      // single-bit mask, see the FL case above
      instr->setVmask((code >> shift_func7_) & 0x1);
      instr->setVmop((code >> shift_vmop_) & func3_mask_);
      instr->setVnf((code >> shift_vnf_) & func3_mask_);
      break;

    default:
      std::abort();
    }
    break;

  case R4_TYPE:
    if (op == Opcode::GPU) {
      instr->setDestReg(rd);
      instr->setSrcReg(rs1);
      instr->setSrcReg(rs2);
      instr->setSrcReg(rs3);
    } else {
      instr->setDestFReg(rd);
      instr->setSrcFReg(rs1);
      instr->setSrcFReg(rs2);
      instr->setSrcFReg(rs3);
    }
    instr->setFunc2(func2);
    instr->setFunc3(func3);
    break;

  default:
    std::abort();
  }

  return instr;
}
|
||||
61
sim/simx/decode.h
Normal file
61
sim/simx/decode.h
Normal file
@@ -0,0 +1,61 @@
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class ArchDef;
|
||||
class Instr;
|
||||
class Pipeline;
|
||||
|
||||
class Decoder {
|
||||
public:
|
||||
Decoder(const ArchDef &);
|
||||
|
||||
std::shared_ptr<Instr> decode(Word code) const;
|
||||
|
||||
private:
|
||||
|
||||
Word inst_s_;
|
||||
Word opcode_s_;
|
||||
Word reg_s_;
|
||||
Word func2_s_;
|
||||
Word func3_s_;
|
||||
Word shift_opcode_;
|
||||
Word shift_rd_;
|
||||
Word shift_rs1_;
|
||||
Word shift_rs2_;
|
||||
Word shift_rs3_;
|
||||
Word shift_func2_;
|
||||
Word shift_func3_;
|
||||
Word shift_func7_;
|
||||
Word shift_j_u_immed_;
|
||||
Word shift_s_b_immed_;
|
||||
Word shift_i_immed_;
|
||||
|
||||
Word reg_mask_;
|
||||
Word func2_mask_;
|
||||
Word func3_mask_;
|
||||
Word func6_mask_;
|
||||
Word func7_mask_;
|
||||
Word opcode_mask_;
|
||||
Word i_imm_mask_;
|
||||
Word s_imm_mask_;
|
||||
Word b_imm_mask_;
|
||||
Word u_imm_mask_;
|
||||
Word j_imm_mask_;
|
||||
Word v_imm_mask_;
|
||||
|
||||
//Vector
|
||||
Word shift_vset_;
|
||||
Word shift_vset_immed_;
|
||||
Word shift_vmask_;
|
||||
Word shift_vmop_;
|
||||
Word shift_vnf_;
|
||||
Word shift_func6_;
|
||||
Word vmask_s_;
|
||||
Word mop_s_;
|
||||
};
|
||||
|
||||
}
|
||||
1839
sim/simx/execute.cpp
Normal file
1839
sim/simx/execute.cpp
Normal file
File diff suppressed because it is too large
Load Diff
371
sim/simx/exeunit.cpp
Normal file
371
sim/simx/exeunit.cpp
Normal file
@@ -0,0 +1,371 @@
|
||||
#include "exeunit.h"
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <util.h>
|
||||
#include "debug.h"
|
||||
#include "core.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Registers the unit with the simulation under the name "NOP".
NopUnit::NopUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "NOP")
{}
|
||||
|
||||
// Forwards the oldest pending trace, if any, to the output with a delay of 1.
void NopUnit::step(uint64_t /*cycle*/) {
  if (!Input.empty()) {
    auto head = Input.front();
    Output.send(head, 1);
    Input.pop();
  }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Builds the load/store unit: caches the thread count from the core's
// architecture and sizes the table of in-flight requests to LSUQ_SIZE.
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "LSU")
  , num_threads_(core->arch().num_threads())
  , pending_dcache_(LSUQ_SIZE)
  , fence_state_(nullptr)  // no fence pending yet; avoids an indeterminate pointer
  , fence_lock_(false)
{}
|
||||
|
||||
void LsuUnit::step(uint64_t cycle) {
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
|
||||
// handle shared memory response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
|
||||
if (smem_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = smem_rsp_port.front();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
}
|
||||
smem_rsp_port.pop();
|
||||
}
|
||||
|
||||
if (fence_lock_) {
|
||||
// wait for all pending memory operations to complete
|
||||
if (!pending_dcache_.empty())
|
||||
return;
|
||||
Output.send(fence_state_, 1);
|
||||
fence_lock_ = false;
|
||||
DT(3, cycle, "fence-unlock: " << fence_state_);
|
||||
}
|
||||
|
||||
// check input queue
|
||||
if (Input.empty())
|
||||
return;
|
||||
|
||||
auto trace = Input.front();
|
||||
|
||||
if (trace->lsu.type == LsuType::FENCE) {
|
||||
// schedule fence lock
|
||||
fence_state_ = trace;
|
||||
fence_lock_ = true;
|
||||
DT(3, cycle, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (cycle - time);
|
||||
return;
|
||||
}
|
||||
|
||||
// check pending queue capacity
|
||||
if (pending_dcache_.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
bool is_write = (trace->lsu.type == LsuType::STORE);
|
||||
|
||||
// duplicates detection
|
||||
bool is_dup = false;
|
||||
if (trace->tmask.test(0)) {
|
||||
uint64_t addr_mask = sizeof(Word)-1;
|
||||
Word addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
|
||||
uint32_t matches = 1;
|
||||
for (uint32_t t = 1; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
|
||||
matches += (addr0 == mem_addr);
|
||||
}
|
||||
is_dup = (matches == trace->tmask.count());
|
||||
}
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
if (is_dup) {
|
||||
valid_addrs = 1;
|
||||
} else {
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
}
|
||||
|
||||
auto tag = pending_dcache_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
|
||||
auto mem_addr = trace->mem_addrs.at(t).at(0);
|
||||
auto type = get_addr_type(mem_addr.addr, mem_addr.size);
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.tag = tag;
|
||||
mem_req.is_io = (type == AddrType::IO);
|
||||
|
||||
if (type == AddrType::Shared) {
|
||||
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
|
||||
DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
|
||||
} else {
|
||||
dcache_req_port.send(mem_req, 2);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace);
|
||||
}
|
||||
|
||||
if (is_dup)
|
||||
break;
|
||||
}
|
||||
|
||||
// do not wait on writes
|
||||
if (is_write) {
|
||||
pending_dcache_.release(tag);
|
||||
Output.send(trace, 1);
|
||||
}
|
||||
|
||||
// remove input
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Registers the unit with the simulation under the name "ALU".
AluUnit::AluUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "ALU")
{}
|
||||
|
||||
void AluUnit::step(uint64_t cycle) {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
switch (trace->alu.type) {
|
||||
case AluType::ARITH:
|
||||
case AluType::BRANCH:
|
||||
case AluType::CMOV:
|
||||
Output.send(trace, 1);
|
||||
break;
|
||||
case AluType::IMUL:
|
||||
Output.send(trace, LATENCY_IMUL+1);
|
||||
break;
|
||||
case AluType::IDIV:
|
||||
Output.send(trace, XLEN+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
|
||||
if (trace->fetch_stall) {
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.alu_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Registers the unit with the simulation under the name "CSR".
CsrUnit::CsrUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "CSR")
{}
|
||||
|
||||
void CsrUnit::step(uint64_t cycle) {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
Output.send(trace, 1);
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.csr_stalls += (cycle - time);
|
||||
DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Registers the unit with the simulation under the name "FPU".
FpuUnit::FpuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "FPU")
{}
|
||||
|
||||
void FpuUnit::step(uint64_t cycle) {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
switch (trace->fpu.type) {
|
||||
case FpuType::FNCP:
|
||||
Output.send(trace, 2);
|
||||
break;
|
||||
case FpuType::FMA:
|
||||
Output.send(trace, LATENCY_FMA+1);
|
||||
break;
|
||||
case FpuType::FDIV:
|
||||
Output.send(trace, LATENCY_FDIV+1);
|
||||
break;
|
||||
case FpuType::FSQRT:
|
||||
Output.send(trace, LATENCY_FSQRT+1);
|
||||
break;
|
||||
case FpuType::FCVT:
|
||||
Output.send(trace, LATENCY_FCVT+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.fpu_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Builds the GPU unit: caches the thread count from the core's architecture
// and sizes the table of in-flight texture requests to TEXQ_SIZE.
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "GPU"),
    num_threads_(core->arch().num_threads()),
    pending_tex_reqs_(TEXQ_SIZE)
{}
|
||||
|
||||
void GpuUnit::step(uint64_t cycle) {
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// handle memory response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_tex_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
#endif
|
||||
|
||||
// check input queue
|
||||
if (Input.empty())
|
||||
return;
|
||||
|
||||
auto trace = Input.front();
|
||||
|
||||
bool issued = false;
|
||||
|
||||
switch (trace->gpu.type) {
|
||||
case GpuType::TMC:
|
||||
Output.send(trace, 1);
|
||||
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::WSPAWN:
|
||||
Output.send(trace, 1);
|
||||
core_->active_warps_ = trace->gpu.active_warps;
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::SPLIT:
|
||||
case GpuType::JOIN:
|
||||
Output.send(trace, 1);
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::BAR:
|
||||
Output.send(trace, 1);
|
||||
if (trace->gpu.active_warps != 0)
|
||||
core_->active_warps_ |= trace->gpu.active_warps;
|
||||
else
|
||||
core_->active_warps_.reset(trace->wid);
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::TEX:
|
||||
if (this->processTexRequest(cycle, trace))
|
||||
issued = true;
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
||||
if (issued) {
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
|
||||
if (trace->fetch_stall) {
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.fpu_stalls += (cycle - time);
|
||||
}
|
||||
}
|
||||
|
||||
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
||||
__unused (cycle);
|
||||
|
||||
// check pending queue capacity
|
||||
if (pending_tex_reqs_.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** tex-queue-stall: " << *trace);
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
// send memory request
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
|
||||
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = (trace->lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
dcache_req_port.send(mem_req, 3);
|
||||
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", tid=" << t << ", "<< trace);
|
||||
++ core_->perf_stats_.tex_reads;
|
||||
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
95
sim/simx/exeunit.h
Normal file
95
sim/simx/exeunit.h
Normal file
@@ -0,0 +1,95 @@
|
||||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include "pipeline.h"
|
||||
#include "cache.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Core;
|
||||
|
||||
// Common base of all execution units: a simulation object with one input
// port, one output port and a back-pointer to the core it belongs to.
class ExeUnit : public SimObject<ExeUnit> {
public:
  SimPort<pipeline_trace_t*> Input;   // traces waiting to execute
  SimPort<pipeline_trace_t*> Output;  // traces that finished executing

  ExeUnit(const SimContext& ctx, Core* core, const char* name)
    : SimObject<ExeUnit>(ctx, name),
      Input(this),
      Output(this),
      core_(core)
  {}

  virtual ~ExeUnit() = default;

protected:
  Core* core_;  // non-owning
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class NopUnit : public ExeUnit {
|
||||
public:
|
||||
NopUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class LsuUnit : public ExeUnit {
|
||||
private:
|
||||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
|
||||
pipeline_trace_t* fence_state_;
|
||||
bool fence_lock_;
|
||||
|
||||
public:
|
||||
LsuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class AluUnit : public ExeUnit {
|
||||
public:
|
||||
AluUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class CsrUnit : public ExeUnit {
|
||||
public:
|
||||
CsrUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class FpuUnit : public ExeUnit {
|
||||
public:
|
||||
FpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class GpuUnit : public ExeUnit {
|
||||
private:
|
||||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
|
||||
|
||||
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
|
||||
|
||||
public:
|
||||
GpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
}
|
||||
39
sim/simx/ibuffer.h
Normal file
39
sim/simx/ibuffer.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#pragma once
|
||||
|
||||
#include "pipeline.h"
|
||||
#include <queue>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class IBuffer {
|
||||
private:
|
||||
std::queue<pipeline_trace_t*> entries_;
|
||||
uint32_t capacity_;
|
||||
|
||||
public:
|
||||
IBuffer(uint32_t size)
|
||||
: capacity_(size)
|
||||
{}
|
||||
|
||||
bool empty() const {
|
||||
return entries_.empty();
|
||||
}
|
||||
|
||||
bool full() const {
|
||||
return (entries_.size() == capacity_);
|
||||
}
|
||||
|
||||
pipeline_trace_t* top() const {
|
||||
return entries_.front();
|
||||
}
|
||||
|
||||
void push(pipeline_trace_t* trace) {
|
||||
entries_.emplace(trace);
|
||||
}
|
||||
|
||||
void pop() {
|
||||
return entries_.pop();
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
140
sim/simx/instr.h
Normal file
140
sim/simx/instr.h
Normal file
@@ -0,0 +1,140 @@
|
||||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Warp;
|
||||
|
||||
// Opcode values as extracted from the opcode field of an instruction word
// (the decoder uses a 7-bit field).
enum Opcode {
  // base integer instructions
  NOP        = 0x00,
  R_INST     = 0x33,
  L_INST     = 0x03,
  I_INST     = 0x13,
  S_INST     = 0x23,
  B_INST     = 0x63,
  LUI_INST   = 0x37,
  AUIPC_INST = 0x17,
  JAL_INST   = 0x6f,
  JALR_INST  = 0x67,
  SYS_INST   = 0x73,
  FENCE      = 0x0f,

  // F extension
  FL         = 0x07,
  FS         = 0x27,
  FCI        = 0x53,
  FMADD      = 0x43,
  FMSUB      = 0x47,
  FMNMSUB    = 0x4b,
  FMNMADD    = 0x4f,

  // vector extension
  VSET       = 0x57,

  // GPGPU extension
  GPGPU      = 0x6b,
  GPU        = 0x5b,
};
|
||||
|
||||
// Instruction format classes the decoder dispatches on. The values are
// spelled out; they equal the implicit numbering of the enumerator order.
enum InstType {
  N_TYPE  = 0,
  R_TYPE  = 1,
  I_TYPE  = 2,
  S_TYPE  = 3,
  B_TYPE  = 4,
  U_TYPE  = 5,
  J_TYPE  = 6,
  V_TYPE  = 7,
  R4_TYPE = 8
};
|
||||
|
||||
class Instr {
|
||||
public:
|
||||
Instr()
|
||||
: opcode_(Opcode::NOP)
|
||||
, num_rsrcs_(0)
|
||||
, has_imm_(false)
|
||||
, rdest_type_(RegType::None)
|
||||
, rdest_(0)
|
||||
, func3_(0)
|
||||
, func7_(0) {
|
||||
for (int i = 0; i < MAX_REG_SOURCES; ++i) {
|
||||
rsrc_type_[i] = RegType::None;
|
||||
}
|
||||
}
|
||||
|
||||
/* Setters used to "craft" the instruction. */
|
||||
void setOpcode(Opcode opcode) { opcode_ = opcode; }
|
||||
void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; }
|
||||
void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; }
|
||||
void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
|
||||
void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setFunc2(Word func2) { func2_ = func2; }
|
||||
void setFunc3(Word func3) { func3_ = func3; }
|
||||
void setFunc7(Word func7) { func7_ = func7; }
|
||||
void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
|
||||
void setVlsWidth(Word width) { vlsWidth_ = width; }
|
||||
void setVmop(Word mop) { vMop_ = mop; }
|
||||
void setVnf(Word nf) { vNf_ = nf; }
|
||||
void setVmask(Word mask) { vmask_ = mask; }
|
||||
void setVs3(Word vs) { vs3_ = vs; }
|
||||
void setVlmul(Word lmul) { vlmul_ = 1 << lmul; }
|
||||
void setVsew(Word sew) { vsew_ = 1 << (3+sew); }
|
||||
void setVediv(Word ediv) { vediv_ = 1 << ediv; }
|
||||
void setFunc6(Word func6) { func6_ = func6; }
|
||||
|
||||
/* Getters used by encoders. */
|
||||
Opcode getOpcode() const { return opcode_; }
|
||||
Word getFunc2() const { return func2_; }
|
||||
Word getFunc3() const { return func3_; }
|
||||
Word getFunc6() const { return func6_; }
|
||||
Word getFunc7() const { return func7_; }
|
||||
int getNRSrc() const { return num_rsrcs_; }
|
||||
int getRSrc(int i) const { return rsrc_[i]; }
|
||||
RegType getRSType(int i) const { return rsrc_type_[i]; }
|
||||
int getRDest() const { return rdest_; }
|
||||
RegType getRDType() const { return rdest_type_; }
|
||||
bool hasImm() const { return has_imm_; }
|
||||
Word getImm() const { return imm_; }
|
||||
Word getVlsWidth() const { return vlsWidth_; }
|
||||
Word getVmop() const { return vMop_; }
|
||||
Word getvNf() const { return vNf_; }
|
||||
Word getVmask() const { return vmask_; }
|
||||
Word getVs3() const { return vs3_; }
|
||||
Word getVlmul() const { return vlmul_; }
|
||||
Word getVsew() const { return vsew_; }
|
||||
Word getVediv() const { return vediv_; }
|
||||
|
||||
private:
|
||||
|
||||
enum {
|
||||
MAX_REG_SOURCES = 3
|
||||
};
|
||||
|
||||
Opcode opcode_;
|
||||
int num_rsrcs_;
|
||||
bool has_imm_;
|
||||
RegType rdest_type_;
|
||||
Word imm_;
|
||||
RegType rsrc_type_[MAX_REG_SOURCES];
|
||||
int rsrc_[MAX_REG_SOURCES];
|
||||
int rdest_;
|
||||
Word func2_;
|
||||
Word func3_;
|
||||
Word func6_;
|
||||
|
||||
// Vector
|
||||
Word vmask_;
|
||||
Word vlsWidth_;
|
||||
Word vMop_;
|
||||
Word vNf_;
|
||||
Word vs3_;
|
||||
Word vlmul_;
|
||||
Word vsew_;
|
||||
Word vediv_;
|
||||
Word func7_;
|
||||
|
||||
friend std::ostream &operator<<(std::ostream &, const Instr&);
|
||||
};
|
||||
|
||||
}
|
||||
97
sim/simx/main.cpp
Normal file
97
sim/simx/main.cpp
Normal file
@@ -0,0 +1,97 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
#include "processor.h"
|
||||
#include <util.h>
|
||||
#include "args.h"
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int exitcode;
|
||||
|
||||
std::string archStr("rv32imf");
|
||||
std::string imgFileName;
|
||||
int num_cores(NUM_CORES * NUM_CLUSTERS);
|
||||
int num_warps(NUM_WARPS);
|
||||
int num_threads(NUM_THREADS);
|
||||
bool showHelp(false);
|
||||
bool showStats(false);
|
||||
bool riscv_test(false);
|
||||
|
||||
/* Read the command line arguments. */
|
||||
CommandLineArgFlag fh("-h", "--help", "", showHelp);
|
||||
CommandLineArgSetter<std::string> fa("-a", "--arch", "", archStr);
|
||||
CommandLineArgSetter<std::string> fi("-i", "--image", "", imgFileName);
|
||||
CommandLineArgSetter<int> fc("-c", "--cores", "", num_cores);
|
||||
CommandLineArgSetter<int> fw("-w", "--warps", "", num_warps);
|
||||
CommandLineArgSetter<int> ft("-t", "--threads", "", num_threads);
|
||||
CommandLineArgFlag fr("-r", "--riscv", "", riscv_test);
|
||||
CommandLineArgFlag fs("-s", "--stats", "", showStats);
|
||||
|
||||
CommandLineArg::readArgs(argc - 1, argv + 1);
|
||||
|
||||
if (showHelp || imgFileName.empty()) {
|
||||
std::cout << "Vortex emulator command line arguments:\n"
|
||||
" -i, --image <filename> Program RAM image\n"
|
||||
" -c, --cores <num> Number of cores\n"
|
||||
" -w, --warps <num> Number of warps\n"
|
||||
" -t, --threads <num> Number of threads\n"
|
||||
" -a, --arch <arch string> Architecture string\n"
|
||||
" -r, --riscv riscv test\n"
|
||||
" -s, --stats Print stats on exit.\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::cout << "Running " << imgFileName << "..." << std::endl;
|
||||
|
||||
if (!SimPlatform::instance().initialize())
|
||||
return -1;
|
||||
|
||||
{
|
||||
ArchDef arch(archStr, num_cores, num_warps, num_threads);
|
||||
|
||||
Processor processor(arch);
|
||||
|
||||
RAM ram(RAM_PAGE_SIZE);
|
||||
|
||||
{
|
||||
std::string program_ext(fileExtension(imgFileName.c_str()));
|
||||
if (program_ext == "bin") {
|
||||
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
|
||||
} else if (program_ext == "hex") {
|
||||
ram.loadHexImage(imgFileName.c_str());
|
||||
} else {
|
||||
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
processor.attach_ram(&ram);
|
||||
|
||||
exitcode = processor.run();
|
||||
|
||||
if (riscv_test) {
|
||||
if (1 == exitcode) {
|
||||
std::cout << "Passed." << std::endl;
|
||||
exitcode = 0;
|
||||
} else {
|
||||
std::cout << "Failed." << std::endl;
|
||||
}
|
||||
} else {
|
||||
if (exitcode != 0) {
|
||||
std::cout << "*** error: exitcode=" << exitcode << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SimPlatform::instance().finalize();
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
62
sim/simx/memsim.cpp
Normal file
62
sim/simx/memsim.cpp
Normal file
@@ -0,0 +1,62 @@
|
||||
#include "memsim.h"
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Implementation of MemSim: each cycle, consumes at most one request per bank
// and answers reads after a fixed latency. Writes are counted but produce no
// response.
class MemSim::Impl {
public:
  Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency)
    : simobject_(simobject),
      num_banks_(num_banks),
      latency_(latency)
  {}

  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

  void step(uint64_t /*cycle*/) {
    for (uint32_t bank = 0; bank < num_banks_; ++bank) {
      auto& req_port = simobject_->MemReqPorts.at(bank);
      if (req_port.empty())
        continue;

      auto& request = req_port.front();
      if (request.write) {
        ++perf_stats_.writes;
      } else {
        // only reads are acknowledged, using the request's tag
        MemRsp response;
        response.tag = request.tag;
        simobject_->MemRspPorts.at(bank).send(response, latency_);
        ++perf_stats_.reads;
      }
      req_port.pop();
    }
  }

private:
  MemSim* simobject_;
  uint32_t num_banks_;
  uint32_t latency_;
  PerfStats perf_stats_;
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Creates one request port and one response port per bank and the private
// implementation object that services them.
MemSim::MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency)
  : SimObject<MemSim>(ctx, "MemSim"),
    MemReqPorts(num_banks, this),
    MemRspPorts(num_banks, this),
    impl_(new Impl(this, num_banks, latency))
{}
|
||||
|
||||
// Releases the implementation object allocated by the constructor.
MemSim::~MemSim() {
  delete impl_;
}
|
||||
|
||||
void MemSim::step(uint64_t cycle) {
|
||||
impl_->step(cycle);
|
||||
}
|
||||
36
sim/simx/memsim.h
Normal file
36
sim/simx/memsim.h
Normal file
@@ -0,0 +1,36 @@
|
||||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include "types.h"
|
||||
#include <vector>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Banked memory model: requests arrive on MemReqPorts and responses leave on
// the MemRspPorts entry with the same bank index.
class MemSim : public SimObject<MemSim>{
public:
  // Counters of the requests processed so far.
  struct PerfStats {
    uint64_t reads;
    uint64_t writes;

    PerfStats() : reads(0), writes(0) {}
  };

  std::vector<SimPort<MemReq>> MemReqPorts;  // one per bank
  std::vector<SimPort<MemRsp>> MemRspPorts;  // one per bank

  MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency);
  ~MemSim();

  // Advances the model by one simulation cycle.
  void step(uint64_t cycle);

  const PerfStats& perf_stats() const;

private:
  class Impl;
  Impl* impl_;  // owned; released in the destructor
};
|
||||
|
||||
};
|
||||
111
sim/simx/pipeline.h
Normal file
111
sim/simx/pipeline.h
Normal file
@@ -0,0 +1,111 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
#include <util.h>
|
||||
#include "types.h"
|
||||
#include "archdef.h"
|
||||
#include "debug.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
struct pipeline_trace_t {
|
||||
//--
|
||||
uint64_t uuid;
|
||||
|
||||
//--
|
||||
int cid;
|
||||
int wid;
|
||||
ThreadMask tmask;
|
||||
Word PC;
|
||||
|
||||
//--
|
||||
bool fetch_stall;
|
||||
|
||||
//--
|
||||
bool wb;
|
||||
RegType rdest_type;
|
||||
int rdest;
|
||||
|
||||
//--
|
||||
RegMask used_iregs;
|
||||
RegMask used_fregs;
|
||||
RegMask used_vregs;
|
||||
|
||||
//-
|
||||
ExeType exe_type;
|
||||
|
||||
//--
|
||||
std::vector<std::vector<mem_addr_size_t>> mem_addrs;
|
||||
|
||||
//--
|
||||
union {
|
||||
struct {
|
||||
LsuType type;
|
||||
} lsu;
|
||||
struct {
|
||||
AluType type;
|
||||
} alu;
|
||||
struct {
|
||||
FpuType type;
|
||||
} fpu;
|
||||
struct {
|
||||
GpuType type;
|
||||
WarpMask active_warps;
|
||||
} gpu;
|
||||
};
|
||||
|
||||
bool stalled;
|
||||
|
||||
pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
|
||||
uuid = uuid_;
|
||||
cid = 0;
|
||||
wid = 0;
|
||||
tmask.reset();
|
||||
PC = 0;
|
||||
fetch_stall = false;
|
||||
wb = false;
|
||||
rdest = 0;
|
||||
rdest_type = RegType::None;
|
||||
used_iregs.reset();
|
||||
used_fregs.reset();
|
||||
used_vregs.reset();
|
||||
exe_type = ExeType::NOP;
|
||||
mem_addrs.resize(arch.num_threads());
|
||||
stalled = false;
|
||||
}
|
||||
|
||||
bool suspend() {
|
||||
bool old = stalled;
|
||||
stalled = true;
|
||||
return old;
|
||||
}
|
||||
|
||||
void resume() {
|
||||
stalled = false;
|
||||
}
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
|
||||
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
|
||||
os << ", wb=" << state.wb;
|
||||
if (state.wb) {
|
||||
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
|
||||
}
|
||||
os << ", ex=" << state.exe_type;
|
||||
os << " (#" << std::dec << state.uuid << ")";
|
||||
return os;
|
||||
}
|
||||
|
||||
// FIFO of trace pointers placed between pipeline stages, with an optional
// name for identification.
class PipelineLatch : public Queue<pipeline_trace_t*> {
public:
  PipelineLatch(const char* name = nullptr) : name_(name) {}

protected:
  const char* name_;  // not owned; may be null
};
|
||||
|
||||
}
|
||||
145
sim/simx/processor.cpp
Normal file
145
sim/simx/processor.cpp
Normal file
@@ -0,0 +1,145 @@
|
||||
#include "processor.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Builds the cores and wires the memory hierarchy:
//   memsim <- (L3 cache | L3 arbiter) <- per-cluster (L2 cache | L2 arbiter) <- cores
//
// mem_req_ports / mem_rsp_ports always hold the cluster-facing ports of the
// level built so far (one entry per cluster once the L3 level is done).
// Each cluster then gets its own port vectors for the core-facing side, so
// wiring one cluster never disturbs the ports the next cluster connects to.
Processor::Processor(const ArchDef& arch)
  : cores_(arch.num_cores())
  , l2caches_(NUM_CLUSTERS)
  , l2_mem_switches_(NUM_CLUSTERS)
{
  uint32_t num_cores = arch.num_cores();
  uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;

  // create cores
  for (uint32_t i = 0; i < num_cores; ++i) {
    cores_.at(i) = Core::Create(arch, i);
  }

  // connect memory sub-system
  memsim_ = MemSim::Create(1, MEM_LATENCY);
  std::vector<SimPort<MemReq>*> mem_req_ports(1);
  std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
  mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
  mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);

  if (L3_ENABLE) {
    l3cache_ = Cache::Create("l3cache", Cache::Config{
      log2ceil(L3_CACHE_SIZE),  // C
      log2ceil(MEM_BLOCK_SIZE), // B
      2,                      // W
      0,                      // A
      32,                     // address bits
      L3_NUM_BANKS,           // number of banks
      L3_NUM_PORTS,           // number of ports
      NUM_CLUSTERS,           // request size
      true,                   // write-through
      false,                  // write response
      0,                      // victim size
      L3_MSHR_SIZE,           // mshr
      2,                      // pipeline latency
      }
    );

    mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
    l3cache_->MemReqPort.bind(mem_req_ports.at(0));

    mem_req_ports.resize(NUM_CLUSTERS);
    mem_rsp_ports.resize(NUM_CLUSTERS);
    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
      mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
      mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
    }
  } else if (NUM_CLUSTERS > 1) {
    l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
    mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
    l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));

    mem_req_ports.resize(NUM_CLUSTERS);
    mem_rsp_ports.resize(NUM_CLUSTERS);
    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
      mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
      mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
    }
  }

  for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
    // core-facing ports of this cluster only; the cluster-level vectors
    // above must stay intact for the remaining clusters
    std::vector<SimPort<MemReq>*> cluster_req_ports(cores_per_cluster);
    std::vector<SimPort<MemRsp>*> cluster_rsp_ports(cores_per_cluster);

    if (L2_ENABLE) {
      auto& l2cache = l2caches_.at(i);
      // NOTE(review): the request size stays NUM_CORES as before; confirm it
      // is always >= cores_per_cluster, otherwise CoreReqPorts.at(j) throws.
      l2cache = Cache::Create("l2cache", Cache::Config{
        log2ceil(L2_CACHE_SIZE),  // C
        log2ceil(MEM_BLOCK_SIZE), // B
        2,                      // W
        0,                      // A
        32,                     // address bits
        L2_NUM_BANKS,           // number of banks
        L2_NUM_PORTS,           // number of ports
        NUM_CORES,              // request size
        true,                   // write-through
        false,                  // write response
        0,                      // victim size
        L2_MSHR_SIZE,           // mshr
        2,                      // pipeline latency
      });
      mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
      l2cache->MemReqPort.bind(mem_req_ports.at(i));

      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
        cluster_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
        cluster_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
      }
    } else if (cores_per_cluster > 1) {
      auto& l2_mem_switch = l2_mem_switches_.at(i);
      l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
      mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
      l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));

      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
        cluster_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
        cluster_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
      }
    } else if (cores_per_cluster == 1) {
      // single core, no L2: the core talks to the cluster-level port directly
      cluster_req_ports.at(0) = mem_req_ports.at(i);
      cluster_rsp_ports.at(0) = mem_rsp_ports.at(i);
    }

    for (uint32_t j = 0; j < cores_per_cluster; ++j) {
      // cores are laid out cluster after cluster
      auto& core = cores_.at((i * cores_per_cluster) + j);
      cluster_rsp_ports.at(j)->bind(&core->MemRspPort);
      core->MemReqPort.bind(cluster_req_ports.at(j));
    }
  }
}
|
||||
|
||||
// Hands the same RAM object to every core.
void Processor::attach_ram(RAM* ram) {
  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
    (*it)->attach_ram(ram);
  }
}
|
||||
|
||||
// Nothing to release by hand: all members clean up after themselves.
Processor::~Processor() = default;
|
||||
|
||||
int Processor::run() {
|
||||
bool running;
|
||||
int exitcode = 0;
|
||||
do {
|
||||
SimPlatform::instance().step();
|
||||
|
||||
running = false;
|
||||
for (auto& core : cores_) {
|
||||
if (core->running()) {
|
||||
running = true;
|
||||
}
|
||||
if (core->check_exit()) {
|
||||
exitcode = core->getIRegValue(3);
|
||||
running = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (running);
|
||||
|
||||
std::cout << std::flush;
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
27
sim/simx/processor.h
Normal file
27
sim/simx/processor.h
Normal file
@@ -0,0 +1,27 @@
|
||||
#pragma once
|
||||
|
||||
#include "core.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Processor {
|
||||
public:
|
||||
typedef std::shared_ptr<Processor> Ptr;
|
||||
|
||||
Processor(const ArchDef& arch);
|
||||
~Processor();
|
||||
|
||||
void attach_ram(RAM* mem);
|
||||
|
||||
int run();
|
||||
|
||||
private:
|
||||
std::vector<Core::Ptr> cores_;
|
||||
std::vector<Cache::Ptr> l2caches_;
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
|
||||
Cache::Ptr l3cache_;
|
||||
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
|
||||
MemSim::Ptr memsim_;
|
||||
};
|
||||
|
||||
}
|
||||
123
sim/simx/scoreboard.h
Normal file
123
sim/simx/scoreboard.h
Normal file
@@ -0,0 +1,123 @@
|
||||
#pragma once
|
||||
|
||||
#include "pipeline.h"
|
||||
#include <queue>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Scoreboard {
|
||||
private:
|
||||
struct reg_use_t {
|
||||
RegType type;
|
||||
uint32_t reg;
|
||||
uint64_t owner;
|
||||
};
|
||||
|
||||
std::vector<RegMask> in_use_iregs_;
|
||||
std::vector<RegMask> in_use_fregs_;
|
||||
std::vector<RegMask> in_use_vregs_;
|
||||
std::unordered_map<uint32_t, uint64_t> owners_;
|
||||
|
||||
public:
|
||||
Scoreboard(const ArchDef &arch)
|
||||
: in_use_iregs_(arch.num_warps())
|
||||
, in_use_fregs_(arch.num_warps())
|
||||
, in_use_vregs_(arch.num_warps())
|
||||
{
|
||||
for (int w = 0; w < arch.num_warps(); ++w) {
|
||||
in_use_iregs_.at(w).reset();
|
||||
in_use_fregs_.at(w).reset();
|
||||
in_use_vregs_.at(w).reset();
|
||||
}
|
||||
}
|
||||
|
||||
bool in_use(pipeline_trace_t* state) const {
|
||||
return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0
|
||||
|| (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
|
||||
|| (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
|
||||
}
|
||||
|
||||
std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
|
||||
std::vector<reg_use_t> out;
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);
|
||||
while (used_iregs.any()) {
|
||||
if (used_iregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
|
||||
out.push_back({RegType::Integer, r, owners_.at(tag)});
|
||||
}
|
||||
used_iregs >>= 1;
|
||||
++r;
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
|
||||
while (used_fregs.any()) {
|
||||
if (used_fregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
|
||||
out.push_back({RegType::Float, r, owners_.at(tag)});
|
||||
}
|
||||
used_fregs >>= 1;
|
||||
++r;
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
|
||||
while (used_vregs.any()) {
|
||||
if (used_vregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
|
||||
out.push_back({RegType::Vector, r, owners_.at(tag)});
|
||||
}
|
||||
used_vregs >>= 1;
|
||||
++r;
|
||||
}
|
||||
}
|
||||
return std::move(out);
|
||||
}
|
||||
|
||||
void reserve(pipeline_trace_t* state) {
|
||||
if (!state->wb)
|
||||
return;
|
||||
switch (state->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = state->uuid;
|
||||
}
|
||||
|
||||
void release(pipeline_trace_t* state) {
|
||||
if (!state->wb)
|
||||
return;
|
||||
switch (state->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
owners_.erase(tag);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
93
sim/simx/sharedmem.h
Normal file
93
sim/simx/sharedmem.h
Normal file
@@ -0,0 +1,93 @@
|
||||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include <bitmanip.h>
|
||||
#include <vector>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Core;
|
||||
|
||||
// Banked shared-memory model. Each step every request port may issue one
// request; two requests that select the same bank in the same step conflict
// and the later one is left queued.
class SharedMem : public SimObject<SharedMem> {
public:
  struct Config {
    uint32_t num_reqs;      // number of request/response port pairs
    uint32_t num_banks;
    uint32_t bank_offset;   // lowest address bit used for bank selection
    uint32_t latency;       // NOTE(review): not read anywhere in this class
    bool     write_reponse; // also answer writes with a response
  };

  struct PerfStats {
    uint64_t reads;
    uint64_t writes;
    uint64_t bank_stalls;   // requests postponed because their bank was taken

    PerfStats()
      : reads(0)
      , writes(0)
      , bank_stalls(0)
    {}
  };

  std::vector<SimPort<MemReq>> Inputs;
  std::vector<SimPort<MemRsp>> Outputs;

  SharedMem(const SimContext& ctx, const char* name, const Config& config)
    : SimObject<SharedMem>(ctx, name)
    , Inputs(config.num_reqs, this)
    , Outputs(config.num_reqs, this)
    , config_(config)
    , bank_sel_addr_start_(config.bank_offset)
    , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
  {}

  virtual ~SharedMem() {}

  void step(uint64_t /*cycle*/) {
    // banks already claimed during this step
    std::vector<bool> in_used_banks(config_.num_banks);
    for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
      auto& core_req_port = this->Inputs.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      uint32_t bank_id = (uint32_t)bit_getw(
        core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);

      // bank conflict check: the request stays queued for a later step and
      // the stall is counted
      if (in_used_banks.at(bank_id)) {
        ++perf_stats_.bank_stalls;
        continue;
      }

      in_used_banks.at(bank_id) = true;

      if (!core_req.write || config_.write_reponse) {
        // send response
        // NOTE(review): the delay is fixed at 1 rather than config_.latency;
        // confirm this is intended.
        MemRsp core_rsp;
        core_rsp.tag = core_req.tag;
        this->Outputs.at(req_id).send(core_rsp, 1);
      }

      // update perf counters
      perf_stats_.reads += !core_req.write;
      perf_stats_.writes += core_req.write;

      // remove input
      core_req_port.pop();
    }
  }

  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

protected:
  Config    config_;
  uint32_t  bank_sel_addr_start_;  // first address bit of the bank index
  uint32_t  bank_sel_addr_end_;    // last address bit of the bank index
  PerfStats perf_stats_;
};
|
||||
|
||||
}
|
||||
92
sim/simx/tex_unit.cpp
Normal file
92
sim/simx/tex_unit.cpp
Normal file
@@ -0,0 +1,92 @@
|
||||
#include "tex_unit.h"
|
||||
#include "core.h"
|
||||
#include <texturing.h>
|
||||
#include <VX_config.h>
|
||||
|
||||
using namespace vortex;
|
||||
using namespace cocogfx;
|
||||
|
||||
// Texture filtering modes; the numeric values are what read() casts the
// TEX_STATE_FILTER state to.
enum class FilterMode {
  Point     = 0,
  Bilinear  = 1,
  Trilinear = 2,
};
|
||||
|
||||
// Binds the unit to its core and zero-initializes every texture state so
// that get_state()/read() never observe indeterminate values.
TexUnit::TexUnit(Core* core)
  : states_()
  , core_(core)
{}
|
||||
|
||||
// The unit owns nothing that needs explicit cleanup.
TexUnit::~TexUnit() = default;
|
||||
|
||||
uint32_t TexUnit::get_state(uint32_t state) {
|
||||
return states_.at(state);
|
||||
}
|
||||
|
||||
// Stores a texture state value; an index outside the state table throws
// std::out_of_range.
void TexUnit::set_state(uint32_t state, uint32_t value)
{
  uint32_t& slot = states_.at(state);
  slot = value;
}
|
||||
|
||||
uint32_t TexUnit::read(int32_t u,
|
||||
int32_t v,
|
||||
int32_t lod,
|
||||
std::vector<mem_addr_size_t>* mem_addrs) {
|
||||
//--
|
||||
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
|
||||
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
|
||||
uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
|
||||
uint32_t log_width = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
|
||||
uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
|
||||
auto format = (TexFormat)states_.at(TEX_STATE_FORMAT);
|
||||
auto filter = (FilterMode)states_.at(TEX_STATE_FILTER);
|
||||
auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU);
|
||||
auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV);
|
||||
|
||||
auto stride = Stride(format);
|
||||
|
||||
switch (filter) {
|
||||
case FilterMode::Bilinear: {
|
||||
// addressing
|
||||
uint32_t offset00, offset01, offset10, offset11;
|
||||
uint32_t alpha, beta;
|
||||
TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
|
||||
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
|
||||
|
||||
uint32_t addr00 = base_addr + offset00 * stride;
|
||||
uint32_t addr01 = base_addr + offset01 * stride;
|
||||
uint32_t addr10 = base_addr + offset10 * stride;
|
||||
uint32_t addr11 = base_addr + offset11 * stride;
|
||||
|
||||
// memory lookup
|
||||
uint32_t texel00 = core_->dcache_read(addr00, stride);
|
||||
uint32_t texel01 = core_->dcache_read(addr01, stride);
|
||||
uint32_t texel10 = core_->dcache_read(addr10, stride);
|
||||
uint32_t texel11 = core_->dcache_read(addr11, stride);
|
||||
|
||||
mem_addrs->push_back({addr00, stride});
|
||||
mem_addrs->push_back({addr01, stride});
|
||||
mem_addrs->push_back({addr10, stride});
|
||||
mem_addrs->push_back({addr11, stride});
|
||||
|
||||
// filtering
|
||||
auto color = TexFilterLinear(
|
||||
format, texel00, texel01, texel10, texel11, alpha, beta);
|
||||
return color;
|
||||
}
|
||||
case FilterMode::Point: {
|
||||
// addressing
|
||||
uint32_t offset;
|
||||
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
|
||||
|
||||
uint32_t addr = base_addr + offset * stride;
|
||||
|
||||
// memory lookup
|
||||
uint32_t texel = core_->dcache_read(addr, stride);
|
||||
mem_addrs->push_back({addr, stride});
|
||||
|
||||
// filtering
|
||||
auto color = TexFilterPoint(format, texel);
|
||||
return color;
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
26
sim/simx/tex_unit.h
Normal file
26
sim/simx/tex_unit.h
Normal file
@@ -0,0 +1,26 @@
|
||||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Core;
|
||||
|
||||
class TexUnit {
|
||||
public:
|
||||
TexUnit(Core* core);
|
||||
~TexUnit();
|
||||
|
||||
uint32_t get_state(uint32_t state);
|
||||
|
||||
void set_state(uint32_t state, uint32_t value);
|
||||
|
||||
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
|
||||
|
||||
private:
|
||||
|
||||
std::array<uint32_t, NUM_TEX_STATES> states_;
|
||||
Core* core_;
|
||||
};
|
||||
|
||||
}
|
||||
415
sim/simx/types.h
Normal file
415
sim/simx/types.h
Normal file
@@ -0,0 +1,415 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <bitset>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
#include <util.h>
|
||||
#include <VX_config.h>
|
||||
#include <simobject.h>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Basic scalar types of the simulated machine.
using Byte  = uint8_t;
using Word  = uint32_t;
using WordI = int32_t;

using Addr = uint32_t;
using Size = uint32_t;

// 32-entry bit masks over registers, threads and warps.
using RegMask    = std::bitset<32>;
using ThreadMask = std::bitset<32>;
using WarpMask   = std::bitset<32>;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Register file an operand belongs to.
enum class RegType {
  None,
  Integer,
  Float,
  Vector
};

// Prints the register-name prefix for a register file; None prints nothing.
inline std::ostream& operator<<(std::ostream& os, const RegType& type) {
  switch (type) {
  case RegType::None:
    return os;
  case RegType::Integer:
    return os << "r";
  case RegType::Float:
    return os << "fr";
  case RegType::Vector:
    return os << "vr";
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Execution unit classes; MAX is the number of real entries.
enum class ExeType {
  NOP,
  ALU,
  LSU,
  CSR,
  FPU,
  GPU,
  MAX,
};

// Prints the unit name; MAX prints nothing.
inline std::ostream& operator<<(std::ostream& os, const ExeType& type) {
  switch (type) {
  case ExeType::NOP:
    return os << "NOP";
  case ExeType::ALU:
    return os << "ALU";
  case ExeType::LSU:
    return os << "LSU";
  case ExeType::CSR:
    return os << "CSR";
  case ExeType::FPU:
    return os << "FPU";
  case ExeType::GPU:
    return os << "GPU";
  case ExeType::MAX:
    return os;
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// ALU operation classes.
enum class AluType {
  ARITH,
  BRANCH,
  IMUL,
  IDIV,
  CMOV,
};

// Prints the ALU operation class name.
inline std::ostream& operator<<(std::ostream& os, const AluType& type) {
  switch (type) {
  case AluType::ARITH:
    return os << "ARITH";
  case AluType::BRANCH:
    return os << "BRANCH";
  case AluType::IMUL:
    return os << "IMUL";
  case AluType::IDIV:
    return os << "IDIV";
  case AluType::CMOV:
    return os << "CMOV";
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Load/store unit operation classes.
enum class LsuType {
  LOAD,
  STORE,
  FENCE,
  PREFETCH,
};

// Prints the LSU operation class name.
inline std::ostream& operator<<(std::ostream& os, const LsuType& type) {
  switch (type) {
  case LsuType::LOAD:
    return os << "LOAD";
  case LsuType::STORE:
    return os << "STORE";
  case LsuType::FENCE:
    return os << "FENCE";
  case LsuType::PREFETCH:
    return os << "PREFETCH";
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Address space a memory access falls into.
enum class AddrType {
  Global,
  Shared,
  IO,
};

// Prints the address space name.
inline std::ostream& operator<<(std::ostream& os, const AddrType& type) {
  switch (type) {
  case AddrType::Global:
    return os << "Global";
  case AddrType::Shared:
    return os << "Shared";
  case AddrType::IO:
    return os << "IO";
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// One memory access: start address and size.
// NOTE(review): size is presumably in bytes; confirm against the callers.
struct mem_addr_size_t {
  uint64_t addr;  // start address of the access
  uint32_t size;  // extent of the access
};
|
||||
|
||||
// Classifies an access by address:
//   - Shared when shared memory is enabled and addr lies in
//     [SMEM_BASE_ADDR - SMEM_SIZE, SMEM_BASE_ADDR),
//   - IO when addr >= IO_BASE_ADDR,
//   - Global otherwise.
// 'size' is only used by the debug assertion that a shared access does not
// run past the end of the shared window.
inline AddrType get_addr_type(Word addr, uint32_t size) {
  __unused (size);
  if (SM_ENABLE
   && addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
   && addr < SMEM_BASE_ADDR) {
    assert((addr + size) <= SMEM_BASE_ADDR);
    return AddrType::Shared;
  }
  return (addr >= IO_BASE_ADDR) ? AddrType::IO : AddrType::Global;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Floating-point unit operation classes.
enum class FpuType {
  FNCP,
  FMA,
  FDIV,
  FSQRT,
  FCVT,
};

// Prints the FPU operation class name.
inline std::ostream& operator<<(std::ostream& os, const FpuType& type) {
  switch (type) {
  case FpuType::FNCP:
    return os << "FNCP";
  case FpuType::FMA:
    return os << "FMA";
  case FpuType::FDIV:
    return os << "FDIV";
  case FpuType::FSQRT:
    return os << "FSQRT";
  case FpuType::FCVT:
    return os << "FCVT";
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// GPU-specific instruction classes.
enum class GpuType {
  TMC,
  WSPAWN,
  SPLIT,
  JOIN,
  BAR,
  TEX,
};

// Prints the GPU instruction class name.
inline std::ostream& operator<<(std::ostream& os, const GpuType& type) {
  switch (type) {
  case GpuType::TMC:
    return os << "TMC";
  case GpuType::WSPAWN:
    return os << "WSPAWN";
  case GpuType::SPLIT:
    return os << "SPLIT";
  case GpuType::JOIN:
    return os << "JOIN";
  case GpuType::BAR:
    return os << "BAR";
  case GpuType::TEX:
    return os << "TEX";
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Arbitration policies.
enum class ArbiterType {
  Priority,
  RoundRobin
};

// Prints the arbitration policy name.
inline std::ostream& operator<<(std::ostream& os, const ArbiterType& type) {
  switch (type) {
  case ArbiterType::Priority:
    return os << "Priority";
  case ArbiterType::RoundRobin:
    return os << "RoundRobin";
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Memory request travelling from a requester towards memory.
// The tag is 64 bits wide, matching both the constructor parameter and
// MemRsp::tag, so a tag is never truncated on its way through the request.
struct MemReq {
  uint64_t addr;
  uint64_t tag;    // opaque id echoed back in the response
  bool     write;  // true for stores, false for loads
  bool     is_io;

  MemReq(uint64_t _addr = 0,
         uint64_t _tag = 0,
         bool _write = false,
         bool _is_io = false
  ) : addr(_addr)
    , tag(_tag)
    , write(_write)
    , is_io(_is_io)
  {}
};
|
||||
|
||||
// Memory response; carries only the tag of the request it answers.
struct MemRsp {
  uint64_t tag;

  MemRsp(uint64_t _tag = 0)
    : tag(_tag)
  {}
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Thin FIFO wrapper around std::queue exposing only the operations the
// simulator uses. front()/back()/pop() require a non-empty queue.
template <typename T>
class Queue {
public:
  Queue() {}

  // --- capacity ---
  bool empty() const { return queue_.empty(); }

  // --- element access ---
  T& front() { return queue_.front(); }
  const T& front() const { return queue_.front(); }

  T& back() { return queue_.back(); }
  const T& back() const { return queue_.back(); }

  // --- modifiers ---
  void push(const T& value) { queue_.push(value); }
  void pop() { queue_.pop(); }

protected:
  std::queue<T> queue_;
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Fixed-capacity slot table: allocate() stores a value in the lowest free
// slot and returns its index; release() frees a slot again.
template <typename T>
class HashTable {
private:
  // (occupied flag, stored value) per slot
  std::vector<std::pair<bool, T>> entries_;
  uint32_t size_;  // number of occupied slots

public:
  HashTable(uint32_t capacity)
    : entries_(capacity)
    , size_(0)
  {}

  bool empty() const { return size_ == 0; }

  bool full() const { return entries_.size() == size_; }

  uint32_t size() const { return size_; }

  // True if the slot at 'index' is occupied; throws std::out_of_range for
  // an index beyond the capacity.
  bool contains(uint32_t index) const {
    return entries_.at(index).first;
  }

  // Access to an occupied slot (asserted in debug builds).
  const T& at(uint32_t index) const {
    const auto& slot = entries_.at(index);
    assert(slot.first);
    return slot.second;
  }

  T& at(uint32_t index) {
    auto& slot = entries_.at(index);
    assert(slot.first);
    return slot.second;
  }

  // Stores 'value' in the first free slot and returns that slot's index.
  // Calling this on a full table is a programming error: it asserts in
  // debug builds and returns uint32_t(-1) otherwise.
  uint32_t allocate(const T& value) {
    const uint32_t capacity = entries_.size();
    uint32_t index = 0;
    while (index < capacity) {
      auto& slot = entries_.at(index);
      if (!slot.first) {
        slot.first = true;
        slot.second = value;
        ++size_;
        return index;
      }
      ++index;
    }
    assert(false);
    return static_cast<uint32_t>(-1);
  }

  // Frees an occupied slot; the stored value is left in place until the
  // slot is reused.
  void release(uint32_t index) {
    auto& slot = entries_.at(index);
    assert(slot.first);
    slot.first = false;
    --size_;
  }
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
|
||||
class Switch : public SimObject<Switch<Req, Rsp>> {
|
||||
private:
|
||||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
uint32_t cursor_;
|
||||
uint32_t tag_shift_;
|
||||
|
||||
public:
|
||||
Switch(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
uint32_t delay = 1
|
||||
)
|
||||
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
|
||||
, type_(type)
|
||||
, delay_(delay)
|
||||
, cursor_(0)
|
||||
, tag_shift_(log2ceil(num_inputs))
|
||||
, ReqIn(num_inputs, this)
|
||||
, ReqOut(this)
|
||||
, RspIn(this)
|
||||
, RspOut(num_inputs, this)
|
||||
{
|
||||
assert(delay_ != 0);
|
||||
assert(num_inputs <= MaxInputs);
|
||||
if (num_inputs == 1) {
|
||||
// bypass
|
||||
ReqIn.at(0).bind(&ReqOut);
|
||||
RspIn.bind(&RspOut.at(0));
|
||||
}
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
if (ReqIn.size() == 1)
|
||||
return;
|
||||
|
||||
// process incomming requests
|
||||
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
|
||||
uint32_t j = (cursor_ + i) % n;
|
||||
auto& req_in = ReqIn.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
if (tag_shift_) {
|
||||
req.tag = (req.tag << tag_shift_) | j;
|
||||
}
|
||||
ReqOut.send(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_cursor(j);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// process incoming reponses
|
||||
if (!RspIn.empty()) {
|
||||
auto& rsp = RspIn.front();
|
||||
uint32_t port_id = 0;
|
||||
if (tag_shift_) {
|
||||
port_id = rsp.tag & ((1 << tag_shift_)-1);
|
||||
rsp.tag >>= tag_shift_;
|
||||
}
|
||||
RspOut.at(port_id).send(rsp, 1);
|
||||
RspIn.pop();
|
||||
}
|
||||
}
|
||||
|
||||
void update_cursor(uint32_t grant) {
|
||||
if (type_ == ArbiterType::RoundRobin) {
|
||||
cursor_ = grant + 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<SimPort<Req>> ReqIn;
|
||||
SimPort<Req> ReqOut;
|
||||
SimPort<Rsp> RspIn;
|
||||
std::vector<SimPort<Rsp>> RspOut;
|
||||
};
|
||||
|
||||
}
|
||||
62
sim/simx/warp.cpp
Normal file
62
sim/simx/warp.cpp
Normal file
@@ -0,0 +1,62 @@
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <util.h>
|
||||
|
||||
#include "instr.h"
|
||||
#include "core.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Creates an inactive warp at the startup address with an empty thread mask.
// The integer and float register files hold num_regs words for each thread;
// the vector register file holds vsize bytes for each register. All
// registers and the vector configuration (vtype_, vl_) start at zero so no
// member is ever read uninitialized.
Warp::Warp(Core *core, Word id)
    : id_(id)
    , core_(core)
    , active_(false)
    , PC_(STARTUP_ADDR)
    , tmask_(0)
    , vtype_()
    , vl_(0) {
  iRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
  fRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
  vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
}
|
||||
|
||||
// Fetches the instruction at the current PC, decodes it, fills in the
// identifying fields of 'trace' and executes it. An undecodable instruction
// is reported on stdout and aborts the process.
void Warp::eval(pipeline_trace_t *trace) {
  assert(tmask_.any());

  // fetch banner: thread mask printed with the highest thread first
  DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
  for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) {
    DPN(2, tmask_.test(n-i-1));
  }
  DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);

  /* Fetch and decode. */

  Word instr_code = core_->icache_read(PC_, sizeof(Word));
  auto instr = core_->decoder().decode(instr_code);
  if (!instr) {
    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
    std::abort();
  }

  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);

  // Update trace
  trace->cid        = core_->id();
  trace->wid        = id_;
  trace->PC         = PC_;
  trace->tmask      = tmask_;
  trace->rdest      = instr->getRDest();
  trace->rdest_type = instr->getRDType();

  // Execute
  this->execute(*instr, trace);

  // integer register dump: one row per register, one column per thread
  DP(4, "Register state:");
  for (int i = 0; i < core_->arch().num_regs(); ++i) {
    DPN(4, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
    for (int j = 0; j < core_->arch().num_threads(); ++j) {
      DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' ');
    }
    DPN(4, std::endl);
  }
}
|
||||
114
sim/simx/warp.h
Normal file
114
sim/simx/warp.h
Normal file
@@ -0,0 +1,114 @@
|
||||
#ifndef __WARP_H
|
||||
#define __WARP_H
|
||||
|
||||
#include <vector>
|
||||
#include <stack>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Forward declarations; the class-key matches each type's definition
// (pipeline_trace_t is defined as a struct in pipeline.h).
class Core;
class Instr;
struct pipeline_trace_t;
|
||||
struct DomStackEntry {
|
||||
DomStackEntry(const ThreadMask &tmask, Word PC)
|
||||
: tmask(tmask)
|
||||
, PC(PC)
|
||||
, fallThrough(false)
|
||||
, unanimous(false)
|
||||
{}
|
||||
|
||||
DomStackEntry(const ThreadMask &tmask)
|
||||
: tmask(tmask)
|
||||
, PC(0)
|
||||
, fallThrough(true)
|
||||
, unanimous(false)
|
||||
{}
|
||||
|
||||
ThreadMask tmask;
|
||||
Word PC;
|
||||
bool fallThrough;
|
||||
bool unanimous;
|
||||
};
|
||||
|
||||
// Vector configuration fields kept per warp.
// NOTE(review): the names look like the RISC-V vector `vtype` CSR fields;
// confirm against the code that reads and writes them.
struct vtype {
  int vill;
  int vediv;
  int vsew;
  int vlmul;
};
|
||||
|
||||
class Warp {
|
||||
public:
|
||||
Warp(Core *core, Word id);
|
||||
|
||||
bool active() const {
|
||||
return active_;
|
||||
}
|
||||
|
||||
void suspend() {
|
||||
active_ = false;
|
||||
}
|
||||
|
||||
void activate() {
|
||||
active_ = true;
|
||||
}
|
||||
|
||||
std::size_t getActiveThreads() const {
|
||||
if (active_)
|
||||
return tmask_.count();
|
||||
return 0;
|
||||
}
|
||||
|
||||
Word id() const {
|
||||
return id_;
|
||||
}
|
||||
|
||||
Word getPC() const {
|
||||
return PC_;
|
||||
}
|
||||
|
||||
void setPC(Word PC) {
|
||||
PC_ = PC;
|
||||
}
|
||||
|
||||
void setTmask(size_t index, bool value) {
|
||||
tmask_.set(index, value);
|
||||
active_ = tmask_.any();
|
||||
}
|
||||
|
||||
Word getTmask() const {
|
||||
if (active_)
|
||||
return tmask_.to_ulong();
|
||||
return 0;
|
||||
}
|
||||
|
||||
Word getIRegValue(int reg) const {
|
||||
return iRegFile_.at(0).at(reg);
|
||||
}
|
||||
|
||||
void eval(pipeline_trace_t *);
|
||||
|
||||
private:
|
||||
|
||||
void execute(const Instr &instr, pipeline_trace_t *trace);
|
||||
|
||||
Word id_;
|
||||
Core *core_;
|
||||
bool active_;
|
||||
|
||||
Word PC_;
|
||||
ThreadMask tmask_;
|
||||
|
||||
std::vector<std::vector<Word>> iRegFile_;
|
||||
std::vector<std::vector<Word>> fRegFile_;
|
||||
std::vector<std::vector<Byte>> vRegFile_;
|
||||
std::stack<DomStackEntry> domStack_;
|
||||
|
||||
struct vtype vtype_;
|
||||
int vl_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user