Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

View File

@@ -1,45 +1,36 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../hw/rtl
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
CXXFLAGS += $(CONFIGS)
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))
#$(info OBJS is $(OBJS))
#$(info VPATH is $(VPATH))
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
else
CXXFLAGS += -O2 -DNDEBUG
endif
# XLEN parameterization
ifdef XLEN
CXXFLAGS += -DXLEN=$(XLEN)
endif
PROJECT = simx
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@
@@ -48,4 +39,4 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so

87
sim/simx/arch.h Normal file
View File

@@ -0,0 +1,87 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
class Arch {
private:
uint16_t num_threads_;
uint16_t num_warps_;
uint16_t num_cores_;
uint16_t num_clusters_;
uint16_t vsize_;
uint16_t num_regs_;
uint16_t num_csrs_;
uint16_t num_barriers_;
uint16_t ipdom_size_;
public:
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)
: num_threads_(num_threads)
, num_warps_(num_warps)
, num_cores_(num_cores)
, num_clusters_(num_clusters)
, vsize_(16)
, num_regs_(32)
, num_csrs_(4096)
, num_barriers_(NUM_BARRIERS)
, ipdom_size_((num_threads-1) * 2)
{}
uint16_t vsize() const {
return vsize_;
}
uint16_t num_regs() const {
return num_regs_;
}
uint16_t num_csrs() const {
return num_csrs_;
}
uint16_t num_barriers() const {
return num_barriers_;
}
uint16_t ipdom_size() const {
return ipdom_size_;
}
uint16_t num_threads() const {
return num_threads_;
}
uint16_t num_warps() const {
return num_warps_;
}
uint16_t num_cores() const {
return num_cores_;
}
uint16_t num_clusters() const {
return num_clusters_;
}
};
}

View File

@@ -1,70 +0,0 @@
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
class ArchDef {
private:
uint16_t num_cores_;
uint16_t num_warps_;
uint16_t num_threads_;
uint16_t wsize_;
uint16_t vsize_;
uint16_t num_regs_;
uint16_t num_csrs_;
uint16_t num_barriers_;
public:
ArchDef(uint16_t num_cores,
uint16_t num_warps,
uint16_t num_threads)
: num_cores_(num_cores)
, num_warps_(num_warps)
, num_threads_(num_threads)
, wsize_(4)
, vsize_(16)
, num_regs_(32)
, num_csrs_(4096)
, num_barriers_(NUM_BARRIERS)
{}
uint16_t wsize() const {
return wsize_;
}
uint16_t vsize() const {
return vsize_;
}
uint16_t num_regs() const {
return num_regs_;
}
uint16_t num_csrs() const {
return num_csrs_;
}
uint16_t num_barriers() const {
return num_barriers_;
}
uint16_t num_threads() const {
return num_threads_;
}
uint16_t num_warps() const {
return num_warps_;
}
uint16_t num_cores() const {
return num_cores_;
}
};
}

View File

@@ -1,47 +0,0 @@
#include <iostream>
#include <string>
#include "args.h"
using namespace vortex;
using std::string;
std::string CommandLineArg::helpString_;
std::unordered_map<string, CommandLineArg *> CommandLineArg::longArgs_;
std::unordered_map<string, CommandLineArg *> CommandLineArg::shortArgs_;
CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
helpString_ += helpText;
longArgs_[l] = this;
shortArgs_[s] = this;
}
CommandLineArg::CommandLineArg(string l, const char *helpText) {
helpString_ += helpText;
longArgs_[l] = this;
}
void CommandLineArg::readArgs(int argc, char **argv) {
for (int i = 0; i < argc; i++) {
std::unordered_map<string, CommandLineArg *>::iterator
s = shortArgs_.find(std::string(argv[i])),
l = longArgs_.find(std::string(argv[i]));
if (s != shortArgs_.end()) {
i += s->second->read(argc - i, &argv[i]);
} else if (l != longArgs_.end()) {
i += l->second->read(argc - i, &argv[i]);
} else {
throw BadArg(string(argv[i]));
}
}
}
void CommandLineArg::clearArgs() {
shortArgs_.clear();
longArgs_.clear();
helpString_ = "";
}
void CommandLineArg::showHelp(std::ostream &os) {
os << helpString_;
}

View File

@@ -1,64 +0,0 @@
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <util.h>
namespace vortex {
struct BadArg { BadArg(std::string s) : arg(s) {} std::string arg; };
class CommandLineArg {
public:
CommandLineArg(std::string s, std::string l, const char *helpText);
CommandLineArg(std::string l, const char *helpText);
virtual int read(int argc, char** argv) = 0;
static void readArgs(int argc, char **argv);
static void clearArgs();
static void showHelp(std::ostream &os);
private:
static std::string helpString_;
static std::unordered_map<std::string, CommandLineArg *> longArgs_;
static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
};
template <typename T> class CommandLineArgSetter : public CommandLineArg {
public:
CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
CommandLineArg(s, l, ht), arg_(x) {}
CommandLineArgSetter(std::string l, const char *ht, T &x) :
CommandLineArg(l, ht), arg_(x) {}
int read(int argc, char **argv) {
__unused (argc);
std::istringstream iss(argv[1]);
iss >> arg_;
return 1;
}
private:
T &arg_;
};
class CommandLineArgFlag : public CommandLineArg {
public:
CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x) :
CommandLineArg(s, l, ht), arg_(x) { arg_ = false; }
CommandLineArgFlag(std::string l, const char *ht, bool &x) :
CommandLineArg(l, ht), arg_(x) { arg_ = false; }
int read(int argc, char **argv) {
__unused (argc, argv);
arg_ = true;
return 0;
}
private:
bool &arg_;
};
}

View File

@@ -1,637 +0,0 @@
#include "cache.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
struct params_t {
uint32_t sets_per_bank;
uint32_t blocks_per_set;
uint32_t words_per_block;
uint32_t log2_num_inputs;
uint32_t word_select_addr_start;
uint32_t word_select_addr_end;
uint32_t bank_select_addr_start;
uint32_t bank_select_addr_end;
uint32_t set_select_addr_start;
uint32_t set_select_addr_end;
uint32_t tag_select_addr_start;
uint32_t tag_select_addr_end;
params_t(const Cache::Config& config) {
uint32_t bank_bits = log2ceil(config.num_banks);
uint32_t offset_bits = config.B - config.W;
uint32_t log2_bank_size = config.C - bank_bits;
uint32_t index_bits = log2_bank_size - (config.B << config.A);
assert(log2_bank_size >= config.B);
this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_block = 1 << offset_bits;
this->blocks_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits;
assert(config.ports_per_bank <= this->words_per_block);
// Word select
this->word_select_addr_start = config.W;
this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
// Bank select
this->bank_select_addr_start = (1+this->word_select_addr_end);
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
// Set select
this->set_select_addr_start = (1+this->bank_select_addr_end);
this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
// Tag select
this->tag_select_addr_start = (1+this->set_select_addr_end);
this->tag_select_addr_end = (config.addr_width-1);
}
uint32_t addr_bank_id(uint64_t word_addr) const {
if (bank_select_addr_end >= bank_select_addr_start)
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
else
return 0;
}
uint32_t addr_set_id(uint64_t word_addr) const {
if (set_select_addr_end >= set_select_addr_start)
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
else
return 0;
}
uint64_t addr_tag(uint64_t word_addr) const {
if (tag_select_addr_end >= tag_select_addr_start)
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
else
return 0;
}
uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
uint64_t addr(0);
if (bank_select_addr_end >= bank_select_addr_start)
addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
if (set_select_addr_end >= set_select_addr_start)
addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
if (tag_select_addr_end >= tag_select_addr_start)
addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
return addr;
}
};
struct block_t {
bool valid;
bool dirty;
uint64_t tag;
uint32_t lru_ctr;
};
struct set_t {
std::vector<block_t> blocks;
set_t(uint32_t size) : blocks(size) {}
void clear() {
for (auto& block : blocks) {
block.valid = false;
}
}
};
struct bank_req_info_t {
bool valid;
uint32_t req_id;
uint64_t req_tag;
};
struct bank_req_t {
bool valid;
bool write;
bool mshr_replay;
uint64_t tag;
uint32_t set_id;
uint32_t core_id;
uint64_t uuid;
std::vector<bank_req_info_t> infos;
bank_req_t(uint32_t size)
: valid(false)
, write(false)
, mshr_replay(false)
, tag(0)
, set_id(0)
, core_id(0)
, uuid(0)
, infos(size)
{}
};
struct mshr_entry_t : public bank_req_t {
uint32_t block_id;
mshr_entry_t(uint32_t size = 0)
: bank_req_t(size)
, block_id(0)
{}
};
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t size_;
public:
MSHR(uint32_t size)
: entries_(size)
, size_(0)
{}
bool empty() const {
return (0 == size_);
}
bool full() const {
return (size_ == entries_.size());
}
int lookup(const bank_req_t& bank_req) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (entry.valid
&& entry.set_id == bank_req.set_id
&& entry.tag == bank_req.tag) {
return i;
}
}
return -1;
}
int allocate(const bank_req_t& bank_req, uint32_t block_id) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (!entry.valid) {
*(bank_req_t*)&entry = bank_req;
entry.valid = true;
entry.mshr_replay = false;
entry.block_id = block_id;
++size_;
return i;
}
}
return -1;
}
mshr_entry_t& replay(uint32_t id) {
auto& root_entry = entries_.at(id);
assert(root_entry.valid);
// make all related mshr entries for replay
for (auto& entry : entries_) {
if (entry.valid
&& entry.set_id == root_entry.set_id
&& entry.tag == root_entry.tag) {
entry.mshr_replay = true;
}
}
return root_entry;
}
bool pop(bank_req_t* out) {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
*out = entry;
entry.valid = false;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
entry.valid = false;
}
}
size_ = 0;
}
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const Cache::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size)
{}
void clear() {
mshr.clear();
for (auto& set : sets) {
set.clear();
}
}
};
///////////////////////////////////////////////////////////////////////////////
class Cache::Impl {
private:
Cache* const simobject_;
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr mem_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
uint32_t flush_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
public:
Impl(Cache* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
{
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspIn);
if (config.num_banks > 1) {
mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
}
mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
} else {
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
}
// calculate tag flush cycles
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
}
void reset() {
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
void tick() {
// wait on flush cycles
if (flush_cycles_ != 0) {
--flush_cycles_;
return;
}
// per-bank pipeline request
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
// handle bypasss responses
auto& bypass_port = bypass_switch_->RspOut.at(1);
if (!bypass_port.empty()) {
auto& mem_rsp = bypass_port.front();
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
bypass_port.pop();
}
// handle MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// handle memory fills
std::vector<bool> pending_fill_req(config_.num_banks, false);
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (!mem_rsp_port.empty()) {
auto& mem_rsp = mem_rsp_port.front();
this->processMemoryFill(bank_id, mem_rsp.tag);
pending_fill_req.at(bank_id) = true;
mem_rsp_port.pop();
}
}
// handle incoming core requests
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
// check cache bypassing
if (core_req.non_cacheable) {
// send IO request
this->processIORequest(core_req, req_id);
// remove request
core_req_port.pop();
continue;
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
// create bank request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.valid = true;
bank_req.write = core_req.write;
bank_req.mshr_replay = false;
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.core_id = core_req.core_id;
bank_req.uuid = core_req.uuid;
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs.at(bank_id);
// check pending MSHR replay
if (pipeline_req.valid
&& pipeline_req.mshr_replay) {
// stall
continue;
}
// check pending fill request
if (pending_fill_req.at(bank_id)) {
// stall
continue;
}
// check MSHR capacity if read or writeback
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
continue;
}
// check bank conflicts
if (pipeline_req.valid) {
// check port conflict
if (pipeline_req.write != core_req.write
|| pipeline_req.set_id != set_id
|| pipeline_req.tag != tag
|| pipeline_req.infos[port_id].valid) {
++perf_stats_.bank_stalls;
continue;
}
// update pending request infos
pipeline_req.infos[port_id] = bank_req.infos[port_id];
} else {
// schedule new request
pipeline_req = bank_req;
}
if (core_req.write)
++perf_stats_.writes;
else
++perf_stats_.reads;
// remove request
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequest(pipeline_reqs);
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
private:
void processIORequest(const MemReq& core_req, uint32_t req_id) {
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
if (core_req.write && config_.write_reponse) {
MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
// update block
auto& bank = banks_.at(bank_id);
auto& entry = bank.mshr.replay(mshr_id);
auto& set = bank.sets.at(entry.set_id);
auto& block = set.blocks.at(entry.block_id);
block.valid = true;
block.tag = entry.tag;
--pending_fill_reqs_;
}
void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& pipeline_req = pipeline_reqs.at(bank_id);
if (!pipeline_req.valid)
continue;
auto& bank = banks_.at(bank_id);
auto& set = bank.sets.at(pipeline_req.set_id);
if (pipeline_req.mshr_replay) {
// send core response
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
} else {
bool hit = false;
bool found_free_block = false;
uint32_t hit_block_id = 0;
uint32_t repl_block_id = 0;
uint32_t max_cnt = 0;
for (uint32_t i = 0, n = set.blocks.size(); i < n; ++i) {
auto& block = set.blocks.at(i);
if (block.valid) {
if (block.tag == pipeline_req.tag) {
block.lru_ctr = 0;
hit_block_id = i;
hit = true;
} else {
++block.lru_ctr;
}
if (max_cnt < block.lru_ctr) {
max_cnt = block.lru_ctr;
repl_block_id = i;
}
} else {
found_free_block = true;
repl_block_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (pipeline_req.write) {
// handle write hit
auto& hit_block = set.blocks.at(hit_block_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
} else {
// mark block as dirty
hit_block.dirty = true;
}
}
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
//
// Miss handling
//
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_block && !config_.write_through) {
// write back dirty block
auto& repl_block = set.blocks.at(repl_block_id);
if (repl_block.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++perf_stats_.evictions;
}
}
if (pipeline_req.write && config_.write_through) {
// forward write request to memory
{
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
// MSHR lookup
int pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
// send fill request
if (pending == -1) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++pending_fill_reqs_;
}
}
}
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
Cache::Cache(const SimContext& ctx, const char* name, const Config& config)
: SimObject<Cache>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
Cache::~Cache() {
delete impl_;
}
void Cache::reset() {
impl_->reset();
}
void Cache::tick() {
impl_->tick();
}
const Cache::PerfStats& Cache::perf_stats() const {
return impl_->perf_stats();
}

106
sim/simx/cache_cluster.h Normal file
View File

@@ -0,0 +1,106 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "cache_sim.h"
namespace vortex {
class CacheCluster : public SimObject<CacheCluster> {
public:
std::vector<std::vector<SimPort<MemReq>>> CoreReqPorts;
std::vector<std::vector<SimPort<MemRsp>>> CoreRspPorts;
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
CacheCluster(const SimContext& ctx,
const char* name,
uint32_t num_units,
uint32_t num_caches,
uint32_t num_requests,
const CacheSim::Config& config)
: SimObject(ctx, name)
, CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
, CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
, MemReqPort(this)
, MemRspPort(this)
, caches_(MAX(num_caches, 0x1)) {
CacheSim::Config config2(config);
if (0 == num_caches) {
num_caches = 1;
config2.bypass = true;
}
char sname[100];
std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
for (uint32_t u = 0; u < num_units; ++u) {
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
for (uint32_t i = 0; i < num_requests; ++i) {
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
}
}
std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
for (uint32_t i = 0; i < config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
for (uint32_t u = 0; u < num_units; ++u) {
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
}
}
snprintf(sname, 100, "%s-cache-arb", name);
auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
for (uint32_t i = 0; i < num_caches; ++i) {
snprintf(sname, 100, "%s-cache%d", name, i);
caches_.at(i) = CacheSim::Create(sname, config2);
for (uint32_t j = 0; j < config.num_inputs; ++j) {
mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
}
caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
}
cache_arb->ReqOut.at(0).bind(&this->MemReqPort);
this->MemRspPort.bind(&cache_arb->RspOut.at(0));
}
~CacheCluster() {}
void reset() {}
void tick() {}
CacheSim::PerfStats perf_stats() const {
CacheSim::PerfStats perf;
for (auto cache : caches_) {
perf += cache->perf_stats();
}
return perf;
}
private:
std::vector<CacheSim::Ptr> caches_;
};
}

707
sim/simx/cache_sim.cpp Normal file
View File

@@ -0,0 +1,707 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cache_sim.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
struct params_t {
uint32_t sets_per_bank;
uint32_t lines_per_set;
uint32_t words_per_line;
uint32_t log2_num_inputs;
uint32_t word_select_addr_start;
uint32_t word_select_addr_end;
uint32_t bank_select_addr_start;
uint32_t bank_select_addr_end;
uint32_t set_select_addr_start;
uint32_t set_select_addr_end;
uint32_t tag_select_addr_start;
uint32_t tag_select_addr_end;
params_t(const CacheSim::Config& config) {
int32_t bank_bits = log2ceil(config.num_banks);
int32_t offset_bits = config.B - config.W;
int32_t log2_bank_size = config.C - bank_bits;
int32_t index_bits = log2_bank_size - (config.B + config.A);
assert(log2_bank_size > 0);
assert(offset_bits >= 0);
assert(index_bits >= 0);
this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_line = 1 << offset_bits;
this->lines_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits;
assert(config.ports_per_bank <= this->words_per_line);
// Word select
this->word_select_addr_start = config.W;
this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
// Bank select
this->bank_select_addr_start = (1+this->word_select_addr_end);
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
// Set select
this->set_select_addr_start = (1+this->bank_select_addr_end);
this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
// Tag select
this->tag_select_addr_start = (1+this->set_select_addr_end);
this->tag_select_addr_end = (config.addr_width-1);
}
uint32_t addr_bank_id(uint64_t word_addr) const {
if (bank_select_addr_end >= bank_select_addr_start)
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
else
return 0;
}
uint32_t addr_set_id(uint64_t word_addr) const {
if (set_select_addr_end >= set_select_addr_start)
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
else
return 0;
}
uint64_t addr_tag(uint64_t word_addr) const {
if (tag_select_addr_end >= tag_select_addr_start)
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
else
return 0;
}
uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
uint64_t addr(0);
if (bank_select_addr_end >= bank_select_addr_start)
addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
if (set_select_addr_end >= set_select_addr_start)
addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
if (tag_select_addr_end >= tag_select_addr_start)
addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
return addr;
}
};
struct line_t {
uint64_t tag;
uint32_t lru_ctr;
bool valid;
bool dirty;
void clear() {
valid = false;
dirty = false;
}
};
struct set_t {
std::vector<line_t> lines;
set_t(uint32_t num_ways)
: lines(num_ways)
{}
void clear() {
for (auto& line : lines) {
line.clear();
}
}
};
struct bank_req_port_t {
uint32_t req_id;
uint64_t req_tag;
bool valid;
void clear() {
valid = false;
}
};
struct bank_req_t {
enum ReqType {
None = 0,
Fill = 1,
Replay = 2,
Core = 3
};
std::vector<bank_req_port_t> ports;
uint64_t tag;
uint32_t set_id;
uint32_t cid;
uint64_t uuid;
ReqType type;
bool write;
bank_req_t(uint32_t num_ports)
: ports(num_ports)
{}
void clear() {
for (auto& port : ports) {
port.clear();
}
type = ReqType::None;
}
};
struct mshr_entry_t {
bank_req_t bank_req;
uint32_t line_id;
mshr_entry_t(uint32_t num_ports)
: bank_req(num_ports)
{}
void clear() {
bank_req.clear();
}
};
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t size_;
public:
MSHR(uint32_t size, uint32_t num_ports)
: entries_(size, num_ports)
, size_(0)
{}
bool empty() const {
return (0 == size_);
}
bool full() const {
return (size_ == entries_.size());
}
bool lookup(const bank_req_t& bank_req) {
for (auto& entry : entries_) {;
if (entry.bank_req.type != bank_req_t::None
&& entry.bank_req.set_id == bank_req.set_id
&& entry.bank_req.tag == bank_req.tag) {
return true;
}
}
return false;
}
int allocate(const bank_req_t& bank_req, uint32_t line_id) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (entry.bank_req.type == bank_req_t::None) {
entry.bank_req = bank_req;
entry.line_id = line_id;
++size_;
return i;
}
}
return -1;
}
mshr_entry_t& replay(uint32_t id) {
auto& root_entry = entries_.at(id);
assert(root_entry.bank_req.type == bank_req_t::Core);
// mark all related mshr entries for replay
for (auto& entry : entries_) {
if (entry.bank_req.type == bank_req_t::Core
&& entry.bank_req.set_id == root_entry.bank_req.set_id
&& entry.bank_req.tag == root_entry.bank_req.tag) {
entry.bank_req.type = bank_req_t::Replay;
}
}
return root_entry;
}
bool pop(bank_req_t* out) {
for (auto& entry : entries_) {
if (entry.bank_req.type == bank_req_t::Replay) {
*out = entry.bank_req;
entry.bank_req.type = bank_req_t::None;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
entry.clear();
}
size_ = 0;
}
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const CacheSim::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.lines_per_set)
, mshr(config.mshr_size, config.ports_per_bank)
{}
void clear() {
for (auto& set : sets) {
set.clear();
}
mshr.clear();
}
};
///////////////////////////////////////////////////////////////////////////////
class CacheSim::Impl {
private:
CacheSim* const simobject_;
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr bank_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
std::vector<bank_req_t> pipeline_reqs_;
uint32_t init_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
public:
Impl(CacheSim* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
, pipeline_reqs_(config.num_banks, config.ports_per_bank)
{
char sname[100];
snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
if (config_.bypass) {
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
for (uint32_t i = 0; i < config_.num_inputs; ++i) {
simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
}
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
return;
}
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
if (config.num_banks > 1) {
snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
}
bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0));
} else {
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0));
}
// calculate cache initialization cycles
init_cycles_ = params_.sets_per_bank * params_.lines_per_set;
}
void reset() {
if (config_.bypass)
return;
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
void tick() {
if (config_.bypass)
return;
// wait on cache initialization cycles
if (init_cycles_ != 0) {
--init_cycles_;
return;
}
// handle cache bypasss responses
{
auto& bypass_port = bypass_switch_->RspIn.at(1);
if (!bypass_port.empty()) {
auto& mem_rsp = bypass_port.front();
this->processBypassResponse(mem_rsp);
bypass_port.pop();
}
}
// initialize pipeline request
for (auto& pipeline_req : pipeline_reqs_) {
pipeline_req.clear();
}
// schedule MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// schedule memory fill
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (mem_rsp_port.empty())
continue;
auto& pipeline_req = pipeline_reqs_.at(bank_id);
if (pipeline_req.type != bank_req_t::None)
continue;
auto& mem_rsp = mem_rsp_port.front();
DT(3, simobject_->name() << "-dram-" << mem_rsp);
pipeline_req.type = bank_req_t::Fill;
pipeline_req.tag = mem_rsp.tag;
mem_rsp_port.pop();
}
// schedule core requests
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
// check cache bypassing
if (core_req.type == AddrType::IO) {
// send bypass request
this->processBypassRequest(core_req, req_id);
// remove request
core_req_port.pop();
continue;
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// check MSHR capacity
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
++perf_stats_.bank_stalls;
continue;
}
// check bank conflicts
if (pipeline_req.type == bank_req_t::Core) {
// check port conflict
if (pipeline_req.write != core_req.write
|| pipeline_req.set_id != set_id
|| pipeline_req.tag != tag
|| pipeline_req.ports.at(port_id).valid) {
++perf_stats_.bank_stalls;
continue;
}
// extend request ports
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
} else if (pipeline_req.type == bank_req_t::None) {
// schedule new request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.cid = core_req.cid;
bank_req.uuid = core_req.uuid;
bank_req.type = bank_req_t::Core;
bank_req.write = core_req.write;
pipeline_req = bank_req;
} else {
// bank in use
++perf_stats_.bank_stalls;
continue;
}
if (core_req.write)
++perf_stats_.writes;
else
++perf_stats_.reads;
// remove request
DT(3, simobject_->name() << "-core-" << core_req);
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequests();
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
private:
void processBypassResponse(const MemRsp& mem_rsp) {
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
void processBypassRequest(const MemReq& core_req, uint32_t req_id) {
DT(3, simobject_->name() << "-core-" << core_req);
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
}
if (core_req.write && config_.write_reponse) {
MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
void processBankRequests() {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto pipeline_req = pipeline_reqs_.at(bank_id);
switch (pipeline_req.type) {
case bank_req_t::None:
break;
case bank_req_t::Fill: {
// update cache line
auto& bank = banks_.at(bank_id);
auto& entry = bank.mshr.replay(pipeline_req.tag);
auto& set = bank.sets.at(entry.bank_req.set_id);
auto& line = set.lines.at(entry.line_id);
line.valid = true;
line.tag = entry.bank_req.tag;
--pending_fill_reqs_;
} break;
case bank_req_t::Replay: {
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} break;
case bank_req_t::Core: {
bool hit = false;
bool found_free_line = false;
uint32_t hit_line_id = 0;
uint32_t repl_line_id = 0;
uint32_t max_cnt = 0;
auto& set = bank.sets.at(pipeline_req.set_id);
// tag lookup
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
auto& line = set.lines.at(i);
if (line.valid) {
if (line.tag == pipeline_req.tag) {
line.lru_ctr = 0;
hit_line_id = i;
hit = true;
} else {
++line.lru_ctr;
}
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
} else {
found_free_line = true;
repl_line_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (pipeline_req.write) {
// handle write hit
auto& hit_line = set.lines.at(hit_line_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
} else {
// mark line as dirty
hit_line.dirty = true;
}
}
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
//
// Miss handling
//
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_line && !config_.write_through) {
// write back dirty line
auto& repl_line = set.lines.at(repl_line_id);
if (repl_line.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++perf_stats_.evictions;
}
}
if (pipeline_req.write && config_.write_through) {
// forward write request to memory
{
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
// MSHR lookup
auto mshr_pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
// send fill request
if (!mshr_pending) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++pending_fill_reqs_;
}
}
}
} break;
}
}
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
}
};
///////////////////////////////////////////////////////////////////////////////
CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config)
: SimObject<CacheSim>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
CacheSim::~CacheSim() {
delete impl_;
}
void CacheSim::reset() {
impl_->reset();
}
void CacheSim::tick() {
impl_->tick();
}
const CacheSim::PerfStats& CacheSim::perf_stats() const {
return impl_->perf_stats();
}

View File

@@ -1,13 +1,27 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "memsim.h"
#include "mem_sim.h"
namespace vortex {
class Cache : public SimObject<Cache> {
class CacheSim : public SimObject<CacheSim> {
public:
struct Config {
bool bypass; // cache bypass
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t W; // log2 word size
@@ -45,6 +59,19 @@ public:
, mshr_stalls(0)
, mem_latency(0)
{}
PerfStats& operator+=(const PerfStats& rhs) {
this->reads += rhs.reads;
this->writes += rhs.writes;
this->read_misses += rhs.read_misses;
this->write_misses += rhs.write_misses;
this->evictions += rhs.evictions;
this->pipeline_stalls += rhs.pipeline_stalls;
this->bank_stalls += rhs.bank_stalls;
this->mshr_stalls += rhs.mshr_stalls;
this->mem_latency += rhs.mem_latency;
return *this;
}
};
std::vector<SimPort<MemReq>> CoreReqPorts;
@@ -52,8 +79,8 @@ public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
Cache(const SimContext& ctx, const char* name, const Config& config);
~Cache();
CacheSim(const SimContext& ctx, const char* name, const Config& config);
~CacheSim();
void reset();

222
sim/simx/cluster.cpp Normal file
View File

@@ -0,0 +1,222 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cluster.h"
using namespace vortex;
Cluster::Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch, const
DCRS &dcrs)
: SimObject(ctx, "cluster")
, mem_req_port(this)
, mem_rsp_port(this)
, cluster_id_(cluster_id)
, cores_(arch.num_cores())
, barriers_(arch.num_barriers(), 0)
, sharedmems_(arch.num_cores())
, processor_(processor)
{
auto num_cores = arch.num_cores();
char sname[100];
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED,
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L2_NUM_WAYS), // W
0, // A
XLEN, // address bits
L2_NUM_BANKS, // number of banks
1, // number of ports
5, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
l2cache_->MemReqPort.bind(&this->mem_req_port);
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
snprintf(sname, 100, "cluster%d-icaches", cluster_id);
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
!ICACHE_ENABLED,
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(uint32_t)), // W
log2ceil(ICACHE_NUM_WAYS),// A
XLEN, // address bits
1, // number of banks
1, // number of ports
1, // number of inputs
true, // write-through
false, // write response
0, // victim size
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency
});
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_NUM_WAYS),// A
XLEN, // address bits
DCACHE_NUM_BANKS, // number of banks
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
true, // write-through
false, // write response
0, // victim size
DCACHE_MSHR_SIZE, // mshr
4, // pipeline latency
});
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
///////////////////////////////////////////////////////////////////////////
// create shared memory blocks
for (uint32_t i = 0; i < num_cores; ++i) {
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
}
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
uint32_t core_id = cluster_id * num_cores + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs,
sharedmems_.at(i));
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
auto smem_demux = SMemDemux::Create(sname);
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
}
}
}
Cluster::~Cluster() {
//--
}
void Cluster::reset() {
for (auto& barrier : barriers_) {
barrier.reset();
}
}
void Cluster::tick() {
//--
}
void Cluster::attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
}
}
bool Cluster::running() const {
for (auto& core : cores_) {
if (core->running())
return true;
}
return false;
}
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
bool done = true;
Word exitcode_ = 0;
for (auto& core : cores_) {
Word ec;
if (core->check_exit(&ec, riscv_test)) {
exitcode_ |= ec;
} else {
done = false;
}
}
*exitcode = exitcode_;
return done;
}
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
auto& barrier = barriers_.at(bar_id);
uint32_t local_core_id = core_id % cores_.size();
barrier.set(local_core_id);
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
if (barrier.count() == (size_t)count) {
// resume all suspended cores
for (uint32_t i = 0; i < cores_.size(); ++i) {
if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
cores_.at(i)->resume();
}
}
barrier.reset();
}
}
ProcessorImpl* Cluster::processor() const {
return processor_;
}
Cluster::PerfStats Cluster::perf_stats() const {
Cluster::PerfStats perf;
perf.icache = icaches_->perf_stats();
perf.dcache = dcaches_->perf_stats();
perf.tcache = tcaches_->perf_stats();
perf.ocache = ocaches_->perf_stats();
perf.rcache = rcaches_->perf_stats();
perf.l2cache = l2cache_->perf_stats();
for (auto sharedmem : sharedmems_) {
perf.sharedmem += sharedmem->perf_stats();
}
return perf;
}

92
sim/simx/cluster.h Normal file
View File

@@ -0,0 +1,92 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "constants.h"
namespace vortex {
class ProcessorImpl;
class Cluster : public SimObject<Cluster> {
public:
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
CacheSim::PerfStats tcache;
CacheSim::PerfStats ocache;
CacheSim::PerfStats rcache;
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
this->tcache += rhs.tcache;
this->ocache += rhs.ocache;
this->rcache += rhs.rcache;
return *this;
}
};
SimPort<MemReq> mem_req_port;
SimPort<MemRsp> mem_rsp_port;
Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch,
const DCRS &dcrs);
~Cluster();
void reset();
void tick();
void attach_ram(RAM* ram);
bool running() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const;
Cluster::PerfStats perf_stats() const;
private:
uint32_t cluster_id_;
std::vector<Core::Ptr> cores_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
std::vector<SharedMem::Ptr> sharedmems_;
CacheCluster::Ptr tcaches_;
CacheCluster::Ptr ocaches_;
CacheCluster::Ptr rcaches_;
ProcessorImpl* processor_;
};
} // namespace vortex

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef RAM_PAGE_SIZE
@@ -10,14 +23,4 @@
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
namespace vortex {
enum Constants {
SMEM_BANK_OFFSET = log2ceil(sizeof(uint32_t)) + log2ceil(STACK_SIZE / sizeof(uint32_t)),
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
@@ -11,101 +24,104 @@
#include <simobject.h>
#include "debug.h"
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "decode.h"
#include "mem.h"
#include "warp.h"
#include "pipeline.h"
#include "cache.h"
#include "sharedmem.h"
#include "cache_sim.h"
#include "shared_mem.h"
#include "ibuffer.h"
#include "scoreboard.h"
#include "exeunit.h"
#include "tex_unit.h"
#include "operand.h"
#include "dispatcher.h"
#include "exe_unit.h"
#include "dcrs.h"
namespace vortex {
class Cluster;
class Core : public SimObject<Core> {
public:
struct PerfStats {
uint64_t cycles;
uint64_t instrs;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
uint64_t alu_stalls;
uint64_t lsu_stalls;
uint64_t csr_stalls;
uint64_t fpu_stalls;
uint64_t gpu_stalls;
uint64_t sfu_stalls;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
uint64_t branches;
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
uint64_t tex_reads;
uint64_t tex_latency;
uint64_t ifetch_latency;
uint64_t load_latency;
PerfStats()
: instrs(0)
: cycles(0)
, instrs(0)
, ibuf_stalls(0)
, scrb_stalls(0)
, alu_stalls(0)
, lsu_stalls(0)
, csr_stalls(0)
, fpu_stalls(0)
, gpu_stalls(0)
, sfu_stalls(0)
, ifetches(0)
, loads(0)
, stores(0)
, branches(0)
, mem_reads(0)
, mem_writes(0)
, mem_latency(0)
, tex_reads(0)
, tex_latency(0)
, ifetch_latency(0)
, load_latency(0)
{}
};
SimPort<MemRsp> MemRspPort;
SimPort<MemReq> MemReqPort;
std::vector<SimPort<MemReq>> icache_req_ports;
std::vector<SimPort<MemRsp>> icache_rsp_ports;
std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem);
Core(const SimContext& ctx, const ArchDef &arch, uint32_t id);
~Core();
void attach_ram(RAM* ram);
bool running() const;
void reset();
void tick();
void attach_ram(RAM* ram);
bool running() const;
void resume();
uint32_t id() const {
return id_;
return core_id_;
}
const Decoder& decoder() {
return decoder_;
}
const ArchDef& arch() const {
const Arch& arch() const {
return arch_;
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
uint32_t getIRegValue(int reg) const {
return warps_.at(0)->getIRegValue(reg);
const DCRS& dcrs() const {
return dcrs_;
}
uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);
WarpMask wspawn(uint32_t num_warps, uint32_t nextPC);
void wspawn(uint32_t num_warps, Word nextPC);
WarpMask barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
AddrType get_addr_type(uint64_t addr);
void icache_read(void* data, uint64_t addr, uint32_t size);
@@ -113,19 +129,22 @@ public:
void dcache_write(const void* data, uint64_t addr, uint32_t size);
uint32_t tex_read(uint32_t unit, uint32_t lod, uint32_t u, uint32_t v, std::vector<mem_addr_size_t>* mem_addrs);
void dcache_amo_reserve(uint64_t addr);
bool dcache_amo_check(uint64_t addr);
void trigger_ecall();
void trigger_ebreak();
bool check_exit() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
private:
void schedule();
void fetch();
void decode();
void issue();
void execute();
void commit();
@@ -133,49 +152,55 @@ private:
void cout_flush();
uint32_t id_;
const ArchDef arch_;
uint32_t core_id_;
const Arch& arch_;
const DCRS &dcrs_;
const Decoder decoder_;
MemoryUnit mmu_;
RAM smem_;
std::vector<TexUnit> tex_units_;
std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_;
std::vector<uint32_t> csrs_;
std::vector<WarpMask> barriers_;
std::vector<Byte> fcsrs_;
std::vector<IBuffer> ibuffers_;
Scoreboard scoreboard_;
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_;
Cache::Ptr icache_;
Cache::Ptr dcache_;
SharedMem::Ptr shared_mem_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
SharedMem::Ptr sharedmem_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_;
std::vector<pipeline_trace_t*> committed_traces_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint32_t last_schedule_wid_;
uint64_t issued_instrs_;
uint64_t committed_instrs_;
uint32_t csr_tex_unit_;
bool ecall_;
bool ebreak_;
bool exited_;
uint64_t pending_ifetches_;
std::unordered_map<int, std::stringstream> print_bufs_;
std::vector<std::vector<CSRs>> csrs_;
PerfStats perf_stats_;
uint64_t perf_mem_pending_reads_;
Cluster* cluster_;
uint32_t commit_exe_;
friend class Warp;
friend class LsuUnit;
friend class AluUnit;
friend class CsrUnit;
friend class FpuUnit;
friend class GpuUnit;
friend class SfuUnit;
friend class TexUnit;
friend class RasterAgent;
friend class RopAgent;
friend class TexAgent;
};
} // namespace vortex
} // namespace vortex

28
sim/simx/dcrs.cpp Normal file
View File

@@ -0,0 +1,28 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dcrs.h"
#include <iostream>
using namespace vortex;
void DCRS::write(uint32_t addr, uint32_t value) {
if (addr >= VX_DCR_BASE_STATE_BEGIN
&& addr < VX_DCR_BASE_STATE_END) {
base_dcrs.write(addr, value);
return;
}
std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl;
std::abort();
}

45
sim/simx/dcrs.h Normal file
View File

@@ -0,0 +1,45 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <util.h>
#include <VX_types.h>
#include <array>
namespace vortex {
class BaseDCRS {
public:
uint32_t read(uint32_t addr) const {
uint32_t state = VX_DCR_BASE_STATE(addr);
return states_.at(state);
}
void write(uint32_t addr, uint32_t value) {
uint32_t state = VX_DCR_BASE_STATE(addr);
states_.at(state) = value;
}
private:
std::array<uint32_t, VX_DCR_BASE_STATE_COUNT> states_;
};
class DCRS {
public:
void write(uint32_t addr, uint32_t value);
BaseDCRS base_dcrs;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef DEBUG_LEVEL

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <string>
#include <stdlib.h>
@@ -9,41 +22,36 @@
#include "debug.h"
#include "types.h"
#include "decode.h"
#include "archdef.h"
#include "arch.h"
#include "instr.h"
using namespace vortex;
struct InstTableEntry_t {
bool controlFlow;
InstType iType;
};
static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable = {
{Opcode::NOP, {false, InstType::N_TYPE}},
{Opcode::R_INST, {false, InstType::R_TYPE}},
{Opcode::L_INST, {false, InstType::I_TYPE}},
{Opcode::I_INST, {false, InstType::I_TYPE}},
{Opcode::S_INST, {false, InstType::S_TYPE}},
{Opcode::B_INST, {true , InstType::B_TYPE}},
{Opcode::LUI_INST, {false, InstType::U_TYPE}},
{Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
{Opcode::JAL_INST, {true , InstType::J_TYPE}},
{Opcode::JALR_INST, {true , InstType::I_TYPE}},
{Opcode::SYS_INST, {true , InstType::I_TYPE}},
{Opcode::FENCE, {true , InstType::I_TYPE}},
{Opcode::FL, {false, InstType::I_TYPE}},
{Opcode::FS, {false, InstType::S_TYPE}},
{Opcode::FCI, {false, InstType::R_TYPE}},
{Opcode::FMADD, {false, InstType::R4_TYPE}},
{Opcode::FMSUB, {false, InstType::R4_TYPE}},
{Opcode::FMNMADD, {false, InstType::R4_TYPE}},
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::GPU, {false, InstType::R4_TYPE}},
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
{Opcode::I_INST_W, {false, InstType::I_TYPE}},
static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::R_INST, InstType::R_TYPE},
{Opcode::L_INST, InstType::I_TYPE},
{Opcode::I_INST, InstType::I_TYPE},
{Opcode::S_INST, InstType::S_TYPE},
{Opcode::B_INST, InstType::B_TYPE},
{Opcode::LUI_INST, InstType::U_TYPE},
{Opcode::AUIPC_INST, InstType::U_TYPE},
{Opcode::JAL_INST, InstType::J_TYPE},
{Opcode::JALR_INST, InstType::I_TYPE},
{Opcode::SYS_INST, InstType::I_TYPE},
{Opcode::FENCE, InstType::I_TYPE},
{Opcode::AMO, InstType::R_TYPE},
{Opcode::FL, InstType::I_TYPE},
{Opcode::FS, InstType::S_TYPE},
{Opcode::FCI, InstType::R_TYPE},
{Opcode::FMADD, InstType::R4_TYPE},
{Opcode::FMSUB, InstType::R4_TYPE},
{Opcode::FMNMADD, InstType::R4_TYPE},
{Opcode::FMNMSUB, InstType::R4_TYPE},
{Opcode::VSET, InstType::V_TYPE},
{Opcode::EXT1, InstType::R_TYPE},
{Opcode::EXT2, InstType::R4_TYPE},
{Opcode::R_INST_W, InstType::R_TYPE},
{Opcode::I_INST_W, InstType::I_TYPE},
};
enum Constants {
@@ -58,6 +66,8 @@ enum Constants {
width_i_imm = 12,
width_j_imm = 20,
width_v_imm = 11,
width_aq = 1,
width_rl = 1,
shift_opcode= 0,
shift_rd = width_opcode,
@@ -72,15 +82,15 @@ enum Constants {
shift_func6 = shift_func7 + width_vmask,
shift_vset = shift_func7 + width_func6,
mask_opcode = (1<<width_opcode)-1,
mask_reg = (1<<width_reg)-1,
mask_func2 = (1<<width_func2)-1,
mask_func3 = (1<<width_func3)-1,
mask_func6 = (1<<width_func6)-1,
mask_func7 = (1<<width_func7)-1,
mask_i_imm = (1<<width_i_imm)-1,
mask_j_imm = (1<<width_j_imm)-1,
mask_v_imm = (1<<width_v_imm)-1,
mask_opcode = (1 << width_opcode) - 1,
mask_reg = (1 << width_reg) - 1,
mask_func2 = (1 << width_func2) - 1,
mask_func3 = (1 << width_func3) - 1,
mask_func6 = (1 << width_func6) - 1,
mask_func7 = (1 << width_func7) - 1,
mask_i_imm = (1 << width_i_imm) - 1,
mask_j_imm = (1 << width_j_imm) - 1,
mask_v_imm = (1 << width_v_imm) - 1,
};
static const char* op_string(const Instr &instr) {
@@ -92,7 +102,6 @@ static const char* op_string(const Instr &instr) {
auto imm = instr.getImm();
switch (opcode) {
case Opcode::NOP: return "NOP";
case Opcode::LUI_INST: return "LUI";
case Opcode::AUIPC_INST: return "AUIPC";
case Opcode::R_INST:
@@ -116,7 +125,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLT";
case 3: return "SLTU";
case 4: return "XOR";
case 5: return func7 ? "SRA" : "SRL";
case 5: return (func7 & 0x20) ? "SRA" : "SRL";
case 6: return "OR";
case 7: return "AND";
default:
@@ -130,7 +139,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLTI";
case 3: return "SLTIU";
case 4: return "XORI";
case 5: return func7 ? "SRAI" : "SRLI";
case 5: return (func7 & 0x20) ? "SRAI" : "SRLI";
case 6: return "ORI";
case 7: return "ANDI";
default:
@@ -151,8 +160,8 @@ static const char* op_string(const Instr &instr) {
case Opcode::JALR_INST: return "JALR";
case Opcode::L_INST:
switch (func3) {
case 0: return "LBI";
case 1: return "LHI";
case 0: return "LB";
case 1: return "LH";
case 2: return "LW";
case 3: return "LD";
case 4: return "LBU";
@@ -192,11 +201,11 @@ static const char* op_string(const Instr &instr) {
}
case Opcode::I_INST_W:
switch (func3) {
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
}
case Opcode::SYS_INST:
switch (func3) {
@@ -222,20 +231,59 @@ static const char* op_string(const Instr &instr) {
case Opcode::FENCE: return "FENCE";
case Opcode::FL:
switch (func3) {
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
}
case Opcode::FS:
switch (func3) {
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
default:
std::abort();
}
case Opcode::AMO: {
auto amo_type = func7 >> 2;
switch (func3) {
case 0x2:
switch (amo_type) {
case 0x00: return "AMOADD.W";
case 0x01: return "AMOSWAP.W";
case 0x02: return "LR.W";
case 0x03: return "SC.W";
case 0x04: return "AMOXOR.W";
case 0x08: return "AMOOR.W";
case 0x0c: return "AMOAND.W";
case 0x10: return "AMOMIN.W";
case 0x14: return "AMOMAX.W";
case 0x18: return "AMOMINU.W";
case 0x1c: return "AMOMAXU.W";
default:
std::abort();
}
case 0x3:
switch (amo_type) {
case 0x00: return "AMOADD.D";
case 0x01: return "AMOSWAP.D";
case 0x02: return "LR.D";
case 0x03: return "SC.D";
case 0x04: return "AMOXOR.D";
case 0x08: return "AMOOR.D";
case 0x0c: return "AMOAND.D";
case 0x10: return "AMOMIN.D";
case 0x14: return "AMOMAX.D";
case 0x18: return "AMOMINU.D";
case 0x1c: return "AMOMAXU.D";
default:
std::abort();
}
default:
std::abort();
}
}
case Opcode::FCI:
switch (func7) {
case 0x00: return "FADD.S";
@@ -332,9 +380,9 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.W";
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.S";
case 0x71: return func3 ? "FCLASS.D" : "FMV.X.D";
case 0x78: return "FMV.W.X";
case 0x78: return "FMV.S.X";
case 0x79: return "FMV.D.X";
default:
std::abort();
@@ -344,23 +392,36 @@ static const char* op_string(const Instr &instr) {
case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
case Opcode::VSET: return "VSET";
case Opcode::GPGPU:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PREFETCH";
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PRED";
default:
std::abort();
}
case 1:
switch (func3) {
case 0: return "RASTER";
default:
std::abort();
}
default:
std::abort();
}
case Opcode::GPU:
case Opcode::EXT2:
switch (func3) {
case 0: return "TEX";
case 0:
return "TEX";
case 1: {
switch (func2) {
case 0: return "CMOV";
case 1: return "ROP";
default:
std::abort();
}
@@ -375,43 +436,36 @@ static const char* op_string(const Instr &instr) {
namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
auto opcode = instr.getOpcode();
auto func2 = instr.getFunc2();
auto opcode = instr.getOpcode();
auto func3 = instr.getFunc3();
os << op_string(instr) << ": ";
if (opcode == S_INST
|| opcode == FS) {
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
} else
if (opcode == L_INST
|| opcode == FL) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
} else {
if (instr.getRDType() != RegType::None) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
}
uint32_t i = 0;
for (; i < instr.getNRSrc(); ++i) {
if (i) os << ", ";
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (i) os << ", ";
os << "imm=0x" << std::hex << instr.getImm();
}
if (opcode == GPU && func3 == 0) {
os << ", unit=" << std::dec << func2;
}
os << op_string(instr);
int sep = 0;
if (instr.getRDType() != RegType::None) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRDType() << std::dec << instr.getRDest();
}
for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {
if (instr.getRSType(i) == RegType::None)
continue;
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getImm();
}
if (opcode == Opcode::SYS_INST && func3 >= 5) {
// CSRs with immediate values
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getRSrc(0);
}
return os;
}
}
Decoder::Decoder(const ArchDef&) {}
Decoder::Decoder(const Arch&) {}
std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
auto instr = std::make_shared<Instr>();
@@ -434,7 +488,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
return nullptr;
}
auto iType = op_it->second.iType;
auto iType = op_it->second;
if (op == Opcode::FL || op == Opcode::FS) {
if (func3 != 0x2 && func3 != 0x3) {
iType = InstType::V_TYPE;
@@ -442,57 +496,97 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
switch (iType) {
case InstType::N_TYPE:
break;
case InstType::R_TYPE:
if (op == Opcode::FCI) {
switch (func7) {
switch (op) {
case Opcode::FCI:
switch (func7) {
case 0x2c: // FSQRT.S
case 0x2d: // FSQRT.D
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x50: // FLE.S, FLT.S, FEQ.S
case 0x51: // FLE.D, FLT.D, FEQ.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
case 0x60: // FCVT.W.D, FCVT.WU.D, FCVT.L.D, FCVT.LU.D
case 0x61: // FCVT.WU.S, FCVT.W.S, FCVT.L.S, FCVT.LU.S
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x68: // FCVT.S.W, FCVT.S.WU, FCVT.S.L, FCVT.S.LU
case 0x69: // FCVT.D.W, FCVT.D.WU, FCVT.D.L, FCVT.D.LU
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x70: // FCLASS.S, FMV.X.W
case 0x70: // FCLASS.S, FMV.X.S
case 0x71: // FCLASS.D, FMV.X.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x78: // FMV.W.X
case 0x78: // FMV.S.X
case 0x79: // FMV.D.X
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
}
} else {
break;
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: // TMC
case 3: // JOIN
instr->addSrcReg(rs1, RegType::Integer);
break;
case 1: // WSPAWN
case 4: // BAR
case 5: // PRED
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
case 2: // SPLIT
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
std::abort();
}
break;
case 1:
switch (func3) {
case 0: // RASTER
instr->setDestReg(rd, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
break;
default:
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
}
instr->setFunc3(func3);
instr->setFunc7(func7);
break;
case InstType::I_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FL) {
instr->setDestReg(rd, RegType::Float);
} else {
@@ -503,15 +597,23 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
switch (op) {
case Opcode::SYS_INST:
if (func3 != 0) {
// RV32I: CSR*
instr->setDestReg(rd, RegType::Integer);
}
// RV32I: CSR
if (func3 >= 5) {
// rs1 holds zimm
instr->setSrcReg(0, rs1, RegType::None);
}
} else {
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
}
// uint12
instr->setImm(code >> shift_rs2);
break;
case Opcode::FENCE:
// uint12
instr->setImm(code >> shift_rs2);
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
break;
case Opcode::I_INST:
case Opcode::I_INST_W:
@@ -538,11 +640,11 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
} break;
case InstType::S_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FS) {
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
} else {
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
}
instr->setFunc3(func3);
auto imm = (func7 << width_reg) | rd;
@@ -550,8 +652,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
} break;
case InstType::B_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->setFunc3(func3);
auto bit_11 = rd & 0x1;
auto bits_4_1 = rd >> 1;
@@ -581,8 +683,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case InstType::V_TYPE:
switch (op) {
case Opcode::VSET: {
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setFunc3(func3);
if (func3 == 7) {
instr->setImm(!(code >> shift_vset));
@@ -593,20 +695,20 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
instr->setVediv((immed >> 4) & 0x3);
instr->setVsew((immed >> 2) & 0x3);
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
}
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setFunc6(func6);
}
} break;
case Opcode::FL:
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -614,9 +716,9 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case Opcode::FS:
instr->setVs3(rd);
instr->setSrcVReg(rs1);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -627,16 +729,28 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
break;
case R4_TYPE:
if (op == Opcode::GPU) {
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->setSrcReg(rs3, RegType::Integer);
if (op == Opcode::EXT2) {
switch (func3) {
case 1:
switch (func2) {
case 0: // CMOV
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs3, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
} else {
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->setSrcReg(rs3, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs3, RegType::Float);
}
instr->setFunc2(func2);
instr->setFunc3(func3);

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
@@ -5,12 +18,12 @@
namespace vortex {
class ArchDef;
class Arch;
class Instr;
class Decoder {
public:
Decoder(const ArchDef &);
Decoder(const Arch &);
std::shared_ptr<Instr> decode(uint32_t code) const;
};

141
sim/simx/dispatcher.h Normal file
View File

@@ -0,0 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
class Dispatcher : public SimObject<Dispatcher> {
public:
std::vector<SimPort<pipeline_trace_t*>> Outputs;
Dispatcher(const SimContext& ctx, const Arch& arch, uint32_t buf_size, uint32_t block_size, uint32_t num_lanes)
: SimObject<Dispatcher>(ctx, "Dispatcher")
, Outputs(ISSUE_WIDTH, this)
, Inputs_(ISSUE_WIDTH, this)
, arch_(arch)
, queues_(ISSUE_WIDTH, std::queue<pipeline_trace_t*>())
, buf_size_(buf_size)
, block_size_(block_size)
, num_lanes_(num_lanes)
, batch_count_(ISSUE_WIDTH / block_size)
, pid_count_(arch.num_threads() / num_lanes)
, batch_idx_(0)
, start_p_(block_size, 0)
{}
virtual ~Dispatcher() {}
virtual void reset() {
batch_idx_ = 0;
for (uint32_t b = 0; b < block_size_; ++b) {
start_p_.at(b) = 0;
}
}
virtual void tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& queue = queues_.at(i);
if (queue.empty())
continue;
auto trace = queue.front();
Inputs_.at(i).send(trace, 1);
queue.pop();
}
uint32_t block_sent = 0;
for (uint32_t b = 0; b < block_size_; ++b) {
uint32_t i = batch_idx_ * block_size_ + b;
auto& input = Inputs_.at(i);
if (input.empty()) {
++block_sent;
continue;
}
auto& output = Outputs.at(i);
auto trace = input.front();
if (pid_count_ != 1) {
auto start_p = start_p_.at(b);
if (start_p == -1) {
++block_sent;
continue;
}
int start(-1), end(-1);
for (uint32_t j = start_p * num_lanes_, n = arch_.num_threads(); j < n; ++j) {
if (!trace->tmask.test(j))
continue;
if (start == -1)
start = j;
end = j;
}
start /= num_lanes_;
end /= num_lanes_;
auto new_trace = new pipeline_trace_t(*trace);
new_trace->tmask.reset();
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
new_trace->tmask[j] = trace->tmask[j];
}
new_trace->pid = start;
new_trace->sop = (start_p == 0);
if (start == end) {
new_trace->eop = 1;
start_p_.at(b) = -1;
input.pop();
++block_sent;
delete trace;
} else {
new_trace->eop = 0;
start_p_.at(b) = start + 1;
}
output.send(new_trace, 1);
DT(3, "pipeline-dispatch: " << *new_trace);
} else {
trace->pid = 0;
input.pop();
output.send(trace, 1);
DT(3, "pipeline-dispatch: " << *trace);
++block_sent;
}
}
if (block_sent == block_size_) {
batch_idx_ = (batch_idx_ + 1) % batch_count_;
for (uint32_t b = 0; b < block_size_; ++b) {
start_p_.at(b) = 0;
}
}
};
bool push(uint32_t issue_index, pipeline_trace_t* trace) {
auto& queue = queues_.at(issue_index);
if (queue.size() >= buf_size_)
return false;
queue.push(trace);
return true;
}
private:
std::vector<SimPort<pipeline_trace_t*>> Inputs_;
const Arch& arch_;
std::vector<std::queue<pipeline_trace_t*>> queues_;
uint32_t buf_size_;
uint32_t block_size_;
uint32_t num_lanes_;
uint32_t batch_count_;
uint32_t pid_count_;
uint32_t batch_idx_;
std::vector<int> start_p_;
};
}

341
sim/simx/exe_unit.cpp Normal file
View File

@@ -0,0 +1,341 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exe_unit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
#include "cache_sim.h"
using namespace vortex;
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
void AluUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::IMUL:
output.send(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
output.send(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
}
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.send(trace, 2);
break;
case FpuType::FMA:
output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
auto time = input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, pending_rd_reqs_(LSUQ_SIZE)
, num_lanes_(NUM_LSU_LANES)
, pending_loads_(0)
, fence_lock_(false)
, input_idx_(0)
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
pending_loads_ = 0;
fence_lock_ = false;
}
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type
<< ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
--pending_loads_;
}
// handle shared memory response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
--pending_loads_;
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
int iw = fence_state_->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
auto t0 = trace->pid * num_lanes_;
if (trace->lsu_type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break;
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
}
break;
} else {
trace->log_once(false);
}
bool is_write = (trace->lsu_type == LsuType::STORE);
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(t0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
uint32_t addr_count;
if (is_dup) {
addr_count = 1;
} else {
addr_count = trace->tmask.count();
}
auto tag = pending_rd_reqs_.allocate({trace, addr_count});
for (uint32_t t = 0; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto& dcache_req_port = core_->dcache_req_ports.at(t);
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = core_->get_addr_type(mem_addr.addr);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 2);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
++pending_loads_;
++core_->perf_stats_.loads;
if (is_dup)
break;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
output.send(trace, 1);
++core_->perf_stats_.stores;
}
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break; // single block
}
++input_idx_;
}
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "SFU")
{}
void SfuUnit::tick() {
// handle pending responses
for (auto pending_rsp : pending_rsps_) {
if (pending_rsp->empty())
continue;
auto trace = pending_rsp->front();
if (trace->cid != core_->id())
continue;
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rsp->pop();
}
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.send(trace, 1);
break;
case SfuType::BAR: {
output.send(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.send(trace, 3);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
auto stalls = (SimPlatform::instance().cycles() - time);
core_->perf_stats_.sfu_stalls += stalls;
break; // single block
}
++input_idx_;
}

View File

@@ -1,8 +1,21 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "pipeline.h"
#include "cache.h"
#include "cache_sim.h"
namespace vortex {
@@ -10,13 +23,13 @@ class Core;
class ExeUnit : public SimObject<ExeUnit> {
public:
SimPort<pipeline_trace_t*> Input;
SimPort<pipeline_trace_t*> Output;
std::vector<SimPort<pipeline_trace_t*>> Inputs;
std::vector<SimPort<pipeline_trace_t*>> Outputs;
ExeUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<ExeUnit>(ctx, name)
, Input(this)
, Output(this)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
@@ -32,28 +45,25 @@ protected:
///////////////////////////////////////////////////////////////////////////////
class NopUnit : public ExeUnit {
public:
NopUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
pipeline_trace_t* fence_state_;
bool fence_lock_;
public:
LsuUnit(const SimContext& ctx, Core*);
void reset();
void tick();
private:
struct pending_req_t {
pipeline_trace_t* trace;
uint32_t count;
};
HashTable<pending_req_t> pending_rd_reqs_;
uint32_t num_lanes_;
pipeline_trace_t* fence_state_;
uint64_t pending_loads_;
bool fence_lock_;
uint32_t input_idx_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -67,15 +77,6 @@ public:
///////////////////////////////////////////////////////////////////////////////
class CsrUnit : public ExeUnit {
public:
CsrUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public ExeUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
@@ -85,19 +86,15 @@ public:
///////////////////////////////////////////////////////////////////////////////
class GpuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(pipeline_trace_t* trace);
class SfuUnit : public ExeUnit {
public:
GpuUnit(const SimContext& ctx, Core*);
void reset();
SfuUnit(const SimContext& ctx, Core*);
void tick();
private:
std::vector<SimPort<pipeline_trace_t*>*> pending_rsps_;
uint32_t input_idx_;
};
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,383 +0,0 @@
#include "exeunit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
using namespace vortex;
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
void NopUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
Output.send(trace, 1);
Input.pop();
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, num_threads_(core->arch().num_threads())
, pending_rd_reqs_(LSUQ_SIZE)
, fence_lock_(false)
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
fence_lock_ = false;
}
void LsuUnit::tick() {
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
// handle shared memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
Output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
if (Input.empty())
return;
auto trace = Input.front();
if (trace->lsu.type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
return;
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
if (!trace->suspend()) {
DT(3, "*** lsu-queue-stall: " << *trace);
}
return;
} else {
trace->resume();
}
bool is_write = (trace->lsu.type == LsuType::STORE);
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
uint32_t valid_addrs = 0;
if (is_dup) {
valid_addrs = 1;
} else {
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
}
auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
auto mem_addr = trace->mem_addrs.at(t).at(0);
auto type = get_addr_type(mem_addr.addr, mem_addr.size);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.non_cacheable = (type == AddrType::IO);
mem_req.tag = tag;
mem_req.core_id = trace->cid;
mem_req.uuid = trace->uuid;
if (type == AddrType::Shared) {
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
} else {
dcache_req_port.send(mem_req, 2);
DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
}
if (is_dup)
break;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
Output.send(trace, 1);
}
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
void AluUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
switch (trace->alu.type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::CMOV:
Output.send(trace, 1);
break;
case AluType::IMUL:
Output.send(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
Output.send(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
void CsrUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
Output.send(trace, 1);
auto time = Input.pop();
core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
DT(3, "pipeline-execute: op=CSR, " << *trace);
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
switch (trace->fpu.type) {
case FpuType::FNCP:
Output.send(trace, 2);
break;
case FpuType::FMA:
Output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
Output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
Output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
Output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "GPU")
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
void GpuUnit::reset() {
pending_tex_reqs_.clear();
}
void GpuUnit::tick() {
#ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_tex_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
#endif
// check input queue
if (Input.empty())
return;
auto trace = Input.front();
bool issued = false;
switch (trace->gpu.type) {
case GpuType::TMC:
Output.send(trace, 1);
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
issued = true;
break;
case GpuType::WSPAWN:
Output.send(trace, 1);
core_->active_warps_ = trace->gpu.active_warps;
issued = true;
break;
case GpuType::SPLIT:
case GpuType::JOIN:
Output.send(trace, 1);
issued = true;
break;
case GpuType::BAR:
Output.send(trace, 1);
if (trace->gpu.active_warps != 0)
core_->active_warps_ |= trace->gpu.active_warps;
else
core_->active_warps_.reset(trace->wid);
issued = true;
break;
case GpuType::TEX:
if (this->processTexRequest(trace))
issued = true;
break;
default:
std::abort();
}
if (issued) {
DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {
// check pending queue capacity
if (pending_tex_reqs_.full()) {
if (!trace->suspend()) {
DT(3, "*** tex-queue-stall: " << *trace);
}
return false;
} else {
trace->resume();
}
// send memory request
uint32_t valid_addrs = 0;
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
for (auto& mem_addr : trace->mem_addrs.at(t)) {
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = (trace->lsu.type == LsuType::STORE);
mem_req.tag = tag;
mem_req.core_id = core_->id();
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 3);
DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", tid=" << t << ", "<< trace);
++ core_->perf_stats_.tex_reads;
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
}
}
return true;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,10 +19,6 @@
namespace vortex {
class IBuffer {
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
public:
IBuffer(uint32_t size)
: capacity_(size)
@@ -39,6 +48,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(entries_, empty );
}
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "types.h"
@@ -7,7 +20,7 @@ namespace vortex {
class Warp;
enum Opcode {
NOP = 0,
NONE = 0,
R_INST = 0x33,
L_INST = 0x3,
I_INST = 0x13,
@@ -19,6 +32,7 @@ enum Opcode {
JALR_INST = 0x67,
SYS_INST = 0x73,
FENCE = 0x0f,
AMO = 0x2f,
// F Extension
FL = 0x7,
FS = 0x27,
@@ -26,19 +40,20 @@ enum Opcode {
FMADD = 0x43,
FMSUB = 0x47,
FMNMSUB = 0x4b,
FMNMADD = 0x4f,
// Vector Extension
VSET = 0x57,
// GPGPU Extension
GPGPU = 0x6b,
GPU = 0x5b,
// RV64 Standard Extensions
FMNMADD = 0x4f,
// RV64 Standard Extension
R_INST_W = 0x3b,
I_INST_W = 0x1b,
// Vector Extension
VSET = 0x57,
// Custom Extensions
EXT1 = 0x0b,
EXT2 = 0x2b,
EXT3 = 0x5b,
EXT4 = 0x7b
};
enum InstType {
N_TYPE,
enum InstType {
R_TYPE,
I_TYPE,
S_TYPE,
@@ -52,25 +67,45 @@ enum InstType {
class Instr {
public:
Instr()
: opcode_(Opcode::NOP)
: opcode_(Opcode::NONE)
, num_rsrcs_(0)
, has_imm_(false)
, rdest_type_(RegType::None)
, imm_(0)
, rdest_(0)
, func2_(0)
, func3_(0)
, func6_(0)
, func7_(0) {
, func7_(0)
, vmask_(0)
, vlsWidth_(0)
, vMop_(0)
, vNf_(0)
, vs3_(0)
, vlmul_(0)
, vsew_(0)
, vediv_(0) {
for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
rsrc_type_[i] = RegType::None;
rsrc_[i] = 0;
}
}
void setOpcode(Opcode opcode) { opcode_ = opcode; }
void setDestReg(uint32_t destReg, RegType type) { rdest_type_ = type; rdest_ = destReg; }
void setSrcReg(uint32_t srcReg, RegType type) { rsrc_type_[num_rsrcs_] = type; rsrc_[num_rsrcs_++] = srcReg; }
void setDestVReg(uint32_t destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
void setSrcVReg(uint32_t srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
void setDestReg(uint32_t destReg, RegType type) {
rdest_type_ = type;
rdest_ = destReg;
}
void addSrcReg(uint32_t srcReg, RegType type) {
rsrc_type_[num_rsrcs_] = type;
rsrc_[num_rsrcs_] = srcReg;
++num_rsrcs_;
}
void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) {
rsrc_type_[index] = type;
rsrc_[index] = srcReg;
num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1);
}
void setFunc2(uint32_t func2) { func2_ = func2; }
void setFunc3(uint32_t func3) { func3_ = func3; }
void setFunc7(uint32_t func7) { func7_ = func7; }
@@ -85,17 +120,17 @@ public:
void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; }
void setFunc6(uint32_t func6) { func6_ = func6; }
Opcode getOpcode() const { return opcode_; }
Opcode getOpcode() const { return opcode_; }
uint32_t getFunc2() const { return func2_; }
uint32_t getFunc3() const { return func3_; }
uint32_t getFunc6() const { return func6_; }
uint32_t getFunc7() const { return func7_; }
uint32_t getNRSrc() const { return num_rsrcs_; }
uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
uint32_t getRDest() const { return rdest_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
uint32_t getImm() const { return imm_; }
uint32_t getVlsWidth() const { return vlsWidth_; }
uint32_t getVmop() const { return vMop_; }

View File

@@ -1,98 +1,132 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <fstream>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include "processor.h"
#include "archdef.h"
#include "mem.h"
#include "constants.h"
#include <util.h>
#include "args.h"
#include "core.h"
using namespace vortex;
static void show_usage() {
std::cout << "Usage: [-c <cores>] [-w <warps>] [-t <threads>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
}
uint32_t num_threads = NUM_THREADS;
uint32_t num_warps = NUM_WARPS;
uint32_t num_cores = NUM_CORES;
uint32_t num_clusters = NUM_CLUSTERS;
bool showStats = false;;
bool riscv_test = false;
const char* program = nullptr;
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
switch (c) {
case 't':
num_threads = atoi(optarg);
break;
case 'w':
num_warps = atoi(optarg);
break;
case 'c':
num_cores = atoi(optarg);
break;
case 'g':
num_clusters = atoi(optarg);
break;
case 'r':
riscv_test = true;
break;
case 's':
showStats = true;
break;
case 'h':
case '?':
show_usage();
exit(0);
break;
default:
show_usage();
exit(-1);
}
}
if (optind < argc) {
program = argv[optind];
std::cout << "Running " << program << "..." << std::endl;
} else {
show_usage();
exit(-1);
}
}
int main(int argc, char **argv) {
int exitcode = 0;
std::string imgFileName;
int num_cores(NUM_CORES * NUM_CLUSTERS);
int num_warps(NUM_WARPS);
int num_threads(NUM_THREADS);
bool showHelp(false);
bool showStats(false);
bool riscv_test(false);
parse_args(argc, argv);
// parse the command line arguments
CommandLineArgFlag fh("-h", "--help", "show command line options", showHelp);
CommandLineArgSetter<std::string> fi("-i", "--image", "program binary", imgFileName);
CommandLineArgSetter<int> fc("-c", "--cores", "number of cores", num_cores);
CommandLineArgSetter<int> fw("-w", "--warps", "number of warps", num_warps);
CommandLineArgSetter<int> ft("-t", "--threads", "number of threads", num_threads);
CommandLineArgFlag fr("-r", "--riscv", "enable riscv tests", riscv_test);
CommandLineArgFlag fs("-s", "--stats", "show stats", showStats);
CommandLineArg::readArgs(argc - 1, argv + 1);
if (showHelp || imgFileName.empty()) {
std::cout << "Vortex emulator command line arguments:\n"
" -i, --image <filename> Program RAM image\n"
" -c, --cores <num> Number of cores\n"
" -w, --warps <num> Number of warps\n"
" -t, --threads <num> Number of threads\n"
" -r, --riscv riscv test\n"
" -s, --stats Print stats on exit.\n";
return 0;
}
std::cout << "Running " << imgFileName << "..." << std::endl;
{
// create processor configuation
ArchDef arch(num_cores, num_warps, num_threads);
Arch arch(num_threads, num_warps, num_cores, num_clusters);
// create memory module
RAM ram(RAM_PAGE_SIZE);
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
std::string program_ext(fileExtension(imgFileName.c_str()));
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
ram.loadBinImage(program, startup_addr);
} else if (program_ext == "hex") {
ram.loadHexImage(imgFileName.c_str());
ram.loadHexImage(program);
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
}
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// run simulation
exitcode = processor.run();
exitcode = processor.run(riscv_test);
}
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed." << std::endl;
exitcode = 0;
} else {
std::cout << "Failed." << std::endl;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}

View File

@@ -1,4 +1,17 @@
#include "memsim.h"
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mem_sim.h"
#include <vector>
#include <queue>
#include <stdlib.h>
@@ -83,7 +96,7 @@ public:
mem_req.addr,
mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
mem_req.core_id
mem_req.cid
);
if (!dram_->send(dram_req))

View File

@@ -1,8 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
#include <vector>
namespace vortex {

61
sim/simx/operand.h Normal file
View File

@@ -0,0 +1,61 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
class Operand : public SimObject<Operand> {
public:
SimPort<pipeline_trace_t*> Input;
SimPort<pipeline_trace_t*> Output;
Operand(const SimContext& ctx)
: SimObject<Operand>(ctx, "Operand")
, Input(this)
, Output(this)
{}
virtual ~Operand() {}
virtual void reset() {}
virtual void tick() {
if (Input.empty())
return;
auto trace = Input.front();
int delay = 1;
for (int i = 0; i < MAX_NUM_REGS; ++i) {
bool is_iregs = trace->used_iregs.test(i);
bool is_fregs = trace->used_fregs.test(i);
bool is_vregs = trace->used_vregs.test(i);
if (is_iregs || is_fregs || is_vregs) {
if (is_iregs && i == 0)
continue;
++delay;
}
}
Output.send(trace, delay);
DT(3, "pipeline-operands: " << *trace);
Input.pop();
};
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
@@ -5,14 +18,38 @@
#include <iostream>
#include <util.h>
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "debug.h"
namespace vortex {
class ITraceData {
public:
using Ptr = std::shared_ptr<ITraceData>;
ITraceData() {}
virtual ~ITraceData() {}
};
struct LsuTraceData : public ITraceData {
using Ptr = std::shared_ptr<LsuTraceData>;
std::vector<mem_addr_size_t> mem_addrs;
LsuTraceData(uint32_t num_threads) : mem_addrs(num_threads) {}
};
struct SFUTraceData : public ITraceData {
using Ptr = std::shared_ptr<SFUTraceData>;
struct {
uint32_t id;
uint32_t count;
} bar;
SFUTraceData(uint32_t bar_id, uint32_t bar_count) : bar{bar_id, bar_count} {}
};
struct pipeline_trace_t {
public:
//--
uint64_t uuid;
const uint64_t uuid;
const Arch& arch;
//--
uint32_t cid;
@@ -21,12 +58,9 @@ struct pipeline_trace_t {
Word PC;
//--
bool fetch_stall;
//--
bool wb;
RegType rdest_type;
uint32_t rdest;
RegType rdest_type;
bool wb;
//--
RegMask used_iregs;
@@ -36,73 +70,104 @@ struct pipeline_trace_t {
//-
ExeType exe_type;
//--
std::vector<std::vector<mem_addr_size_t>> mem_addrs;
//--
union {
struct {
LsuType type;
} lsu;
struct {
AluType type;
} alu;
struct {
FpuType type;
} fpu;
struct {
GpuType type;
WarpMask active_warps;
} gpu;
uint32_t unit_type;
LsuType lsu_type;
AluType alu_type;
FpuType fpu_type;
SfuType sfu_type;
};
bool stalled;
ITraceData::Ptr data;
pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
uuid = uuid_;
cid = 0;
wid = 0;
tmask.reset();
PC = 0;
fetch_stall = false;
wb = false;
rdest = 0;
rdest_type = RegType::None;
used_iregs.reset();
used_fregs.reset();
used_vregs.reset();
exe_type = ExeType::NOP;
mem_addrs.resize(arch.num_threads());
stalled = false;
}
int pid;
bool sop;
bool eop;
bool suspend() {
bool old = stalled;
stalled = true;
bool fetch_stall;
pipeline_trace_t(uint64_t uuid, const Arch& arch)
: uuid(uuid)
, arch(arch)
, cid(0)
, wid(0)
, tmask(0)
, PC(0)
, rdest(0)
, rdest_type(RegType::None)
, wb(false)
, used_iregs(0)
, used_fregs(0)
, used_vregs(0)
, exe_type(ExeType::ALU)
, unit_type(0)
, data(nullptr)
, pid(-1)
, sop(true)
, eop(true)
, fetch_stall(false)
, log_once_(false)
{}
pipeline_trace_t(const pipeline_trace_t& rhs)
: uuid(rhs.uuid)
, arch(rhs.arch)
, cid(rhs.cid)
, wid(rhs.wid)
, tmask(rhs.tmask)
, PC(rhs.PC)
, rdest(rhs.rdest)
, rdest_type(rhs.rdest_type)
, wb(rhs.wb)
, used_iregs(rhs.used_iregs)
, used_fregs(rhs.used_fregs)
, used_vregs(rhs.used_vregs)
, exe_type(rhs.exe_type)
, unit_type(rhs.unit_type)
, data(rhs.data)
, pid(rhs.pid)
, sop(rhs.sop)
, eop(rhs.eop)
, fetch_stall(rhs.fetch_stall)
, log_once_(false)
{}
~pipeline_trace_t() {}
bool log_once(bool enable) {
bool old = log_once_;
log_once_ = enable;
return old;
}
void resume() {
stalled = false;
}
private:
bool log_once_;
};
inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
os << "cid=" << state.cid;
os << ", wid=" << state.wid;
os << ", tmask=";
for (uint32_t i = 0, n = state.arch.num_threads(); i < n; ++i) {
os << state.tmask.test(i);
}
os << ", PC=0x" << std::hex << state.PC;
os << ", wb=" << state.wb;
if (state.wb) {
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
}
os << ", ex=" << state.exe_type;
if (state.pid != -1) {
os << ", pid=" << state.pid;
os << ", sop=" << state.sop;
os << ", eop=" << state.eop;
}
os << " (#" << std::dec << state.uuid << ")";
return os;
}
class PipelineLatch {
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
public:
PipelineLatch(const char* name = nullptr)
: name_(name)
@@ -132,6 +197,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(queue_, empty );
}
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
};
}

View File

@@ -1,168 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "processor.h"
#include "core.h"
#include "constants.h"
#include "processor_impl.h"
using namespace vortex;
class Processor::Impl {
private:
std::vector<Core::Ptr> cores_;
std::vector<Cache::Ptr> l2caches_;
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
Cache::Ptr l3cache_;
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
ProcessorImpl::ProcessorImpl(const Arch& arch)
: arch_(arch)
, clusters_(arch.num_clusters())
{
SimPlatform::instance().initialize();
public:
Impl(const ArchDef& arch)
: cores_(arch.num_cores())
, l2caches_(NUM_CLUSTERS)
, l2_mem_switches_(NUM_CLUSTERS)
{
SimPlatform::instance().initialize();
// create memory simulator
memsim_ = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
uint32_t(arch.num_cores()) * arch.num_clusters()
});
uint32_t num_cores = arch.num_cores();
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
cores_.at(i) = Core::Create(arch, i);
// create L3 cache
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
!L3_ENABLED,
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L3_NUM_WAYS), // W
0, // A
XLEN, // address bits
L3_NUM_BANKS, // number of banks
1, // number of ports
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
// connect L3 memory ports
l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
memsim_->MemRspPort.bind(&l3cache_->MemRspPort);
// setup memory simulator
auto memsim = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
arch.num_cores()
});
std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", Cache::Config{
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
l3cache_->MemReqPort.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
}
} else if (NUM_CLUSTERS > 1) {
l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
}
}
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster);
std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
if (L2_ENABLE) {
auto& l2cache = l2caches_.at(i);
l2cache = Cache::Create("l2cache", Cache::Config{
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L2_NUM_BANKS, // number of banks
L2_NUM_PORTS, // number of ports
(uint8_t)cores_per_cluster, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
l2cache->MemReqPort.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
}
} else {
auto& l2_mem_switch = l2_mem_switches_.at(i);
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
}
}
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
auto& core = cores_.at((i * cores_per_cluster) + j);
core->MemReqPort.bind(cluster_mem_req_ports.at(j));
cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
}
}
// create clusters
for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
// connect L3 core ports
clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i));
l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
}
~Impl() {
SimPlatform::instance().finalize();
}
// set up memory perf recording
memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
__unused (cycle);
perf_mem_reads_ += !req.write;
perf_mem_writes_ += req.write;
perf_mem_pending_reads_ += !req.write;
});
memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
__unused (cycle);
--perf_mem_pending_reads_;
});
void attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
}
}
this->reset();
}
int run() {
SimPlatform::instance().reset();
bool running;
int exitcode = 0;
do {
SimPlatform::instance().tick();
running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
}
if (core->check_exit()) {
exitcode = core->getIRegValue(3);
running = false;
break;
ProcessorImpl::~ProcessorImpl() {
SimPlatform::instance().finalize();
}
void ProcessorImpl::attach_ram(RAM* ram) {
for (auto cluster : clusters_) {
cluster->attach_ram(ram);
}
}
int ProcessorImpl::run(bool riscv_test) {
SimPlatform::instance().reset();
this->reset();
bool done;
Word exitcode = 0;
do {
SimPlatform::instance().tick();
done = true;
for (auto cluster : clusters_) {
if (cluster->running()) {
Word ec;
if (cluster->check_exit(&ec, riscv_test)) {
exitcode |= ec;
} else {
done = false;
}
}
} while (running);
}
perf_mem_latency_ += perf_mem_pending_reads_;
} while (!done);
return exitcode;
}
};
return exitcode;
}
void ProcessorImpl::reset() {
perf_mem_reads_ = 0;
perf_mem_writes_ = 0;
perf_mem_latency_ = 0;
perf_mem_pending_reads_ = 0;
}
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
dcrs_.write(addr, value);
}
ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
ProcessorImpl::PerfStats perf;
perf.mem_reads = perf_mem_reads_;
perf.mem_writes = perf_mem_writes_;
perf.mem_latency = perf_mem_latency_;
perf.l3cache = l3cache_->perf_stats();
for (auto cluster : clusters_) {
perf.clusters += cluster->perf_stats();
}
return perf;
}
///////////////////////////////////////////////////////////////////////////////
Processor::Processor(const ArchDef& arch)
: impl_(new Impl(arch))
Processor::Processor(const Arch& arch)
: impl_(new ProcessorImpl(arch))
{}
Processor::~Processor() {
@@ -173,6 +146,10 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
int Processor::run() {
return impl_->run();
int Processor::run(bool riscv_test) {
return impl_->run(riscv_test);
}
void Processor::write_dcr(uint32_t addr, uint32_t value) {
return impl_->write_dcr(addr, value);
}

View File

@@ -1,22 +1,39 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class ArchDef;
class Arch;
class RAM;
class ProcessorImpl;
class Processor {
public:
Processor(const ArchDef& arch);
Processor(const Arch& arch);
~Processor();
void attach_ram(RAM* mem);
int run();
int run(bool riscv_test);
void write_dcr(uint32_t addr, uint32_t value);
private:
class Impl;
Impl* impl_;
ProcessorImpl* impl_;
};
}
}

66
sim/simx/processor_impl.h Normal file
View File

@@ -0,0 +1,66 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "mem_sim.h"
#include "cache_sim.h"
#include "constants.h"
#include "dcrs.h"
#include "cluster.h"
namespace vortex {
class ProcessorImpl {
public:
struct PerfStats {
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
CacheSim::PerfStats l3cache;
Cluster::PerfStats clusters;
PerfStats()
: mem_reads(0)
, mem_writes(0)
, mem_latency(0)
{}
};
ProcessorImpl(const Arch& arch);
~ProcessorImpl();
void attach_ram(RAM* mem);
int run(bool riscv_test);
void write_dcr(uint32_t addr, uint32_t value);
ProcessorImpl::PerfStats perf_stats() const;
private:
void reset();
const Arch& arch_;
std::vector<std::shared_ptr<Cluster>> clusters_;
DCRS dcrs_;
MemSim::Ptr memsim_;
CacheSim::Ptr l3cache_;
uint64_t perf_mem_reads_;
uint64_t perf_mem_writes_;
uint64_t perf_mem_latency_;
uint64_t perf_mem_pending_reads_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,20 +19,15 @@
namespace vortex {
class Scoreboard {
private:
public:
struct reg_use_t {
RegType type;
uint32_t reg;
uint64_t owner;
};
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
public:
Scoreboard(const ArchDef &arch)
Scoreboard(const Arch &arch)
: in_use_iregs_(arch.num_warps())
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
@@ -84,8 +92,7 @@ public:
}
void reserve(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).set(state->rdest);
@@ -105,8 +112,7 @@ public:
}
void release(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).reset(state->rdest);
@@ -123,6 +129,13 @@ public:
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
owners_.erase(tag);
}
private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
};
}

138
sim/simx/shared_mem.cpp Normal file
View File

@@ -0,0 +1,138 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "shared_mem.h"
#include "core.h"
#include <bitmanip.h>
#include <vector>
#include "types.h"
using namespace vortex;
class SharedMem::Impl {
protected:
SharedMem* simobject_;
Config config_;
RAM ram_;
uint32_t bank_sel_addr_start_;
uint32_t bank_sel_addr_end_;
PerfStats perf_stats_;
uint64_t to_local_addr(uint64_t addr) {
uint32_t total_lines = config_.capacity / config_.line_size;
uint32_t line_bits = log2ceil(total_lines);
uint32_t offset = bit_getw(addr, 0, line_bits-1);
return offset;
}
public:
Impl(SharedMem* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, ram_(config.capacity, config.capacity)
, bank_sel_addr_start_(0)
, bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1)
{}
virtual ~Impl() {}
void reset() {
perf_stats_ = PerfStats();
}
void read(void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
ram_.read(data, s_addr, size);
}
void write(const void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
ram_.write(data, s_addr, size);
}
void tick() {
std::vector<bool> in_used_banks(config_.num_banks);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = simobject_->Inputs.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
uint32_t bank_id = 0;
if (bank_sel_addr_start_ <= bank_sel_addr_end_) {
bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
}
// bank conflict check
if (in_used_banks.at(bank_id)) {
++perf_stats_.bank_stalls;
continue;
}
in_used_banks.at(bank_id) = true;
if (!core_req.write || config_.write_reponse) {
// send response
MemRsp core_rsp{core_req.tag, core_req.cid};
simobject_->Outputs.at(req_id).send(core_rsp, 1);
}
// update perf counters
perf_stats_.reads += !core_req.write;
perf_stats_.writes += core_req.write;
// remove input
core_req_port.pop();
}
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
};
///////////////////////////////////////////////////////////////////////////////
SharedMem::SharedMem(const SimContext& ctx, const char* name, const Config& config)
: SimObject<SharedMem>(ctx, name)
, Inputs(config.num_reqs, this)
, Outputs(config.num_reqs, this)
, impl_(new Impl(this, config))
{}
SharedMem::~SharedMem() {
delete impl_;
}
void SharedMem::reset() {
impl_->reset();
}
void SharedMem::read(void* data, uint64_t addr, uint32_t size) {
impl_->read(data, addr, size);
}
void SharedMem::write(const void* data, uint64_t addr, uint32_t size) {
impl_->write(data, addr, size);
}
void SharedMem::tick() {
impl_->tick();
}
const SharedMem::PerfStats& SharedMem::perf_stats() const {
return impl_->perf_stats();
}

72
sim/simx/shared_mem.h Normal file
View File

@@ -0,0 +1,72 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
namespace vortex {
class SharedMem : public SimObject<SharedMem> {
public:
struct Config {
uint32_t capacity;
uint32_t line_size;
uint32_t num_reqs;
uint32_t num_banks;
bool write_reponse;
};
struct PerfStats {
uint64_t reads;
uint64_t writes;
uint64_t bank_stalls;
PerfStats()
: reads(0)
, writes(0)
, bank_stalls(0)
{}
PerfStats& operator+=(const PerfStats& rhs) {
this->reads += rhs.reads;
this->writes += rhs.writes;
this->bank_stalls += rhs.bank_stalls;
return *this;
}
};
std::vector<SimPort<MemReq>> Inputs;
std::vector<SimPort<MemRsp>> Outputs;
SharedMem(const SimContext& ctx, const char* name, const Config& config);
virtual ~SharedMem();
void reset();
void read(void* data, uint64_t addr, uint32_t size);
void write(const void* data, uint64_t addr, uint32_t size);
void tick();
const PerfStats& perf_stats() const;
protected:
class Impl;
Impl* impl_;
};
}

View File

@@ -1,96 +0,0 @@
#pragma once
#include <simobject.h>
#include <bitmanip.h>
#include <vector>
#include "types.h"
namespace vortex {
class Core;
class SharedMem : public SimObject<SharedMem> {
public:
struct Config {
uint32_t num_reqs;
uint32_t num_banks;
uint32_t bank_offset;
uint32_t latency;
bool write_reponse;
};
struct PerfStats {
uint64_t reads;
uint64_t writes;
uint64_t bank_stalls;
PerfStats()
: reads(0)
, writes(0)
, bank_stalls(0)
{}
};
std::vector<SimPort<MemReq>> Inputs;
std::vector<SimPort<MemRsp>> Outputs;
SharedMem(const SimContext& ctx, const char* name, const Config& config)
: SimObject<SharedMem>(ctx, name)
, Inputs(config.num_reqs, this)
, Outputs(config.num_reqs, this)
, config_(config)
, bank_sel_addr_start_(config.bank_offset)
, bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
{}
virtual ~SharedMem() {}
void reset() {
perf_stats_ = PerfStats();
}
void tick() {
std::vector<bool> in_used_banks(config_.num_banks);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = this->Inputs.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
uint32_t bank_id = (uint32_t)bit_getw(
core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
// bank conflict check
if (in_used_banks.at(bank_id))
continue;
in_used_banks.at(bank_id) = true;
if (!core_req.write || config_.write_reponse) {
// send response
MemRsp core_rsp{core_req.tag, core_req.core_id};
this->Outputs.at(req_id).send(core_rsp, 1);
}
// update perf counters
perf_stats_.reads += !core_req.write;
perf_stats_.writes += core_req.write;
// remove input
core_req_port.pop();
}
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
protected:
Config config_;
uint32_t bank_sel_addr_start_;
uint32_t bank_sel_addr_end_;
PerfStats perf_stats_;
};
}

View File

@@ -1,100 +0,0 @@
#include "tex_unit.h"
#include "core.h"
#include <texturing.h>
#include <VX_config.h>
using namespace vortex;
using namespace cocogfx;
enum class FilterMode {
Point,
Bilinear,
Trilinear,
};
TexUnit::TexUnit(Core* core) : core_(core) {}
TexUnit::~TexUnit() {}
void TexUnit::clear() {
for (auto& state : states_) {
state = 0;
}
}
uint32_t TexUnit::get_state(uint32_t state) {
return states_.at(state);
}
void TexUnit::set_state(uint32_t state, uint32_t value) {
states_.at(state) = value;
}
uint32_t TexUnit::read(int32_t u,
int32_t v,
int32_t lod,
std::vector<mem_addr_size_t>* mem_addrs) {
//--
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
uint32_t log_width = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
auto format = (TexFormat)states_.at(TEX_STATE_FORMAT);
auto filter = (FilterMode)states_.at(TEX_STATE_FILTER);
auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU);
auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV);
auto stride = Stride(format);
switch (filter) {
case FilterMode::Bilinear: {
// addressing
uint32_t offset00, offset01, offset10, offset11;
uint32_t alpha, beta;
TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
uint32_t addr00 = base_addr + offset00 * stride;
uint32_t addr01 = base_addr + offset01 * stride;
uint32_t addr10 = base_addr + offset10 * stride;
uint32_t addr11 = base_addr + offset11 * stride;
// memory lookup
uint32_t texel00(0), texel01(0), texel10(0), texel11(0);
core_->dcache_read(&texel00, addr00, stride);
core_->dcache_read(&texel01, addr01, stride);
core_->dcache_read(&texel10, addr10, stride);
core_->dcache_read(&texel11, addr11, stride);
mem_addrs->push_back({addr00, stride});
mem_addrs->push_back({addr01, stride});
mem_addrs->push_back({addr10, stride});
mem_addrs->push_back({addr11, stride});
// filtering
auto color = TexFilterLinear(
format, texel00, texel01, texel10, texel11, alpha, beta);
return color;
}
case FilterMode::Point: {
// addressing
uint32_t offset;
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
uint32_t addr = base_addr + offset * stride;
// memory lookup
uint32_t texel(0);
core_->dcache_read(&texel, addr, stride);
mem_addrs->push_back({addr, stride});
// filtering
auto color = TexFilterPoint(format, texel);
return color;
}
default:
std::abort();
return 0;
}
}

View File

@@ -1,28 +0,0 @@
#pragma once
#include "types.h"
namespace vortex {
class Core;
class TexUnit {
public:
TexUnit(Core* core);
~TexUnit();
void clear();
uint32_t get_state(uint32_t state);
void set_state(uint32_t state, uint32_t value);
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
private:
std::array<uint32_t, NUM_TEX_STATES> states_;
Core* core_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
@@ -5,31 +18,42 @@
#include <queue>
#include <unordered_map>
#include <util.h>
#include <stringutil.h>
#include <VX_config.h>
#include <simobject.h>
#include "uuid_gen.h"
#include "debug.h"
namespace vortex {
typedef uint8_t Byte;
#if XLEN == 32
#if (XLEN == 32)
typedef uint32_t Word;
typedef int32_t WordI;
typedef uint64_t DWord;
typedef int64_t DWordI;
#elif XLEN == 64
typedef uint32_t WordF;
#elif (XLEN == 64)
typedef uint64_t Word;
typedef int64_t WordI;
typedef __uint128_t DWord;
typedef __int128_t DWordI;
typedef uint64_t WordF;
#else
#error unsupported XLEN
#endif
typedef uint64_t FWord;
#define MAX_NUM_CORES 1024
#define MAX_NUM_THREADS 32
#define MAX_NUM_WARPS 32
#define MAX_NUM_REGS 32
typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
typedef std::bitset<MAX_NUM_CORES> CoreMask;
typedef std::bitset<MAX_NUM_REGS> RegMask;
typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
typedef std::bitset<MAX_NUM_WARPS> WarpMask;
typedef std::unordered_map<uint32_t, uint32_t> CSRs;
///////////////////////////////////////////////////////////////////////////////
@@ -40,8 +64,8 @@ enum class RegType {
Vector
};
inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
switch (clss) {
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
switch (type) {
case RegType::None: break;
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
///////////////////////////////////////////////////////////////////////////////
enum class ExeType {
NOP,
ALU,
LSU,
CSR,
FPU,
GPU,
SFU,
MAX,
};
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
switch (type) {
case ExeType::NOP: os << "NOP"; break;
case ExeType::ALU: os << "ALU"; break;
case ExeType::LSU: os << "LSU"; break;
case ExeType::CSR: os << "CSR"; break;
case ExeType::FPU: os << "FPU"; break;
case ExeType::GPU: os << "GPU"; break;
case ExeType::SFU: os << "SFU"; break;
case ExeType::MAX: break;
}
return os;
@@ -82,8 +102,7 @@ enum class AluType {
BRANCH,
SYSCALL,
IMUL,
IDIV,
CMOV,
IDIV
};
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::SYSCALL: os << "SYSCALL"; break;
case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break;
case AluType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
enum class LsuType {
LOAD,
STORE,
FENCE,
PREFETCH,
FENCE
};
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
switch (type) {
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
case LsuType::PREFETCH: os << "PREFETCH"; break;
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
}
return os;
}
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
uint32_t size;
};
inline AddrType get_addr_type(Word addr, uint32_t size) {
__unused (size);
if (SM_ENABLE) {
if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
&& addr < SMEM_BASE_ADDR) {
assert((addr + size) <= SMEM_BASE_ADDR);
return AddrType::Shared;
}
}
if (addr >= IO_BASE_ADDR) {
return AddrType::IO;
}
return AddrType::Global;
}
///////////////////////////////////////////////////////////////////////////////
enum class FpuType {
@@ -179,23 +180,37 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
///////////////////////////////////////////////////////////////////////////////
enum class GpuType {
enum class SfuType {
TMC,
WSPAWN,
SPLIT,
JOIN,
BAR,
PRED,
CSRRW,
CSRRS,
CSRRC,
TEX,
RASTER,
ROP,
CMOV
};
inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
switch (type) {
case GpuType::TMC: os << "TMC"; break;
case GpuType::WSPAWN: os << "WSPAWN"; break;
case GpuType::SPLIT: os << "SPLIT"; break;
case GpuType::JOIN: os << "JOIN"; break;
case GpuType::BAR: os << "BAR"; break;
case GpuType::TEX: os << "TEX"; break;
case SfuType::TMC: os << "TMC"; break;
case SfuType::WSPAWN: os << "WSPAWN"; break;
case SfuType::SPLIT: os << "SPLIT"; break;
case SfuType::JOIN: os << "JOIN"; break;
case SfuType::BAR: os << "BAR"; break;
case SfuType::PRED: os << "PRED"; break;
case SfuType::CSRRW: os << "CSRRW"; break;
case SfuType::CSRRS: os << "CSRRS"; break;
case SfuType::CSRRC: os << "CSRRC"; break;
case SfuType::TEX: os << "TEX"; break;
case SfuType::RASTER: os << "RASTER"; break;
case SfuType::ROP: os << "ROP"; break;
case SfuType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -218,31 +233,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
///////////////////////////////////////////////////////////////////////////////
struct MemReq {
uint64_t addr;
bool write;
bool non_cacheable;
uint32_t tag;
uint32_t core_id;
uint64_t uuid;
uint64_t addr;
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
bool _write = false,
bool _non_cacheable = false,
uint64_t _tag = 0,
uint32_t _core_id = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, non_cacheable(_non_cacheable)
, tag(_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
MemReq(uint64_t _addr = 0,
bool _write = false,
AddrType _type = AddrType::Global,
uint64_t _tag = 0,
uint32_t _cid = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, type(_type)
, tag(_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
os << " (#" << std::dec << req.uuid << ")";
return os;
}
@@ -250,18 +266,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint32_t core_id;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
: tag (_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
uint64_t tag;
uint32_t cid;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
: tag (_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid;
os << " (#" << std::dec << rsp.uuid << ")";
return os;
}
@@ -270,10 +287,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T>
class HashTable {
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
public:
HashTable(uint32_t capacity)
: entries_(capacity)
@@ -336,92 +349,180 @@ public:
}
size_ = 0;
}
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
template <typename Req, typename Rsp>
class Switch : public SimObject<Switch<Req, Rsp>> {
private:
ArbiterType type_;
uint32_t delay_;
uint32_t cursor_;
uint32_t tag_shift_;
public:
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
std::vector<SimPort<Req>> ReqOut;
std::vector<SimPort<Rsp>> RspOut;
Switch(
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
uint32_t num_inputs = 1,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
, delay_(delay)
, cursor_(0)
, tag_shift_(log2ceil(num_inputs))
, ReqIn(num_inputs, this)
, ReqOut(this)
, RspIn(this)
, RspOut(num_inputs, this)
, cursors_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay_ != 0);
assert(num_inputs <= MaxInputs);
if (num_inputs == 1) {
// bypass
ReqIn.at(0).bind(&ReqOut);
RspIn.bind(&RspOut.at(0));
assert(delay != 0);
assert(num_inputs <= 32);
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
if (num_inputs == num_outputs) {
// bypass mode
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
RspOut.at(i).bind(&RspIn.at(i));
}
}
}
void reset() {
cursor_ = 0;
for (auto& cursor : cursors_) {
cursor = 0;
}
}
void tick() {
if (ReqIn.size() == 1)
void tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
uint32_t R = 1 << lg_num_reqs_;
// skip bypass mode
if (I == O)
return;
// process incomming requests
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
uint32_t j = (cursor_ + i) % n;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
// process incomming requests
for (uint32_t o = 0; o < O; ++o) {
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (cursors_.at(o) + r) & (R-1);
uint32_t j = o * R + i;
if (j >= I)
continue;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | i;
}
DT(4, this->name() << "-" << req);
ReqOut.at(o).send(req, delay_);
req_in.pop();
this->update_cursor(o, i);
break;
}
ReqOut.send(req, delay_);
req_in.pop();
this->update_cursor(j);
break;
}
}
// process incoming reponses
if (!RspIn.empty()) {
auto& rsp = RspIn.front();
uint32_t port_id = 0;
if (tag_shift_) {
port_id = rsp.tag & ((1 << tag_shift_)-1);
rsp.tag >>= tag_shift_;
}
RspOut.at(port_id).send(rsp, 1);
RspIn.pop();
// process incoming reponses
if (!RspOut.at(o).empty()) {
auto& rsp = RspOut.at(o).front();
uint32_t i = 0;
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).send(rsp, 1);
RspOut.at(o).pop();
}
}
}
void update_cursor(uint32_t grant) {
void update_cursor(uint32_t index, uint32_t grant) {
if (type_ == ArbiterType::RoundRobin) {
cursor_ = grant + 1;
cursors_.at(index) = grant + 1;
}
}
std::vector<SimPort<Req>> ReqIn;
SimPort<Req> ReqOut;
SimPort<Rsp> RspIn;
std::vector<SimPort<Rsp>> RspOut;
private:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> cursors_;
uint32_t lg_num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
class SMemDemux : public SimObject<SMemDemux> {
public:
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<MemReq> ReqSm;
SimPort<MemRsp> RspSm;
SimPort<MemReq> ReqDc;
SimPort<MemRsp> RspDc;
SMemDemux(
const SimContext& ctx,
const char* name,
uint32_t delay = 1
) : SimObject<SMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSm(this)
, RspSm(this)
, ReqDc(this)
, RspDc(this)
, delay_(delay)
{}
void reset() {}
void tick() {
// process incomming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSm.send(req, delay_);
} else {
ReqDc.send(req, delay_);
}
ReqIn.pop();
}
// process incoming reponses
if (!RspSm.empty()) {
auto& rsp = RspSm.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspSm.pop();
}
if (!RspDc.empty()) {
auto& rsp = RspDc.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspDc.pop();
}
}
private:
uint32_t delay_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
@@ -10,21 +23,25 @@
using namespace vortex;
Warp::Warp(Core *core, uint32_t id)
: id_(id)
Warp::Warp(Core *core, uint32_t warp_id)
: warp_id_(warp_id)
, arch_(core->arch())
, core_(core)
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
{
this->clear();
this->reset();
}
void Warp::clear() {
active_ = false;
PC_ = STARTUP_ADDR;
void Warp::reset() {
PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
#endif
tmask_.reset();
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i) {
issued_instrs_ = 0;
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
for (auto& reg : ireg_file_.at(i)) {
reg = 0;
}
@@ -35,31 +52,44 @@ void Warp::clear() {
reg = 0;
}
}
uui_gen_.reset();
}
void Warp::eval(pipeline_trace_t *trace) {
pipeline_trace_t* Warp::eval() {
assert(tmask_.any());
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i)
DPN(2, tmask_.test(n-i-1));
DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
/* Fetch and decode. */
#ifndef NDEBUG
uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
#else
uint64_t uuid = 0;
#endif
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
DPN(1, tmask_.test(i));
DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);
// Fetch
uint32_t instr_code = 0;
core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
auto instr = core_->decoder().decode(instr_code);
// Decode
auto instr = core_->decoder_.decode(instr_code);
if (!instr) {
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
std::abort();
}
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Update trace
// Create trace
auto trace = new pipeline_trace_t(uuid, arch_);
trace->cid = core_->id();
trace->wid = id_;
trace->wid = warp_id_;
trace->PC = PC_;
trace->tmask = tmask_;
trace->rdest = instr->getRDest();
@@ -68,18 +98,20 @@ void Warp::eval(pipeline_trace_t *trace) {
// Execute
this->execute(*instr, trace);
DP(4, "Register state:");
for (uint32_t i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
DP(5, "Register state:");
for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
// Integer register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, '|');
DPN(5, '|');
// Floating point register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
DPN(5, std::endl);
}
return trace;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __WARP_H
#define __WARP_H
@@ -7,28 +20,26 @@
namespace vortex {
class Arch;
class Core;
class Instr;
class pipeline_trace_t;
struct DomStackEntry {
DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask)
, PC(PC)
, fallThrough(false)
, unanimous(false)
, fallthrough(false)
{}
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, PC(0)
, fallThrough(true)
, unanimous(false)
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, fallthrough(true)
{}
ThreadMask tmask;
Word PC;
bool fallThrough;
bool unanimous;
bool fallthrough;
};
struct vtype {
@@ -40,72 +51,58 @@ struct vtype {
class Warp {
public:
Warp(Core *core, uint32_t id);
Warp(Core *core, uint32_t warp_id);
void clear();
bool active() const {
return active_;
}
void suspend() {
active_ = false;
}
void activate() {
active_ = true;
}
std::size_t getActiveThreads() const {
if (active_)
return tmask_.count();
return 0;
}
void reset();
uint32_t id() const {
return id_;
return warp_id_;
}
uint32_t getPC() const {
Word getPC() const {
return PC_;
}
void setPC(uint32_t PC) {
void setPC(Word PC) {
PC_ = PC;
}
void setTmask(size_t index, bool value) {
tmask_.set(index, value);
active_ = tmask_.any();
}
uint32_t getTmask() const {
if (active_)
return tmask_.to_ulong();
return 0;
uint64_t getTmask() const {
return tmask_.to_ulong();
}
uint32_t getIRegValue(uint32_t reg) const {
Word getIRegValue(uint32_t reg) const {
return ireg_file_.at(0).at(reg);
}
void eval(pipeline_trace_t *);
uint64_t incr_instrs() {
return issued_instrs_++;
}
pipeline_trace_t* eval();
private:
void execute(const Instr &instr, pipeline_trace_t *trace);
UUIDGenerator uui_gen_;
uint32_t id_;
uint32_t warp_id_;
const Arch& arch_;
Core *core_;
bool active_;
uint64_t issued_instrs_;
Word PC_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<FWord>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> dom_stack_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<uint64_t>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> ipdom_stack_;
struct vtype vtype_;
uint32_t vl_;