Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -1,45 +1,36 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../hw/rtl
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
CXXFLAGS += $(CONFIGS)
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))
#$(info OBJS is $(OBJS))
#$(info VPATH is $(VPATH))
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
else
CXXFLAGS += -O2 -DNDEBUG
endif
# XLEN parameterization
ifdef XLEN
CXXFLAGS += -DXLEN=$(XLEN)
endif
PROJECT = simx
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@
@@ -48,4 +39,4 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so

87
sim/simx/arch.h Normal file
View File

@@ -0,0 +1,87 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
// Read-only description of the simulated machine.
//
// Thread, warp, core and cluster counts are supplied by the caller; the
// vector size, register count, CSR count, barrier count and IPDOM stack size
// are fixed by the constructor.
class Arch {
private:
  uint16_t num_threads_;
  uint16_t num_warps_;
  uint16_t num_cores_;
  uint16_t num_clusters_;
  uint16_t vsize_;
  uint16_t num_regs_;
  uint16_t num_csrs_;
  uint16_t num_barriers_;
  uint16_t ipdom_size_;

public:
  Arch(uint16_t num_threads,
       uint16_t num_warps,
       uint16_t num_cores,
       uint16_t num_clusters)
    : num_threads_(num_threads)
    , num_warps_(num_warps)
    , num_cores_(num_cores)
    , num_clusters_(num_clusters)
    , vsize_(16)
    , num_regs_(32)
    , num_csrs_(4096)
    , num_barriers_(NUM_BARRIERS)
    , ipdom_size_(2 * (num_threads - 1)) // two entries per thread beyond the first
  {}

  // --- caller-supplied topology ---
  uint16_t num_threads() const { return num_threads_; }
  uint16_t num_warps() const { return num_warps_; }
  uint16_t num_cores() const { return num_cores_; }
  uint16_t num_clusters() const { return num_clusters_; }

  // --- fixed parameters ---
  uint16_t vsize() const { return vsize_; }
  uint16_t num_regs() const { return num_regs_; }
  uint16_t num_csrs() const { return num_csrs_; }
  uint16_t num_barriers() const { return num_barriers_; }
  uint16_t ipdom_size() const { return ipdom_size_; }
};
}

View File

@@ -1,70 +0,0 @@
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
// Read-only description of the simulated machine.
//
// Core, warp and thread counts are supplied by the caller; the word size,
// vector size, register count, CSR count and barrier count are fixed by the
// constructor.
class ArchDef {
private:
  uint16_t num_cores_;
  uint16_t num_warps_;
  uint16_t num_threads_;
  uint16_t wsize_;
  uint16_t vsize_;
  uint16_t num_regs_;
  uint16_t num_csrs_;
  uint16_t num_barriers_;

public:
  ArchDef(uint16_t num_cores, uint16_t num_warps, uint16_t num_threads)
    : num_cores_(num_cores)
    , num_warps_(num_warps)
    , num_threads_(num_threads)
    , wsize_(4)
    , vsize_(16)
    , num_regs_(32)
    , num_csrs_(4096)
    , num_barriers_(NUM_BARRIERS)
  {}

  // --- caller-supplied topology ---
  uint16_t num_cores() const { return num_cores_; }
  uint16_t num_warps() const { return num_warps_; }
  uint16_t num_threads() const { return num_threads_; }

  // --- fixed parameters ---
  uint16_t wsize() const { return wsize_; }
  uint16_t vsize() const { return vsize_; }
  uint16_t num_regs() const { return num_regs_; }
  uint16_t num_csrs() const { return num_csrs_; }
  uint16_t num_barriers() const { return num_barriers_; }
};
}

View File

@@ -1,47 +0,0 @@
#include <iostream>
#include <string>
#include "args.h"
using namespace vortex;
using std::string;
// Storage for CommandLineArg's static members: the accumulated help text and
// the two lookup tables that map an option spelling to the object handling it.
std::string CommandLineArg::helpString_;
std::unordered_map<string, CommandLineArg *> CommandLineArg::longArgs_;
std::unordered_map<string, CommandLineArg *> CommandLineArg::shortArgs_;
// Registers this option under both its short spelling `s` and its long
// spelling `l`, and appends `helpText` to the shared help string.
CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
  shortArgs_[s] = this;
  longArgs_[l] = this;
  helpString_.append(helpText);
}
// Registers this option under its long spelling `l` only, and appends
// `helpText` to the shared help string.
CommandLineArg::CommandLineArg(string l, const char *helpText) {
  longArgs_[l] = this;
  helpString_.append(helpText);
}
void CommandLineArg::readArgs(int argc, char **argv) {
for (int i = 0; i < argc; i++) {
std::unordered_map<string, CommandLineArg *>::iterator
s = shortArgs_.find(std::string(argv[i])),
l = longArgs_.find(std::string(argv[i]));
if (s != shortArgs_.end()) {
i += s->second->read(argc - i, &argv[i]);
} else if (l != longArgs_.end()) {
i += l->second->read(argc - i, &argv[i]);
} else {
throw BadArg(string(argv[i]));
}
}
}
// Drops every registered option and empties the accumulated help text.
void CommandLineArg::clearArgs() {
  helpString_.clear();
  longArgs_.clear();
  shortArgs_.clear();
}
// Writes the help text accumulated from all registered options to `os`.
void CommandLineArg::showHelp(std::ostream &os) {
os << helpString_;
}

View File

@@ -1,64 +0,0 @@
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <util.h>
namespace vortex {
// Thrown when command-line parsing fails; `arg` holds the offending token.
struct BadArg {
  std::string arg;

  BadArg(std::string s)
    : arg(s)
  {}
};
// Base class for a self-registering command-line option.
//
// Constructing an instance records it in the static lookup tables under its
// long (and optionally short) spelling and appends its help text to the
// shared help string. readArgs() then dispatches each argv token to the
// matching instance's read().
class CommandLineArg {
public:
  CommandLineArg(std::string s, std::string l, const char *helpText);
  CommandLineArg(std::string l, const char *helpText);

  // Polymorphic base: a virtual destructor keeps destruction through a
  // CommandLineArg pointer well-defined.
  virtual ~CommandLineArg() = default;

  // Handles the option. argv[0] is the option token itself and argc counts
  // the arguments from that token onwards. Returns the number of additional
  // tokens consumed.
  virtual int read(int argc, char** argv) = 0;

  // Parses argv[0..argc); throws BadArg for an unregistered token.
  static void readArgs(int argc, char **argv);

  // Removes all registered options and clears the help text.
  static void clearArgs();

  // Writes the accumulated help text to `os`.
  static void showHelp(std::ostream &os);

private:
  static std::string helpString_;
  static std::unordered_map<std::string, CommandLineArg *> longArgs_;
  static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
};
template <typename T> class CommandLineArgSetter : public CommandLineArg {
public:
CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
CommandLineArg(s, l, ht), arg_(x) {}
CommandLineArgSetter(std::string l, const char *ht, T &x) :
CommandLineArg(l, ht), arg_(x) {}
int read(int argc, char **argv) {
__unused (argc);
std::istringstream iss(argv[1]);
iss >> arg_;
return 1;
}
private:
T &arg_;
};
// Boolean switch: the referenced flag is cleared when the option is
// registered and set when the option appears on the command line.
class CommandLineArgFlag : public CommandLineArg {
public:
  CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x)
    : CommandLineArg(s, l, ht)
    , arg_(x)
  {
    arg_ = false;
  }

  CommandLineArgFlag(std::string l, const char *ht, bool &x)
    : CommandLineArg(l, ht)
    , arg_(x)
  {
    arg_ = false;
  }

  // Consumes no extra tokens, hence the 0 return value.
  int read(int argc, char **argv) {
    __unused (argc, argv);
    arg_ = true;
    return 0;
  }

private:
  bool &arg_;
};
}

View File

@@ -1,637 +0,0 @@
#include "cache.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
// Derived cache geometry and the address bit ranges used to split a word
// address into word-select / bank-select / set-select / tag fields.
//
// The config fields are used as log2 quantities: C (cache size), B (block
// size), W (word size) and A (associativity).
struct params_t {
  uint32_t sets_per_bank;
  uint32_t blocks_per_set;
  uint32_t words_per_block;
  uint32_t log2_num_inputs;

  uint32_t word_select_addr_start;
  uint32_t word_select_addr_end;

  uint32_t bank_select_addr_start;
  uint32_t bank_select_addr_end;

  uint32_t set_select_addr_start;
  uint32_t set_select_addr_end;

  uint32_t tag_select_addr_start;
  uint32_t tag_select_addr_end;

  params_t(const Cache::Config& config) {
    uint32_t bank_bits = log2ceil(config.num_banks);
    uint32_t offset_bits = config.B - config.W;
    uint32_t log2_bank_size = config.C - bank_bits;
    // A set holds 2^A blocks of 2^B bytes, so the set index takes what is
    // left of the bank size after removing B + A bits (the exponents add;
    // shifting B by A is not the same quantity).
    uint32_t set_bits = config.B + config.A;
    // guards the unsigned subtraction below against wrap-around
    assert(log2_bank_size >= set_bits);
    uint32_t index_bits = log2_bank_size - set_bits;

    this->log2_num_inputs = log2ceil(config.num_inputs);

    this->words_per_block = 1 << offset_bits;
    this->blocks_per_set = 1 << config.A;
    this->sets_per_bank = 1 << index_bits;

    assert(config.ports_per_bank <= this->words_per_block);

    // Word select
    this->word_select_addr_start = config.W;
    this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);

    // Bank select
    this->bank_select_addr_start = (1+this->word_select_addr_end);
    this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);

    // Set select
    this->set_select_addr_start = (1+this->bank_select_addr_end);
    this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);

    // Tag select
    this->tag_select_addr_start = (1+this->set_select_addr_end);
    this->tag_select_addr_end = (config.addr_width-1);
  }

  // Bank index encoded in `word_addr`; 0 when the bank field is empty
  // (end < start, i.e. zero bank bits).
  uint32_t addr_bank_id(uint64_t word_addr) const {
    if (bank_select_addr_end >= bank_select_addr_start)
      return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
    else
      return 0;
  }

  // Set index encoded in `word_addr`; 0 when the set field is empty.
  uint32_t addr_set_id(uint64_t word_addr) const {
    if (set_select_addr_end >= set_select_addr_start)
      return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
    else
      return 0;
  }

  // Tag encoded in `word_addr`; 0 when the tag field is empty.
  uint64_t addr_tag(uint64_t word_addr) const {
    if (tag_select_addr_end >= tag_select_addr_start)
      return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
    else
      return 0;
  }

  // Rebuilds an address from its bank/set/tag fields; the word-select
  // bits are left at zero.
  uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
    uint64_t addr(0);
    if (bank_select_addr_end >= bank_select_addr_start)
      addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
    if (set_select_addr_end >= set_select_addr_start)
      addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
    if (tag_select_addr_end >= tag_select_addr_start)
      addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
    return addr;
  }
};
// State of one cache block (way) inside a set.
struct block_t {
// true once the block holds a filled line
bool valid;
// set on a write hit when the cache is not write-through
bool dirty;
// tag of the line currently stored in the block
uint64_t tag;
// zeroed on a hit and incremented on other lookups of the set;
// the valid block with the largest counter is the replacement candidate
uint32_t lru_ctr;
};
struct set_t {
std::vector<block_t> blocks;
set_t(uint32_t size) : blocks(size) {}
void clear() {
for (auto& block : blocks) {
block.valid = false;
}
}
};
// Per-port bookkeeping attached to a bank request: which core-side input
// issued the access and the tag to echo back in its response.
// NOTE: brace-initialised positionally elsewhere in this file, so the field
// order must not change.
struct bank_req_info_t {
// true when this port slot carries a request
bool valid;
// index of the core-side request/response port
uint32_t req_id;
// tag of the originating core request
uint64_t req_tag;
};
// A request travelling through one bank's pipeline. It may merge several
// core requests (one per bank port) that target the same set and tag.
struct bank_req_t {
  bool valid = false;        // slot holds a request
  bool write = false;        // write access (otherwise read)
  bool mshr_replay = false;  // request was popped from the MSHR for replay
  uint64_t tag = 0;
  uint32_t set_id = 0;
  uint32_t core_id = 0;
  uint64_t uuid = 0;
  std::vector<bank_req_info_t> infos;  // one slot per bank port

  // `size` is the number of port slots to allocate.
  bank_req_t(uint32_t size)
    : infos(size)
  {}
};
// MSHR slot: a parked bank request plus the block chosen to receive the
// fill when it returns.
struct mshr_entry_t : public bank_req_t {
  uint32_t block_id = 0;

  // The default argument keeps the type default-constructible, which the
  // MSHR's entry vector relies on.
  mshr_entry_t(uint32_t size = 0)
    : bank_req_t(size)
  {}
};
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t size_;
public:
MSHR(uint32_t size)
: entries_(size)
, size_(0)
{}
bool empty() const {
return (0 == size_);
}
bool full() const {
return (size_ == entries_.size());
}
int lookup(const bank_req_t& bank_req) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (entry.valid
&& entry.set_id == bank_req.set_id
&& entry.tag == bank_req.tag) {
return i;
}
}
return -1;
}
int allocate(const bank_req_t& bank_req, uint32_t block_id) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (!entry.valid) {
*(bank_req_t*)&entry = bank_req;
entry.valid = true;
entry.mshr_replay = false;
entry.block_id = block_id;
++size_;
return i;
}
}
return -1;
}
mshr_entry_t& replay(uint32_t id) {
auto& root_entry = entries_.at(id);
assert(root_entry.valid);
// make all related mshr entries for replay
for (auto& entry : entries_) {
if (entry.valid
&& entry.set_id == root_entry.set_id
&& entry.tag == root_entry.tag) {
entry.mshr_replay = true;
}
}
return root_entry;
}
bool pop(bank_req_t* out) {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
*out = entry;
entry.valid = false;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
entry.valid = false;
}
}
size_ = 0;
}
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const Cache::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size)
{}
void clear() {
mshr.clear();
for (auto& set : sets) {
set.clear();
}
}
};
///////////////////////////////////////////////////////////////////////////////
class Cache::Impl {
private:
Cache* const simobject_;
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr mem_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
uint32_t flush_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
public:
// Builds the cache model: one bank_t and one memory request/response port
// pair per bank, plus the switches that funnel bank traffic and bypass
// traffic into the cache's single memory-side port pair.
// NOTE: perf_stats_ and the pending_* counters are not set here; reset()
// assigns them.
Impl(Cache* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
{
// 2-input priority switch in front of the memory ports:
// input 0 carries bank traffic, input 1 carries bypass (non-cacheable) traffic
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspIn);
if (config.num_banks > 1) {
// several banks: arbitrate them round-robin, then feed bypass input 0
mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
}
mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
} else {
// single bank: connect it directly to bypass input 0 (mem_switch_ stays unset)
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
}
// calculate tag flush cycles
// (tick() does nothing until this many cycles have elapsed)
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
}
void reset() {
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
// Advances the cache by one cycle. In order: forwards one bypass response,
// pulls at most one replay request per bank out of the MSHR, applies at most
// one memory fill per bank, accepts core requests into the per-bank pipeline
// slots (stalling on conflicts), then processes the scheduled requests.
void tick() {
// wait on flush cycles
if (flush_cycles_ != 0) {
--flush_cycles_;
return;
}
// per-bank pipeline request
// (rebuilt every cycle; each slot starts out invalid)
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
// calculate memory latency
// (adds the number of outstanding fills once per cycle)
perf_stats_.mem_latency += pending_fill_reqs_;
// handle bypass responses
auto& bypass_port = bypass_switch_->RspOut.at(1);
if (!bypass_port.empty()) {
auto& mem_rsp = bypass_port.front();
// undo the tag packing done in processIORequest():
// low bits = core-side port index, remaining bits = original tag
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
bypass_port.pop();
}
// handle MSHR replay
// (a popped entry occupies the bank's pipeline slot for this cycle)
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// handle memory fills
std::vector<bool> pending_fill_req(config_.num_banks, false);
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (!mem_rsp_port.empty()) {
auto& mem_rsp = mem_rsp_port.front();
// the response tag is the MSHR entry index stored in the fill request
this->processMemoryFill(bank_id, mem_rsp.tag);
pending_fill_req.at(bank_id) = true;
mem_rsp_port.pop();
}
}
// handle incoming core requests
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
// check cache bypassing
if (core_req.non_cacheable) {
// send IO request
this->processIORequest(core_req, req_id);
// remove request
core_req_port.pop();
continue;
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
// create bank request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.valid = true;
bank_req.write = core_req.write;
bank_req.mshr_replay = false;
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.core_id = core_req.core_id;
bank_req.uuid = core_req.uuid;
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs.at(bank_id);
// check pending MSHR replay
if (pipeline_req.valid
&& pipeline_req.mshr_replay) {
// stall
continue;
}
// check pending fill request
if (pending_fill_req.at(bank_id)) {
// stall
continue;
}
// check MSHR capacity if read or writeback
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
continue;
}
// check bank conflicts
if (pipeline_req.valid) {
// check port conflict
// (requests merge only when they have the same direction, set and tag
// and the port slot is still free)
if (pipeline_req.write != core_req.write
|| pipeline_req.set_id != set_id
|| pipeline_req.tag != tag
|| pipeline_req.infos[port_id].valid) {
++perf_stats_.bank_stalls;
continue;
}
// update pending request infos
pipeline_req.infos[port_id] = bank_req.infos[port_id];
} else {
// schedule new request
pipeline_req = bank_req;
}
if (core_req.write)
++perf_stats_.writes;
else
++perf_stats_.reads;
// remove request
// NOTE(review): pop() presumably returns the cycle at which the request
// was queued, making the difference its wait time - confirm against SimPort
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequest(pipeline_reqs);
}
// Returns the statistics accumulated since the last reset().
const PerfStats& perf_stats() const {
return perf_stats_;
}
private:
void processIORequest(const MemReq& core_req, uint32_t req_id) {
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
if (core_req.write && config_.write_reponse) {
MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
// Applies a fill returned by memory: flags the waiting MSHR entries of
// `bank_id` for replay, installs the line's tag in the block reserved for
// it, and decrements the outstanding-fill counter.
void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
  auto& bank = banks_.at(bank_id);
  // flags this entry and all entries on the same set/tag for replay
  auto& filled = bank.mshr.replay(mshr_id);
  auto& target = bank.sets.at(filled.set_id).blocks.at(filled.block_id);
  target.tag = filled.tag;
  target.valid = true;
  --pending_fill_reqs_;
}
void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& pipeline_req = pipeline_reqs.at(bank_id);
if (!pipeline_req.valid)
continue;
auto& bank = banks_.at(bank_id);
auto& set = bank.sets.at(pipeline_req.set_id);
if (pipeline_req.mshr_replay) {
// send core response
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
} else {
bool hit = false;
bool found_free_block = false;
uint32_t hit_block_id = 0;
uint32_t repl_block_id = 0;
uint32_t max_cnt = 0;
for (uint32_t i = 0, n = set.blocks.size(); i < n; ++i) {
auto& block = set.blocks.at(i);
if (block.valid) {
if (block.tag == pipeline_req.tag) {
block.lru_ctr = 0;
hit_block_id = i;
hit = true;
} else {
++block.lru_ctr;
}
if (max_cnt < block.lru_ctr) {
max_cnt = block.lru_ctr;
repl_block_id = i;
}
} else {
found_free_block = true;
repl_block_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (pipeline_req.write) {
// handle write hit
auto& hit_block = set.blocks.at(hit_block_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
} else {
// mark block as dirty
hit_block.dirty = true;
}
}
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
//
// Miss handling
//
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_block && !config_.write_through) {
// write back dirty block
auto& repl_block = set.blocks.at(repl_block_id);
if (repl_block.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++perf_stats_.evictions;
}
}
if (pipeline_req.write && config_.write_through) {
// forward write request to memory
{
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
// MSHR lookup
int pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
// send fill request
if (pending == -1) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++pending_fill_reqs_;
}
}
}
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
// Creates the cache simulation object: one core request port and one core
// response port per configured input, a single memory request/response port
// pair, and the implementation object that holds the actual cache model.
Cache::Cache(const SimContext& ctx, const char* name, const Config& config)
: SimObject<Cache>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
// Releases the implementation object allocated by the constructor.
Cache::~Cache() {
delete impl_;
}
// Forwards to Impl::reset().
void Cache::reset() {
impl_->reset();
}
// Forwards to Impl::tick(), which advances the cache model by one cycle.
void Cache::tick() {
impl_->tick();
}
// Returns the statistics collected by the implementation object.
const Cache::PerfStats& Cache::perf_stats() const {
return impl_->perf_stats();
}

106
sim/simx/cache_cluster.h Normal file
View File

@@ -0,0 +1,106 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "cache_sim.h"
namespace vortex {
// A group of caches shared by several units.
//
// Each unit exposes `num_requests` core-side ports. Per unit, a round-robin
// switch maps those ports onto `config.num_inputs` cache inputs; per cache
// input, a second switch distributes the units over the caches; a final
// switch merges the caches' memory ports into the cluster's single
// MemReqPort/MemRspPort pair. Passing `num_caches == 0` creates one cache
// with its `bypass` flag set.
class CacheCluster : public SimObject<CacheCluster> {
public:
  // indexed as [unit][request]
  std::vector<std::vector<SimPort<MemReq>>> CoreReqPorts;
  std::vector<std::vector<SimPort<MemRsp>>> CoreRspPorts;
  SimPort<MemReq> MemReqPort;
  SimPort<MemRsp> MemRspPort;

  CacheCluster(const SimContext& ctx,
               const char* name,
               uint32_t num_units,
               uint32_t num_caches,
               uint32_t num_requests,
               const CacheSim::Config& config)
    : SimObject(ctx, name)
    , CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
    , CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
    , MemReqPort(this)
    , MemRspPort(this)
    , caches_(MAX(num_caches, 0x1)) {

    CacheSim::Config config2(config);
    if (0 == num_caches) {
      // no cache requested: keep the topology but make the single cache a pass-through
      num_caches = 1;
      config2.bypass = true;
    }

    char sname[100];

    // stage 1: per unit, map its request ports onto the cache inputs
    std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
    for (uint32_t u = 0; u < num_units; ++u) {
      // the indices are unsigned, so they are formatted with %u
      snprintf(sname, sizeof(sname), "%s-unit-arb-%u", name, u);
      unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
      for (uint32_t i = 0; i < num_requests; ++i) {
        this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
        unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
      }
    }

    // stage 2: per cache input, distribute the units over the caches
    std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
    for (uint32_t i = 0; i < config.num_inputs; ++i) {
      snprintf(sname, sizeof(sname), "%s-mem-arb-%u", name, i);
      mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
      for (uint32_t u = 0; u < num_units; ++u) {
        unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
        mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
      }
    }

    // stage 3: create the caches and merge their memory ports
    snprintf(sname, sizeof(sname), "%s-cache-arb", name);
    auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);

    for (uint32_t i = 0; i < num_caches; ++i) {
      snprintf(sname, sizeof(sname), "%s-cache%u", name, i);
      caches_.at(i) = CacheSim::Create(sname, config2);

      for (uint32_t j = 0; j < config.num_inputs; ++j) {
        mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
        caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
      }

      caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
      cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
    }

    cache_arb->ReqOut.at(0).bind(&this->MemReqPort);
    this->MemRspPort.bind(&cache_arb->RspOut.at(0));
  }

  ~CacheCluster() {}

  // The cluster itself has no state to reset or advance.
  void reset() {}

  void tick() {}

  // Returns the sum of the statistics of all caches in the cluster.
  CacheSim::PerfStats perf_stats() const {
    CacheSim::PerfStats perf;
    // iterate by reference: no per-iteration copy of the cache handle
    for (const auto& cache : caches_) {
      perf += cache->perf_stats();
    }
    return perf;
  }

private:
  std::vector<CacheSim::Ptr> caches_;
};
}

707
sim/simx/cache_sim.cpp Normal file
View File

@@ -0,0 +1,707 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cache_sim.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
// Derived cache geometry and the address bit ranges used to split a word
// address into word-select / bank-select / set-select / tag fields.
//
// The config fields are used as log2 quantities: C (cache size), B (line
// size), W (word size) and A (associativity).
struct params_t {
  uint32_t sets_per_bank;
  uint32_t lines_per_set;
  uint32_t words_per_line;
  uint32_t log2_num_inputs;

  uint32_t word_select_addr_start;
  uint32_t word_select_addr_end;

  uint32_t bank_select_addr_start;
  uint32_t bank_select_addr_end;

  uint32_t set_select_addr_start;
  uint32_t set_select_addr_end;

  uint32_t tag_select_addr_start;
  uint32_t tag_select_addr_end;

  params_t(const CacheSim::Config& config) {
    // signed on purpose so the asserts below can catch negative widths
    const int32_t bank_bits = log2ceil(config.num_banks);
    const int32_t offset_bits = config.B - config.W;
    const int32_t log2_bank_size = config.C - bank_bits;
    const int32_t index_bits = log2_bank_size - (config.B + config.A);
    assert(log2_bank_size > 0);
    assert(offset_bits >= 0);
    assert(index_bits >= 0);

    log2_num_inputs = log2ceil(config.num_inputs);

    words_per_line = 1 << offset_bits;
    lines_per_set = 1 << config.A;
    sets_per_bank = 1 << index_bits;

    assert(config.ports_per_bank <= words_per_line);

    // The fields are laid out back to back, lowest bits first:
    // word select, bank select, set select, tag.
    word_select_addr_start = config.W;
    word_select_addr_end = (word_select_addr_start+offset_bits-1);

    bank_select_addr_start = (1+word_select_addr_end);
    bank_select_addr_end = (bank_select_addr_start+bank_bits-1);

    set_select_addr_start = (1+bank_select_addr_end);
    set_select_addr_end = (set_select_addr_start+index_bits-1);

    tag_select_addr_start = (1+set_select_addr_end);
    tag_select_addr_end = (config.addr_width-1);
  }

  // Bank index encoded in `word_addr`; 0 when the bank field is empty
  // (end < start, i.e. zero bank bits).
  uint32_t addr_bank_id(uint64_t word_addr) const {
    if (bank_select_addr_end < bank_select_addr_start)
      return 0;
    return static_cast<uint32_t>(bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end));
  }

  // Set index encoded in `word_addr`; 0 when the set field is empty.
  uint32_t addr_set_id(uint64_t word_addr) const {
    if (set_select_addr_end < set_select_addr_start)
      return 0;
    return static_cast<uint32_t>(bit_getw(word_addr, set_select_addr_start, set_select_addr_end));
  }

  // Tag encoded in `word_addr`; 0 when the tag field is empty.
  uint64_t addr_tag(uint64_t word_addr) const {
    if (tag_select_addr_end < tag_select_addr_start)
      return 0;
    return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
  }

  // Rebuilds an address from its bank/set/tag fields; the word-select
  // bits are left at zero.
  uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
    uint64_t result = 0;
    if (bank_select_addr_end >= bank_select_addr_start) {
      result = bit_setw(result, bank_select_addr_start, bank_select_addr_end, bank_id);
    }
    if (set_select_addr_end >= set_select_addr_start) {
      result = bit_setw(result, set_select_addr_start, set_select_addr_end, set_id);
    }
    if (tag_select_addr_end >= tag_select_addr_start) {
      result = bit_setw(result, tag_select_addr_start, tag_select_addr_end, tag);
    }
    return result;
  }
};
// State of one cache line (way) inside a set.
struct line_t {
  uint64_t tag;
  uint32_t lru_ctr;
  bool valid;
  bool dirty;

  // Drops the line's contents: clears the valid and dirty flags.
  // The tag and LRU counter are left as they are.
  void clear() {
    valid = dirty = false;
  }
};
struct set_t {
std::vector<line_t> lines;
set_t(uint32_t num_ways)
: lines(num_ways)
{}
void clear() {
for (auto& line : lines) {
line.clear();
}
}
};
// Per-port slot of a bank request: the core-side port that issued the
// access and the tag of its request.
struct bank_req_port_t {
  uint32_t req_id;
  uint64_t req_tag;
  bool valid;

  // Marks the slot as unused; req_id and req_tag are left as they are.
  void clear() { valid = false; }
};
// A request travelling through one bank's pipeline. It may merge several
// core requests, one per bank port.
struct bank_req_t {
  enum ReqType {
    None   = 0,  // slot is empty
    Fill   = 1,
    Replay = 2,  // MSHR entry released for replay after its fill arrived
    Core   = 3   // MSHR entry still waiting for its fill
  };

  std::vector<bank_req_port_t> ports;

  // All scalar members start from a defined state: the MSHR compares
  // `type` against None on freshly constructed entries, so it must not be
  // left indeterminate until the first clear().
  uint64_t tag = 0;
  uint32_t set_id = 0;
  uint32_t cid = 0;
  uint64_t uuid = 0;
  ReqType  type = ReqType::None;
  bool     write = false;

  // Intentionally not explicit: containers of requests are built from a
  // bare port count.
  bank_req_t(uint32_t num_ports)
    : ports(num_ports)
  {}

  // Empties the request: clears every port slot and sets the type to None.
  void clear() {
    for (auto& port : ports) {
      port.clear();
    }
    type = ReqType::None;
  }
};
// MSHR slot: a parked bank request plus the cache line it is associated
// with.
struct mshr_entry_t {
  bank_req_t bank_req;
  // given a defined value so a fresh entry never carries an indeterminate index
  uint32_t line_id = 0;

  // Intentionally not explicit: the MSHR builds its entry vector from a
  // bare port count.
  mshr_entry_t(uint32_t num_ports)
    : bank_req(num_ports)
  {}

  // Frees the slot by clearing the parked request.
  void clear() {
    bank_req.clear();
  }
};
// Miss Status Holding Registers: a fixed-size table of outstanding misses.
// A slot is free when its request type is None.
class MSHR {
private:
  std::vector<mshr_entry_t> entries_;
  uint32_t size_; // number of occupied slots

public:
  MSHR(uint32_t size, uint32_t num_ports)
    : entries_(size, num_ports)
    , size_(0)
  {}

  bool empty() const {
    return (0 == size_);
  }

  bool full() const {
    return (size_ == entries_.size());
  }

  // Returns true when an occupied slot already tracks the same set and tag,
  // i.e. a miss to that block is already outstanding.
  bool lookup(const bank_req_t& bank_req) const {
    for (const auto& entry : entries_) {
      if (entry.bank_req.type != bank_req_t::None
       && entry.bank_req.set_id == bank_req.set_id
       && entry.bank_req.tag == bank_req.tag) {
        return true;
      }
    }
    return false;
  }

  // Stores `bank_req` in the first free slot and remembers the line that will
  // receive the fill. Returns the slot index, or -1 when the table is full.
  int allocate(const bank_req_t& bank_req, uint32_t line_id) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& entry = entries_.at(i);
      if (entry.bank_req.type == bank_req_t::None) {
        entry.bank_req = bank_req;
        entry.line_id = line_id;
        ++size_;
        return static_cast<int>(i);
      }
    }
    return -1;
  }

  // Called when the fill for slot `id` arrives: flags that slot and every
  // other pending Core entry for the same set/tag as Replay, and returns the
  // slot that triggered the fill.
  mshr_entry_t& replay(uint32_t id) {
    auto& root_entry = entries_.at(id);
    assert(root_entry.bank_req.type == bank_req_t::Core);
    // mark all related mshr entries for replay
    for (auto& entry : entries_) {
      if (entry.bank_req.type == bank_req_t::Core
       && entry.bank_req.set_id == root_entry.bank_req.set_id
       && entry.bank_req.tag == root_entry.bank_req.tag) {
        entry.bank_req.type = bank_req_t::Replay;
      }
    }
    return root_entry;
  }

  // Copies the first entry flagged Replay into `*out`, frees its slot and
  // returns true. Returns false (leaving `*out` untouched) when none is ready.
  bool pop(bank_req_t* out) {
    for (auto& entry : entries_) {
      if (entry.bank_req.type == bank_req_t::Replay) {
        *out = entry.bank_req;
        entry.bank_req.type = bank_req_t::None;
        --size_;
        return true;
      }
    }
    return false;
  }

  // Frees every slot.
  void clear() {
    for (auto& entry : entries_) {
      entry.clear();
    }
    size_ = 0;
  }
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const CacheSim::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.lines_per_set)
, mshr(config.mshr_size, config.ports_per_bank)
{}
void clear() {
for (auto& set : sets) {
set.clear();
}
mshr.clear();
}
};
///////////////////////////////////////////////////////////////////////////////
class CacheSim::Impl {
private:
CacheSim* const simobject_;
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr bank_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
std::vector<bank_req_t> pipeline_reqs_;
uint32_t init_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
public:
// Builds the cache model and wires its ports.
//  - bypass mode: all core ports are arbitrated (round-robin) straight to the
//    memory port and no cache state is used.
//  - normal mode: a 2-input priority switch sits in front of memory; input 0
//    carries the banks' traffic (through a round-robin bank switch when there
//    is more than one bank), input 1 carries IO requests that bypass the cache.
// Counters are initialized here so the object is well-defined even if tick()
// runs before reset().
Impl(CacheSim* simobject, const Config& config)
  : simobject_(simobject)
  , config_(config)
  , params_(config)
  , banks_(config.num_banks, {config, params_})
  , mem_req_ports_(config.num_banks, simobject)
  , mem_rsp_ports_(config.num_banks, simobject)
  , pipeline_reqs_(config.num_banks, config.ports_per_bank)
  , init_cycles_(0)
  , pending_read_reqs_(0)
  , pending_write_reqs_(0)
  , pending_fill_reqs_(0)
{
  char sname[100];
  snprintf(sname, sizeof(sname), "%s-bypass-arb", simobject->name().c_str());

  if (config_.bypass) {
    bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
    for (uint32_t i = 0; i < config_.num_inputs; ++i) {
      simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
      bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
    }
    bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
    simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
    return;
  }

  bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
  bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
  simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));

  if (config.num_banks > 1) {
    snprintf(sname, sizeof(sname), "%s-bank-arb", simobject->name().c_str());
    bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
    for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
      mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
      bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
    }
    bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0));
    bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0));
  } else {
    mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
    bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0));
  }

  // calculate cache initialization cycles
  init_cycles_ = params_.sets_per_bank * params_.lines_per_set;
}
void reset() {
if (config_.bypass)
return;
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
// Advances the cache by one cycle.
// Per bank, at most one request enters the pipeline each cycle, chosen in
// this fixed priority order: MSHR replay, then memory fill, then core
// requests. Core requests that cannot be scheduled stay queued on their port
// and are counted as stalls. The statement order below implements that
// priority and must be preserved.
void tick() {
// bypass mode: the constructor wired the core ports directly to memory
if (config_.bypass)
return;
// wait on cache initialization cycles
if (init_cycles_ != 0) {
--init_cycles_;
return;
}
// handle cache bypass responses (at most one per cycle, from switch input 1)
{
auto& bypass_port = bypass_switch_->RspIn.at(1);
if (!bypass_port.empty()) {
auto& mem_rsp = bypass_port.front();
this->processBypassResponse(mem_rsp);
bypass_port.pop();
}
}
// initialize pipeline request
for (auto& pipeline_req : pipeline_reqs_) {
pipeline_req.clear();
}
// schedule MSHR replay
// (pop() leaves the pipeline request empty when no entry is ready)
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// schedule memory fill
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (mem_rsp_port.empty())
continue;
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// a replay already owns this bank's pipeline slot; retry the fill next cycle
if (pipeline_req.type != bank_req_t::None)
continue;
auto& mem_rsp = mem_rsp_port.front();
DT(3, simobject_->name() << "-dram-" << mem_rsp);
pipeline_req.type = bank_req_t::Fill;
// the response tag is the MSHR slot id that was sent with the fill request
pipeline_req.tag = mem_rsp.tag;
mem_rsp_port.pop();
}
// schedule core requests
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
// check cache bypassing
if (core_req.type == AddrType::IO) {
// send bypass request
this->processBypassRequest(core_req, req_id);
// remove request
core_req_port.pop();
continue;
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// check MSHR capacity
// (write-through writes are exempt: they never allocate an MSHR entry)
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
++perf_stats_.bank_stalls;
continue;
}
// check bank conflicts
if (pipeline_req.type == bank_req_t::Core) {
// check port conflict
// (a request may only join the one already scheduled when it has the same
// direction, set and tag, and its port slot is still free)
if (pipeline_req.write != core_req.write
|| pipeline_req.set_id != set_id
|| pipeline_req.tag != tag
|| pipeline_req.ports.at(port_id).valid) {
++perf_stats_.bank_stalls;
continue;
}
// extend request ports
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
} else if (pipeline_req.type == bank_req_t::None) {
// schedule new request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.cid = core_req.cid;
bank_req.uuid = core_req.uuid;
bank_req.type = bank_req_t::Core;
bank_req.write = core_req.write;
pipeline_req = bank_req;
} else {
// bank in use
++perf_stats_.bank_stalls;
continue;
}
if (core_req.write)
++perf_stats_.writes;
else
++perf_stats_.reads;
// remove request
DT(3, simobject_->name() << "-core-" << core_req);
// NOTE(review): pop() presumably returns the cycle at which the request was
// queued, making the difference below its wait time - confirm in SimPort.
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequests();
}
// Read-only access to the statistics accumulated since the last reset().
const PerfStats& perf_stats() const {
  return this->perf_stats_;
}
private:
// Routes a memory response for a bypassed (IO) request back to its core port.
// The low log2_num_inputs bits of the memory tag hold the core port index and
// the remaining upper bits hold the core's original tag.
void processBypassResponse(const MemRsp& mem_rsp) {
  const auto port_mask = (1 << params_.log2_num_inputs) - 1;
  uint32_t req_id = mem_rsp.tag & port_mask;
  uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;

  MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid};
  auto& rsp_port = simobject_->CoreRspPorts.at(req_id);
  rsp_port.send(core_rsp, config_.latency);
  DT(3, simobject_->name() << "-core-" << core_rsp);
}
// Forwards an IO request around the cache through bypass switch input 1.
// The core port index is packed into the low bits of the memory tag so the
// response can be routed back. Writes are acknowledged to the core right away
// when write responses are enabled.
void processBypassRequest(const MemReq& core_req, uint32_t req_id) {
  DT(3, simobject_->name() << "-core-" << core_req);

  MemReq mem_req(core_req);
  mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
  bypass_switch_->ReqIn.at(1).send(mem_req, 1);
  DT(3, simobject_->name() << "-dram-" << mem_req);

  const bool ack_write = core_req.write && config_.write_reponse;
  if (!ack_write)
    return;

  MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
  simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
  DT(3, simobject_->name() << "-core-" << core_rsp);
}
void processBankRequests() {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto pipeline_req = pipeline_reqs_.at(bank_id);
switch (pipeline_req.type) {
case bank_req_t::None:
break;
case bank_req_t::Fill: {
// update cache line
auto& bank = banks_.at(bank_id);
auto& entry = bank.mshr.replay(pipeline_req.tag);
auto& set = bank.sets.at(entry.bank_req.set_id);
auto& line = set.lines.at(entry.line_id);
line.valid = true;
line.tag = entry.bank_req.tag;
--pending_fill_reqs_;
} break;
case bank_req_t::Replay: {
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} break;
case bank_req_t::Core: {
bool hit = false;
bool found_free_line = false;
uint32_t hit_line_id = 0;
uint32_t repl_line_id = 0;
uint32_t max_cnt = 0;
auto& set = bank.sets.at(pipeline_req.set_id);
// tag lookup
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
auto& line = set.lines.at(i);
if (line.valid) {
if (line.tag == pipeline_req.tag) {
line.lru_ctr = 0;
hit_line_id = i;
hit = true;
} else {
++line.lru_ctr;
}
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
} else {
found_free_line = true;
repl_line_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (pipeline_req.write) {
// handle write hit
auto& hit_line = set.lines.at(hit_line_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
} else {
// mark line as dirty
hit_line.dirty = true;
}
}
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
//
// Miss handling
//
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_line && !config_.write_through) {
// write back dirty line
auto& repl_line = set.lines.at(repl_line_id);
if (repl_line.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++perf_stats_.evictions;
}
}
if (pipeline_req.write && config_.write_through) {
// forward write request to memory
{
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
// MSHR lookup
auto mshr_pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
// send fill request
if (!mshr_pending) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++pending_fill_reqs_;
}
}
}
} break;
}
}
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
}
};
///////////////////////////////////////////////////////////////////////////////
// CacheSim is a thin facade: it owns the public ports and forwards all
// behaviour to the private Impl object created here.
CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config)
: SimObject<CacheSim>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
// Releases the Impl allocated by the constructor.
CacheSim::~CacheSim() {
delete impl_;
}
// Clears cache state and statistics (see Impl::reset).
void CacheSim::reset() {
impl_->reset();
}
// Advances the cache model by one cycle (see Impl::tick).
void CacheSim::tick() {
impl_->tick();
}
// Returns the statistics accumulated by the implementation.
const CacheSim::PerfStats& CacheSim::perf_stats() const {
return impl_->perf_stats();
}

View File

@@ -1,13 +1,27 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "memsim.h"
#include "mem_sim.h"
namespace vortex {
class Cache : public SimObject<Cache> {
class CacheSim : public SimObject<CacheSim> {
public:
struct Config {
bool bypass; // cache bypass
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t W; // log2 word size
@@ -45,6 +59,19 @@ public:
, mshr_stalls(0)
, mem_latency(0)
{}
PerfStats& operator+=(const PerfStats& rhs) {
this->reads += rhs.reads;
this->writes += rhs.writes;
this->read_misses += rhs.read_misses;
this->write_misses += rhs.write_misses;
this->evictions += rhs.evictions;
this->pipeline_stalls += rhs.pipeline_stalls;
this->bank_stalls += rhs.bank_stalls;
this->mshr_stalls += rhs.mshr_stalls;
this->mem_latency += rhs.mem_latency;
return *this;
}
};
std::vector<SimPort<MemReq>> CoreReqPorts;
@@ -52,8 +79,8 @@ public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
Cache(const SimContext& ctx, const char* name, const Config& config);
~Cache();
CacheSim(const SimContext& ctx, const char* name, const Config& config);
~CacheSim();
void reset();

219
sim/simx/cluster.cpp Normal file
View File

@@ -0,0 +1,219 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cluster.h"
using namespace vortex;
// Builds one cluster: a shared L2 cache, the instruction/data L1 cache
// clusters in front of it, one shared memory per core, and the cores
// themselves, then wires all request/response ports together.
// Each core LSU lane goes through an SMemDemux that feeds both the data cache
// and the core's shared memory.
Cluster::Cluster(const SimContext& ctx,
                 uint32_t cluster_id,
                 ProcessorImpl* processor,
                 const Arch &arch,
                 const DCRS &dcrs)
  : SimObject(ctx, "cluster")
  , mem_req_port(this)
  , mem_rsp_port(this)
  , cluster_id_(cluster_id)
  , cores_(arch.num_cores())
  , barriers_(arch.num_barriers(), 0)
  , sharedmems_(arch.num_cores())
  , processor_(processor)
{
  auto num_cores = arch.num_cores();

  char sname[100];
  snprintf(sname, sizeof(sname), "cluster%u-l2cache", cluster_id);
  // The L2 serves whole L1 lines, so its word size (W) is the L1 line size
  // and its associativity (A) is L2_NUM_WAYS, matching the field order used
  // by the icache/dcache configurations below.
  // NOTE(review): 5 inputs are declared but only ports 0 (icache) and
  // 1 (dcache) are bound in this constructor - confirm the intended count.
  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
    !L2_ENABLED,
    log2ceil(L2_CACHE_SIZE),  // C
    log2ceil(MEM_BLOCK_SIZE), // B
    log2ceil(L1_LINE_SIZE),   // W
    log2ceil(L2_NUM_WAYS),    // A
    XLEN,                     // address bits
    L2_NUM_BANKS,             // number of banks
    1,                        // number of ports
    5,                        // number of inputs
    true,                     // write-through
    false,                    // write response
    0,                        // victim size
    L2_MSHR_SIZE,             // mshr
    2,                        // pipeline latency
  });

  l2cache_->MemReqPort.bind(&this->mem_req_port);
  this->mem_rsp_port.bind(&l2cache_->MemRspPort);

  snprintf(sname, sizeof(sname), "cluster%u-icaches", cluster_id);
  icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
    !ICACHE_ENABLED,
    log2ceil(ICACHE_SIZE),      // C
    log2ceil(L1_LINE_SIZE),     // B
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(ICACHE_NUM_WAYS),  // A
    XLEN,                       // address bits
    1,                          // number of banks
    1,                          // number of ports
    1,                          // number of inputs
    true,                       // write-through
    false,                      // write response
    0,                          // victim size
    (uint8_t)arch.num_warps(),  // mshr
    2,                          // pipeline latency
  });

  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);

  snprintf(sname, sizeof(sname), "cluster%u-dcaches", cluster_id);
  dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
    !DCACHE_ENABLED,
    log2ceil(DCACHE_SIZE),     // C
    log2ceil(L1_LINE_SIZE),    // B
    log2ceil(sizeof(Word)),    // W
    log2ceil(DCACHE_NUM_WAYS), // A
    XLEN,                      // address bits
    DCACHE_NUM_BANKS,          // number of banks
    1,                         // number of ports
    DCACHE_NUM_BANKS,          // number of inputs
    true,                      // write-through
    false,                     // write response
    0,                         // victim size
    DCACHE_MSHR_SIZE,          // mshr
    4,                         // pipeline latency
  });

  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);

  ///////////////////////////////////////////////////////////////////////////

  // create shared memory blocks
  for (uint32_t i = 0; i < num_cores; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-shared_mem%u", cluster_id, i);
    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
      (1 << SMEM_LOG_SIZE),
      sizeof(Word),
      NUM_LSU_LANES,
      NUM_LSU_LANES,
      false
    });
  }

  // create cores
  for (uint32_t i = 0; i < num_cores; ++i) {
    // core ids are global across clusters
    uint32_t core_id = cluster_id * num_cores + i;
    cores_.at(i) = Core::Create(core_id,
                                this,
                                arch,
                                dcrs,
                                sharedmems_.at(i));

    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));

    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
      snprintf(sname, sizeof(sname), "cluster%u-smem_demux%u_%u", cluster_id, i, j);
      auto smem_demux = SMemDemux::Create(sname);

      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));

      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);

      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
    }
  }
}
// Nothing to release explicitly: all members clean up after themselves.
Cluster::~Cluster() = default;
void Cluster::reset() {
for (auto& barrier : barriers_) {
barrier.reset();
}
}
// The cluster has no per-cycle work of its own.
void Cluster::tick() {}
// Hands the RAM to every core of the cluster.
void Cluster::attach_ram(RAM* ram) {
  for (size_t i = 0, num_cores = cores_.size(); i < num_cores; ++i) {
    cores_[i]->attach_ram(ram);
  }
}
bool Cluster::running() const {
for (auto& core : cores_) {
if (core->running())
return true;
}
return false;
}
// Polls every core for termination.
// Returns true only when all cores have exited. `*exitcode` always receives
// the bitwise OR of the exit codes of the cores that have exited so far.
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
  Word merged_code = 0;
  bool all_exited = true;
  for (size_t i = 0, num_cores = cores_.size(); i < num_cores; ++i) {
    Word core_code;
    const bool exited = cores_[i]->check_exit(&core_code, riscv_test);
    if (!exited) {
      all_exited = false;
      continue;
    }
    merged_code |= core_code;
  }
  *exitcode = merged_code;
  return all_exited;
}
// Registers core `core_id` (a global id) at barrier `bar_id`. Once `count`
// cores have arrived, every registered core is resumed and the barrier is
// cleared for reuse.
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  auto& barrier = barriers_.at(bar_id);

  // barrier masks are indexed by the core's position inside this cluster
  uint32_t local_core_id = core_id % cores_.size();
  barrier.set(local_core_id);

  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);

  if (barrier.count() == static_cast<size_t>(count)) {
    // resume all suspended cores
    for (uint32_t i = 0; i < cores_.size(); ++i) {
      if (barrier.test(i)) {
        // report the global core id, as the suspend trace above does
        DP(3, "*** Resume core #" << cores_.at(i)->id() << " at barrier #" << bar_id);
        cores_.at(i)->resume();
      }
    }
    barrier.reset();
  }
}
// Returns the processor pointer supplied at construction (not owned).
ProcessorImpl* Cluster::processor() const {
  return this->processor_;
}
// Collects the cluster's cache statistics and the sum of all shared memory
// statistics into a single snapshot.
Cluster::PerfStats Cluster::perf_stats() const {
  Cluster::PerfStats snapshot;
  snapshot.icache  = icaches_->perf_stats();
  snapshot.dcache  = dcaches_->perf_stats();
  snapshot.l2cache = l2cache_->perf_stats();
  for (size_t i = 0, num_smems = sharedmems_.size(); i < num_smems; ++i) {
    snapshot.sharedmem += sharedmems_[i]->perf_stats();
  }
  return snapshot;
}

86
sim/simx/cluster.h Normal file
View File

@@ -0,0 +1,86 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "constants.h"
namespace vortex {
class ProcessorImpl;
// A cluster groups several cores with their caches and shared memories and
// exposes a single memory request/response port pair to the outside.
class Cluster : public SimObject<Cluster> {
public:
// Statistics snapshot of the cluster's memory hierarchy.
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
// Member-wise accumulation, used to sum statistics across clusters.
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
return *this;
}
};
// Memory-side ports of the cluster.
SimPort<MemReq> mem_req_port;
SimPort<MemRsp> mem_rsp_port;
Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch,
const DCRS &dcrs);
~Cluster();
void reset();
void tick();
// Forwards the RAM to every core.
void attach_ram(RAM* ram);
// True while any core is still running.
bool running() const;
// True once all cores have exited; *exitcode receives the OR of their codes.
bool check_exit(Word* exitcode, bool riscv_test) const;
// Inter-core barrier: core_id is the global core id of the arriving core.
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const;
Cluster::PerfStats perf_stats() const;
private:
uint32_t cluster_id_;
std::vector<Core::Ptr> cores_;
// one mask per barrier; bit i is set while local core i waits on it
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
std::vector<SharedMem::Ptr> sharedmems_;
// NOTE(review): tcaches_, ocaches_ and rcaches_ are not referenced anywhere
// in cluster.cpp - they look like leftovers; confirm before removing.
CacheCluster::Ptr tcaches_;
CacheCluster::Ptr ocaches_;
CacheCluster::Ptr rcaches_;
ProcessorImpl* processor_;
};
} // namespace vortex

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef RAM_PAGE_SIZE
@@ -10,14 +23,4 @@
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
namespace vortex {
enum Constants {
SMEM_BANK_OFFSET = log2ceil(sizeof(uint32_t)) + log2ceil(STACK_SIZE / sizeof(uint32_t)),
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
@@ -11,101 +24,104 @@
#include <simobject.h>
#include "debug.h"
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "decode.h"
#include "mem.h"
#include "warp.h"
#include "pipeline.h"
#include "cache.h"
#include "sharedmem.h"
#include "cache_sim.h"
#include "shared_mem.h"
#include "ibuffer.h"
#include "scoreboard.h"
#include "exeunit.h"
#include "tex_unit.h"
#include "operand.h"
#include "dispatcher.h"
#include "exe_unit.h"
#include "dcrs.h"
namespace vortex {
class Cluster;
class Core : public SimObject<Core> {
public:
struct PerfStats {
uint64_t cycles;
uint64_t instrs;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
uint64_t alu_stalls;
uint64_t lsu_stalls;
uint64_t csr_stalls;
uint64_t fpu_stalls;
uint64_t gpu_stalls;
uint64_t sfu_stalls;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
uint64_t branches;
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
uint64_t tex_reads;
uint64_t tex_latency;
uint64_t ifetch_latency;
uint64_t load_latency;
PerfStats()
: instrs(0)
: cycles(0)
, instrs(0)
, ibuf_stalls(0)
, scrb_stalls(0)
, alu_stalls(0)
, lsu_stalls(0)
, csr_stalls(0)
, fpu_stalls(0)
, gpu_stalls(0)
, sfu_stalls(0)
, ifetches(0)
, loads(0)
, stores(0)
, branches(0)
, mem_reads(0)
, mem_writes(0)
, mem_latency(0)
, tex_reads(0)
, tex_latency(0)
, ifetch_latency(0)
, load_latency(0)
{}
};
SimPort<MemRsp> MemRspPort;
SimPort<MemReq> MemReqPort;
std::vector<SimPort<MemReq>> icache_req_ports;
std::vector<SimPort<MemRsp>> icache_rsp_ports;
std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem);
Core(const SimContext& ctx, const ArchDef &arch, uint32_t id);
~Core();
void attach_ram(RAM* ram);
bool running() const;
void reset();
void tick();
void attach_ram(RAM* ram);
bool running() const;
void resume();
uint32_t id() const {
return id_;
return core_id_;
}
const Decoder& decoder() {
return decoder_;
}
const ArchDef& arch() const {
const Arch& arch() const {
return arch_;
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
uint32_t getIRegValue(int reg) const {
return warps_.at(0)->getIRegValue(reg);
const DCRS& dcrs() const {
return dcrs_;
}
uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);
WarpMask wspawn(uint32_t num_warps, uint32_t nextPC);
void wspawn(uint32_t num_warps, Word nextPC);
WarpMask barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
AddrType get_addr_type(uint64_t addr);
void icache_read(void* data, uint64_t addr, uint32_t size);
@@ -113,19 +129,22 @@ public:
void dcache_write(const void* data, uint64_t addr, uint32_t size);
uint32_t tex_read(uint32_t unit, uint32_t lod, uint32_t u, uint32_t v, std::vector<mem_addr_size_t>* mem_addrs);
void dcache_amo_reserve(uint64_t addr);
bool dcache_amo_check(uint64_t addr);
void trigger_ecall();
void trigger_ebreak();
bool check_exit() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
private:
void schedule();
void fetch();
void decode();
void issue();
void execute();
void commit();
@@ -133,49 +152,51 @@ private:
void cout_flush();
uint32_t id_;
const ArchDef arch_;
uint32_t core_id_;
const Arch& arch_;
const DCRS &dcrs_;
const Decoder decoder_;
MemoryUnit mmu_;
RAM smem_;
std::vector<TexUnit> tex_units_;
std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_;
std::vector<uint32_t> csrs_;
std::vector<WarpMask> barriers_;
std::vector<Byte> fcsrs_;
std::vector<IBuffer> ibuffers_;
Scoreboard scoreboard_;
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_;
Cache::Ptr icache_;
Cache::Ptr dcache_;
SharedMem::Ptr shared_mem_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
SharedMem::Ptr sharedmem_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_;
std::vector<pipeline_trace_t*> committed_traces_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint32_t last_schedule_wid_;
uint64_t issued_instrs_;
uint64_t committed_instrs_;
uint32_t csr_tex_unit_;
bool ecall_;
bool ebreak_;
bool exited_;
uint64_t pending_ifetches_;
std::unordered_map<int, std::stringstream> print_bufs_;
std::vector<std::vector<CSRs>> csrs_;
PerfStats perf_stats_;
uint64_t perf_mem_pending_reads_;
Cluster* cluster_;
uint32_t commit_exe_;
friend class Warp;
friend class LsuUnit;
friend class AluUnit;
friend class CsrUnit;
friend class FpuUnit;
friend class GpuUnit;
friend class SfuUnit;
};
} // namespace vortex
} // namespace vortex

28
sim/simx/dcrs.cpp Normal file
View File

@@ -0,0 +1,28 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dcrs.h"
#include <iostream>
using namespace vortex;
// Writes a device configuration register.
// Addresses in [VX_DCR_BASE_STATE_BEGIN, VX_DCR_BASE_STATE_END) go to the base
// register bank; any other address is reported and aborts the process.
void DCRS::write(uint32_t addr, uint32_t value) {
  const bool in_base_range = (addr >= VX_DCR_BASE_STATE_BEGIN)
                          && (addr < VX_DCR_BASE_STATE_END);
  if (!in_base_range) {
    std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl;
    std::abort();
  }
  base_dcrs.write(addr, value);
}

45
sim/simx/dcrs.h Normal file
View File

@@ -0,0 +1,45 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <util.h>
#include <VX_types.h>
#include <array>
namespace vortex {
// Storage for the base device configuration registers.
// Addresses are translated to a state index with VX_DCR_BASE_STATE(); an
// out-of-range index makes std::array::at throw std::out_of_range.
class BaseDCRS {
public:
  // Returns the register selected by `addr` (zero if it was never written).
  uint32_t read(uint32_t addr) const {
    uint32_t state = VX_DCR_BASE_STATE(addr);
    return states_.at(state);
  }

  // Stores `value` into the register selected by `addr`.
  void write(uint32_t addr, uint32_t value) {
    uint32_t state = VX_DCR_BASE_STATE(addr);
    states_.at(state) = value;
  }

private:
  // value-initialized so that reading an unwritten register is well-defined
  std::array<uint32_t, VX_DCR_BASE_STATE_COUNT> states_{};
};
// Top-level device configuration register file.
class DCRS {
public:
// Dispatches a write to the register bank owning `addr`; aborts the process
// on an address outside every known range (see dcrs.cpp).
void write(uint32_t addr, uint32_t value);
// Base register bank, read directly by its users.
BaseDCRS base_dcrs;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef DEBUG_LEVEL

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <string>
#include <stdlib.h>
@@ -9,41 +22,36 @@
#include "debug.h"
#include "types.h"
#include "decode.h"
#include "archdef.h"
#include "arch.h"
#include "instr.h"
using namespace vortex;
struct InstTableEntry_t {
bool controlFlow;
InstType iType;
};
static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable = {
{Opcode::NOP, {false, InstType::N_TYPE}},
{Opcode::R_INST, {false, InstType::R_TYPE}},
{Opcode::L_INST, {false, InstType::I_TYPE}},
{Opcode::I_INST, {false, InstType::I_TYPE}},
{Opcode::S_INST, {false, InstType::S_TYPE}},
{Opcode::B_INST, {true , InstType::B_TYPE}},
{Opcode::LUI_INST, {false, InstType::U_TYPE}},
{Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
{Opcode::JAL_INST, {true , InstType::J_TYPE}},
{Opcode::JALR_INST, {true , InstType::I_TYPE}},
{Opcode::SYS_INST, {true , InstType::I_TYPE}},
{Opcode::FENCE, {true , InstType::I_TYPE}},
{Opcode::FL, {false, InstType::I_TYPE}},
{Opcode::FS, {false, InstType::S_TYPE}},
{Opcode::FCI, {false, InstType::R_TYPE}},
{Opcode::FMADD, {false, InstType::R4_TYPE}},
{Opcode::FMSUB, {false, InstType::R4_TYPE}},
{Opcode::FMNMADD, {false, InstType::R4_TYPE}},
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::GPU, {false, InstType::R4_TYPE}},
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
{Opcode::I_INST_W, {false, InstType::I_TYPE}},
static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::R_INST, InstType::R_TYPE},
{Opcode::L_INST, InstType::I_TYPE},
{Opcode::I_INST, InstType::I_TYPE},
{Opcode::S_INST, InstType::S_TYPE},
{Opcode::B_INST, InstType::B_TYPE},
{Opcode::LUI_INST, InstType::U_TYPE},
{Opcode::AUIPC_INST, InstType::U_TYPE},
{Opcode::JAL_INST, InstType::J_TYPE},
{Opcode::JALR_INST, InstType::I_TYPE},
{Opcode::SYS_INST, InstType::I_TYPE},
{Opcode::FENCE, InstType::I_TYPE},
{Opcode::AMO, InstType::R_TYPE},
{Opcode::FL, InstType::I_TYPE},
{Opcode::FS, InstType::S_TYPE},
{Opcode::FCI, InstType::R_TYPE},
{Opcode::FMADD, InstType::R4_TYPE},
{Opcode::FMSUB, InstType::R4_TYPE},
{Opcode::FMNMADD, InstType::R4_TYPE},
{Opcode::FMNMSUB, InstType::R4_TYPE},
{Opcode::VSET, InstType::V_TYPE},
{Opcode::EXT1, InstType::R_TYPE},
{Opcode::EXT2, InstType::R4_TYPE},
{Opcode::R_INST_W, InstType::R_TYPE},
{Opcode::I_INST_W, InstType::I_TYPE},
};
enum Constants {
@@ -58,6 +66,8 @@ enum Constants {
width_i_imm = 12,
width_j_imm = 20,
width_v_imm = 11,
width_aq = 1,
width_rl = 1,
shift_opcode= 0,
shift_rd = width_opcode,
@@ -72,15 +82,15 @@ enum Constants {
shift_func6 = shift_func7 + width_vmask,
shift_vset = shift_func7 + width_func6,
mask_opcode = (1<<width_opcode)-1,
mask_reg = (1<<width_reg)-1,
mask_func2 = (1<<width_func2)-1,
mask_func3 = (1<<width_func3)-1,
mask_func6 = (1<<width_func6)-1,
mask_func7 = (1<<width_func7)-1,
mask_i_imm = (1<<width_i_imm)-1,
mask_j_imm = (1<<width_j_imm)-1,
mask_v_imm = (1<<width_v_imm)-1,
mask_opcode = (1 << width_opcode) - 1,
mask_reg = (1 << width_reg) - 1,
mask_func2 = (1 << width_func2) - 1,
mask_func3 = (1 << width_func3) - 1,
mask_func6 = (1 << width_func6) - 1,
mask_func7 = (1 << width_func7) - 1,
mask_i_imm = (1 << width_i_imm) - 1,
mask_j_imm = (1 << width_j_imm) - 1,
mask_v_imm = (1 << width_v_imm) - 1,
};
static const char* op_string(const Instr &instr) {
@@ -92,7 +102,6 @@ static const char* op_string(const Instr &instr) {
auto imm = instr.getImm();
switch (opcode) {
case Opcode::NOP: return "NOP";
case Opcode::LUI_INST: return "LUI";
case Opcode::AUIPC_INST: return "AUIPC";
case Opcode::R_INST:
@@ -116,7 +125,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLT";
case 3: return "SLTU";
case 4: return "XOR";
case 5: return func7 ? "SRA" : "SRL";
case 5: return (func7 & 0x20) ? "SRA" : "SRL";
case 6: return "OR";
case 7: return "AND";
default:
@@ -130,7 +139,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLTI";
case 3: return "SLTIU";
case 4: return "XORI";
case 5: return func7 ? "SRAI" : "SRLI";
case 5: return (func7 & 0x20) ? "SRAI" : "SRLI";
case 6: return "ORI";
case 7: return "ANDI";
default:
@@ -151,8 +160,8 @@ static const char* op_string(const Instr &instr) {
case Opcode::JALR_INST: return "JALR";
case Opcode::L_INST:
switch (func3) {
case 0: return "LBI";
case 1: return "LHI";
case 0: return "LB";
case 1: return "LH";
case 2: return "LW";
case 3: return "LD";
case 4: return "LBU";
@@ -192,11 +201,11 @@ static const char* op_string(const Instr &instr) {
}
case Opcode::I_INST_W:
switch (func3) {
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
}
case Opcode::SYS_INST:
switch (func3) {
@@ -222,20 +231,59 @@ static const char* op_string(const Instr &instr) {
case Opcode::FENCE: return "FENCE";
case Opcode::FL:
switch (func3) {
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
}
case Opcode::FS:
switch (func3) {
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
default:
std::abort();
}
case Opcode::AMO: {
auto amo_type = func7 >> 2;
switch (func3) {
case 0x2:
switch (amo_type) {
case 0x00: return "AMOADD.W";
case 0x01: return "AMOSWAP.W";
case 0x02: return "LR.W";
case 0x03: return "SC.W";
case 0x04: return "AMOXOR.W";
case 0x08: return "AMOOR.W";
case 0x0c: return "AMOAND.W";
case 0x10: return "AMOMIN.W";
case 0x14: return "AMOMAX.W";
case 0x18: return "AMOMINU.W";
case 0x1c: return "AMOMAXU.W";
default:
std::abort();
}
case 0x3:
switch (amo_type) {
case 0x00: return "AMOADD.D";
case 0x01: return "AMOSWAP.D";
case 0x02: return "LR.D";
case 0x03: return "SC.D";
case 0x04: return "AMOXOR.D";
case 0x08: return "AMOOR.D";
case 0x0c: return "AMOAND.D";
case 0x10: return "AMOMIN.D";
case 0x14: return "AMOMAX.D";
case 0x18: return "AMOMINU.D";
case 0x1c: return "AMOMAXU.D";
default:
std::abort();
}
default:
std::abort();
}
}
case Opcode::FCI:
switch (func7) {
case 0x00: return "FADD.S";
@@ -332,9 +380,9 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.W";
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.S";
case 0x71: return func3 ? "FCLASS.D" : "FMV.X.D";
case 0x78: return "FMV.W.X";
case 0x78: return "FMV.S.X";
case 0x79: return "FMV.D.X";
default:
std::abort();
@@ -344,23 +392,27 @@ static const char* op_string(const Instr &instr) {
case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
case Opcode::VSET: return "VSET";
case Opcode::GPGPU:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PREFETCH";
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PRED";
default:
std::abort();
}
default:
std::abort();
}
case Opcode::GPU:
case Opcode::EXT2:
switch (func3) {
case 0: return "TEX";
case 1: {
switch (func2) {
case 0: return "CMOV";
case 0: return "CMOV";
default:
std::abort();
}
@@ -375,43 +427,36 @@ static const char* op_string(const Instr &instr) {
namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
auto opcode = instr.getOpcode();
auto func2 = instr.getFunc2();
auto opcode = instr.getOpcode();
auto func3 = instr.getFunc3();
os << op_string(instr) << ": ";
if (opcode == S_INST
|| opcode == FS) {
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
} else
if (opcode == L_INST
|| opcode == FL) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
} else {
if (instr.getRDType() != RegType::None) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
}
uint32_t i = 0;
for (; i < instr.getNRSrc(); ++i) {
if (i) os << ", ";
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (i) os << ", ";
os << "imm=0x" << std::hex << instr.getImm();
}
if (opcode == GPU && func3 == 0) {
os << ", unit=" << std::dec << func2;
}
os << op_string(instr);
int sep = 0;
if (instr.getRDType() != RegType::None) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRDType() << std::dec << instr.getRDest();
}
for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {
if (instr.getRSType(i) == RegType::None)
continue;
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getImm();
}
if (opcode == Opcode::SYS_INST && func3 >= 5) {
// CSRs with immediate values
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getRSrc(0);
}
return os;
}
}
Decoder::Decoder(const ArchDef&) {}
Decoder::Decoder(const Arch&) {}
std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
auto instr = std::make_shared<Instr>();
@@ -434,7 +479,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
return nullptr;
}
auto iType = op_it->second.iType;
auto iType = op_it->second;
if (op == Opcode::FL || op == Opcode::FS) {
if (func3 != 0x2 && func3 != 0x3) {
iType = InstType::V_TYPE;
@@ -442,57 +487,88 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
switch (iType) {
case InstType::N_TYPE:
break;
case InstType::R_TYPE:
if (op == Opcode::FCI) {
switch (func7) {
switch (op) {
case Opcode::FCI:
switch (func7) {
case 0x2c: // FSQRT.S
case 0x2d: // FSQRT.D
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x50: // FLE.S, FLT.S, FEQ.S
case 0x51: // FLE.D, FLT.D, FEQ.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
case 0x60: // FCVT.W.D, FCVT.WU.D, FCVT.L.D, FCVT.LU.D
case 0x61: // FCVT.WU.S, FCVT.W.S, FCVT.L.S, FCVT.LU.S
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x68: // FCVT.S.W, FCVT.S.WU, FCVT.S.L, FCVT.S.LU
case 0x69: // FCVT.D.W, FCVT.D.WU, FCVT.D.L, FCVT.D.LU
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x70: // FCLASS.S, FMV.X.W
case 0x70: // FCLASS.S, FMV.X.S
case 0x71: // FCLASS.D, FMV.X.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x78: // FMV.W.X
case 0x78: // FMV.S.X
case 0x79: // FMV.D.X
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
}
} else {
break;
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: // TMC
case 3: // JOIN
instr->addSrcReg(rs1, RegType::Integer);
break;
case 1: // WSPAWN
case 4: // BAR
case 5: // PRED
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
case 2: // SPLIT
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
break;
default:
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
}
instr->setFunc3(func3);
instr->setFunc7(func7);
break;
case InstType::I_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FL) {
instr->setDestReg(rd, RegType::Float);
} else {
@@ -503,15 +579,23 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
switch (op) {
case Opcode::SYS_INST:
if (func3 != 0) {
// RV32I: CSR*
instr->setDestReg(rd, RegType::Integer);
}
// RV32I: CSR
if (func3 >= 5) {
// rs1 holds zimm
instr->setSrcReg(0, rs1, RegType::None);
}
} else {
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
}
// uint12
instr->setImm(code >> shift_rs2);
break;
case Opcode::FENCE:
// uint12
instr->setImm(code >> shift_rs2);
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
break;
case Opcode::I_INST:
case Opcode::I_INST_W:
@@ -538,11 +622,11 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
} break;
case InstType::S_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FS) {
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
} else {
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
}
instr->setFunc3(func3);
auto imm = (func7 << width_reg) | rd;
@@ -550,8 +634,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
} break;
case InstType::B_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->setFunc3(func3);
auto bit_11 = rd & 0x1;
auto bits_4_1 = rd >> 1;
@@ -581,8 +665,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case InstType::V_TYPE:
switch (op) {
case Opcode::VSET: {
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setFunc3(func3);
if (func3 == 7) {
instr->setImm(!(code >> shift_vset));
@@ -593,20 +677,20 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
instr->setVediv((immed >> 4) & 0x3);
instr->setVsew((immed >> 2) & 0x3);
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
}
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setFunc6(func6);
}
} break;
case Opcode::FL:
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -614,9 +698,9 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case Opcode::FS:
instr->setVs3(rd);
instr->setSrcVReg(rs1);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -627,16 +711,28 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
break;
case R4_TYPE:
if (op == Opcode::GPU) {
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->setSrcReg(rs3, RegType::Integer);
if (op == Opcode::EXT2) {
switch (func3) {
case 1:
switch (func2) {
case 0: // CMOV
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs3, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
} else {
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->setSrcReg(rs3, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs3, RegType::Float);
}
instr->setFunc2(func2);
instr->setFunc3(func3);

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
@@ -5,12 +18,12 @@
namespace vortex {
class ArchDef;
class Arch;
class Instr;
class Decoder {
public:
Decoder(const ArchDef &);
Decoder(const Arch &);
std::shared_ptr<Instr> decode(uint32_t code) const;
};

141
sim/simx/dispatcher.h Normal file
View File

@@ -0,0 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
// Forwards buffered instruction traces to the per-issue-slot output ports.
//
// Issue slots are served in batches of `block_size` slots; the active batch
// only advances once every slot of the batch has either nothing to send or
// has finished its current trace. When the architecture has more threads
// than `num_lanes`, a trace is split into several packets of `num_lanes`
// threads each (identified by `pid`, bracketed by the `sop`/`eop` flags).
class Dispatcher : public SimObject<Dispatcher> {
public:
  // One output port per issue slot.
  std::vector<SimPort<pipeline_trace_t*>> Outputs;

  Dispatcher(const SimContext& ctx, const Arch& arch, uint32_t buf_size, uint32_t block_size, uint32_t num_lanes)
    : SimObject<Dispatcher>(ctx, "Dispatcher")
    , Outputs(ISSUE_WIDTH, this)
    , Inputs_(ISSUE_WIDTH, this)
    , arch_(arch)
    , queues_(ISSUE_WIDTH, std::queue<pipeline_trace_t*>())
    , buf_size_(buf_size)
    , block_size_(block_size)
    , num_lanes_(num_lanes)
    , batch_count_(ISSUE_WIDTH / block_size)
    , pid_count_(arch.num_threads() / num_lanes)
    , batch_idx_(0)
    , start_p_(block_size, 0)
  {}

  virtual ~Dispatcher() {}

  // Restart from the first batch with every block at packet 0.
  virtual void reset() {
    batch_idx_ = 0;
    for (auto& next_packet : start_p_) {
      next_packet = 0;
    }
  }

  // Advance the dispatcher by one cycle.
  virtual void tick() {
    // Move at most one buffered trace per issue slot into its input port.
    for (uint32_t slot = 0; slot < ISSUE_WIDTH; ++slot) {
      auto& pending = queues_.at(slot);
      if (!pending.empty()) {
        auto head = pending.front();
        Inputs_.at(slot).send(head, 1);
        pending.pop();
      }
    }

    // Serve every slot of the active batch.
    uint32_t blocks_done = 0;
    for (uint32_t b = 0; b < block_size_; ++b) {
      uint32_t slot = batch_idx_ * block_size_ + b;
      auto& input = Inputs_.at(slot);
      if (input.empty()) {
        // nothing to dispatch on this slot
        ++blocks_done;
        continue;
      }
      auto& output = Outputs.at(slot);
      auto trace = input.front();

      if (pid_count_ == 1) {
        // The whole warp fits in one packet: forward the trace unchanged.
        trace->pid = 0;
        input.pop();
        output.send(trace, 1);
        DT(3, "pipeline-dispatch: " << *trace);
        ++blocks_done;
        continue;
      }

      auto first_packet = start_p_.at(b);
      if (first_packet == -1) {
        // This block already sent its last packet; wait for the rest of the batch.
        ++blocks_done;
        continue;
      }

      // Locate the first and last active threads at or after the resume point.
      // NOTE(review): if no thread is active in that range `start` stays -1 and
      // the packet index computed below is invalid - presumably callers never
      // push a trace with an empty thread mask; confirm.
      int start(-1), end(-1);
      for (uint32_t tid = first_packet * num_lanes_, total = arch_.num_threads(); tid < total; ++tid) {
        if (trace->tmask.test(tid)) {
          if (start == -1)
            start = tid;
          end = tid;
        }
      }
      // Convert thread indices into packet indices.
      start /= num_lanes_;
      end /= num_lanes_;

      // Build a packet that carries only the lanes of packet `start`.
      auto new_trace = new pipeline_trace_t(*trace);
      new_trace->tmask.reset();
      for (int lane = start * num_lanes_, last = lane + num_lanes_; lane < last; ++lane) {
        new_trace->tmask[lane] = trace->tmask[lane];
      }
      new_trace->pid = start;
      new_trace->sop = (first_packet == 0);

      if (start != end) {
        // More active packets follow: resume after this one on the next tick.
        new_trace->eop = 0;
        start_p_.at(b) = start + 1;
      } else {
        // Last packet: retire the original trace, the copy carries it on.
        new_trace->eop = 1;
        start_p_.at(b) = -1;
        input.pop();
        ++blocks_done;
        delete trace;
      }
      output.send(new_trace, 1);
      DT(3, "pipeline-dispatch: " << *new_trace);
    }

    // Once every block of the batch is done, rotate to the next batch.
    if (blocks_done == block_size_) {
      batch_idx_ = (batch_idx_ + 1) % batch_count_;
      for (auto& next_packet : start_p_) {
        next_packet = 0;
      }
    }
  }

  // Buffer a trace for the given issue slot.
  // Returns false, leaving the trace untouched, when the slot's buffer
  // already holds `buf_size` entries.
  bool push(uint32_t issue_index, pipeline_trace_t* trace) {
    auto& pending = queues_.at(issue_index);
    if (pending.size() >= buf_size_)
      return false;
    pending.push(trace);
    return true;
  }

private:
  std::vector<SimPort<pipeline_trace_t*>> Inputs_;    // internal ports fed from queues_
  const Arch& arch_;
  std::vector<std::queue<pipeline_trace_t*>> queues_; // per-slot buffers filled by push()
  uint32_t buf_size_;     // capacity of each per-slot buffer
  uint32_t block_size_;   // issue slots served per batch
  uint32_t num_lanes_;    // threads carried by one packet
  uint32_t batch_count_;  // ISSUE_WIDTH / block_size
  uint32_t pid_count_;    // packets per full warp
  uint32_t batch_idx_;    // batch currently being served
  std::vector<int> start_p_; // per block: next packet to send, -1 once done
};
}

329
sim/simx/exe_unit.cpp Normal file
View File

@@ -0,0 +1,329 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exe_unit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
#include "cache_sim.h"
using namespace vortex;
// Creates the ALU execution unit, registered under the name "ALU".
AluUnit::AluUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "ALU")
{}
void AluUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::IMUL:
output.send(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
output.send(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
}
}
///////////////////////////////////////////////////////////////////////////////
// Creates the floating-point execution unit, registered under the name "FPU".
FpuUnit::FpuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "FPU")
{}
void FpuUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.send(trace, 2);
break;
case FpuType::FMA:
output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
auto time = input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
///////////////////////////////////////////////////////////////////////////////
// Creates the load/store unit, registered under the name "LSU".
// The pending-request table holds LSUQ_SIZE entries and requests are issued
// over NUM_LSU_LANES lanes. Every member is initialized (in declaration
// order), including the fence trace pointer, which is only meaningful while
// fence_lock_ is set.
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "LSU")
  , pending_rd_reqs_(LSUQ_SIZE)
  , num_lanes_(NUM_LSU_LANES)
  , fence_state_(nullptr)
  , pending_loads_(0)
  , fence_lock_(false)
  , input_idx_(0)
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
pending_loads_ = 0;
fence_lock_ = false;
}
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type
<< ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
--pending_loads_;
}
// handle shared memory response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
--pending_loads_;
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
int iw = fence_state_->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
auto t0 = trace->pid * num_lanes_;
if (trace->lsu_type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break;
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
}
break;
} else {
trace->log_once(false);
}
bool is_write = (trace->lsu_type == LsuType::STORE);
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(t0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
uint32_t addr_count;
if (is_dup) {
addr_count = 1;
} else {
addr_count = trace->tmask.count();
}
auto tag = pending_rd_reqs_.allocate({trace, addr_count});
for (uint32_t t = 0; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto& dcache_req_port = core_->dcache_req_ports.at(t);
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = core_->get_addr_type(mem_addr.addr);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 2);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
++pending_loads_;
++core_->perf_stats_.loads;
if (is_dup)
break;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
output.send(trace, 1);
++core_->perf_stats_.stores;
}
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break; // single block
}
++input_idx_;
}
///////////////////////////////////////////////////////////////////////////////
// Creates the special-function unit, registered under the name "SFU";
// input arbitration starts at issue slot 0.
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "SFU"),
    input_idx_(0)
{}
void SfuUnit::tick() {
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.send(trace, 1);
break;
case SfuType::BAR: {
output.send(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.send(trace, 3);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
auto stalls = (SimPlatform::instance().cycles() - time);
core_->perf_stats_.sfu_stalls += stalls;
break; // single block
}
++input_idx_;
}

View File

@@ -1,8 +1,21 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "pipeline.h"
#include "cache.h"
#include "cache_sim.h"
namespace vortex {
@@ -10,13 +23,13 @@ class Core;
class ExeUnit : public SimObject<ExeUnit> {
public:
SimPort<pipeline_trace_t*> Input;
SimPort<pipeline_trace_t*> Output;
std::vector<SimPort<pipeline_trace_t*>> Inputs;
std::vector<SimPort<pipeline_trace_t*>> Outputs;
ExeUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<ExeUnit>(ctx, name)
, Input(this)
, Output(this)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
@@ -32,32 +45,6 @@ protected:
///////////////////////////////////////////////////////////////////////////////
class NopUnit : public ExeUnit {
public:
NopUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
pipeline_trace_t* fence_state_;
bool fence_lock_;
public:
LsuUnit(const SimContext& ctx, Core*);
void reset();
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class AluUnit : public ExeUnit {
public:
AluUnit(const SimContext& ctx, Core*);
@@ -67,15 +54,6 @@ public:
///////////////////////////////////////////////////////////////////////////////
class CsrUnit : public ExeUnit {
public:
CsrUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public ExeUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
@@ -85,19 +63,37 @@ public:
///////////////////////////////////////////////////////////////////////////////
class GpuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(pipeline_trace_t* trace);
class LsuUnit : public ExeUnit {
public:
GpuUnit(const SimContext& ctx, Core*);
LsuUnit(const SimContext& ctx, Core*);
void reset();
void tick();
private:
struct pending_req_t {
pipeline_trace_t* trace;
uint32_t count;
};
HashTable<pending_req_t> pending_rd_reqs_;
uint32_t num_lanes_;
pipeline_trace_t* fence_state_;
uint64_t pending_loads_;
bool fence_lock_;
uint32_t input_idx_;
};
///////////////////////////////////////////////////////////////////////////////
class SfuUnit : public ExeUnit {
public:
SfuUnit(const SimContext& ctx, Core*);
void tick();
private:
uint32_t input_idx_;
};
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,383 +0,0 @@
#include "exeunit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
using namespace vortex;
// Creates the no-op execution unit, registered under the name "NOP".
NopUnit::NopUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "NOP")
{}
// Passes the trace at the head of the input port straight to the output
// port with a delay of 1; does nothing when the input port is empty.
void NopUnit::tick() {
  if (!Input.empty()) {
    auto head = Input.front();
    Output.send(head, 1);
    Input.pop();
  }
}
///////////////////////////////////////////////////////////////////////////////
// Creates the load/store unit, registered under the name "LSU".
// The lane count is taken from the core's architecture and the
// pending-request table holds LSUQ_SIZE entries. The fence trace pointer is
// initialized as well (in declaration order); it is only meaningful while
// fence_lock_ is set.
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "LSU")
  , num_threads_(core->arch().num_threads())
  , pending_rd_reqs_(LSUQ_SIZE)
  , fence_state_(nullptr)
  , fence_lock_(false)
{}
// Clears the table of outstanding requests and releases any pending fence.
void LsuUnit::reset() {
  pending_rd_reqs_.clear();
  fence_lock_ = false;
}
// Advances the load/store unit by one cycle:
//  1. consumes at most one response per thread from the data-cache response
//     ports and from the shared-memory outputs; a trace is sent to Output
//     once its remaining-response counter reaches zero;
//  2. if a fence is locked, waits until no request is pending, then forwards
//     the fence trace and unlocks;
//  3. takes the trace at the head of Input and issues its memory requests.
void LsuUnit::tick() {
  // handle dcache response
  for (uint32_t t = 0; t < num_threads_; ++t) {
    auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
    if (dcache_rsp_port.empty())
      continue;
    auto& mem_rsp = dcache_rsp_port.front();
    auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
    auto trace = entry.first;
    DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
      << ", tid=" << t << ", " << *trace);
    assert(entry.second);
    --entry.second; // track remaining blocks
    if (0 == entry.second) {
      Output.send(trace, 1);
      pending_rd_reqs_.release(mem_rsp.tag);
    }
    dcache_rsp_port.pop();
  }

  // handle shared memory response
  for (uint32_t t = 0; t < num_threads_; ++t) {
    auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
    if (smem_rsp_port.empty())
      continue;
    auto& mem_rsp = smem_rsp_port.front();
    auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
    auto trace = entry.first;
    DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
      << ", tid=" << t << ", " << *trace);
    assert(entry.second);
    --entry.second; // track remaining blocks
    if (0 == entry.second) {
      Output.send(trace, 1);
      pending_rd_reqs_.release(mem_rsp.tag);
    }
    smem_rsp_port.pop();
  }

  if (fence_lock_) {
    // wait for all pending memory operations to complete
    if (!pending_rd_reqs_.empty())
      return;
    Output.send(fence_state_, 1);
    fence_lock_ = false;
    // log the trace itself (not its address), matching the "fence-lock" message
    DT(3, "fence-unlock: " << *fence_state_);
  }

  // check input queue
  if (Input.empty())
    return;

  auto trace = Input.front();

  if (trace->lsu.type == LsuType::FENCE) {
    // schedule fence lock
    fence_state_ = trace;
    fence_lock_ = true;
    DT(3, "fence-lock: " << *trace);
    // remove input
    auto time = Input.pop();
    core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
    return;
  }

  // check pending queue capacity
  if (pending_rd_reqs_.full()) {
    if (!trace->suspend()) {
      DT(3, "*** lsu-queue-stall: " << *trace);
    }
    return;
  } else {
    trace->resume();
  }

  bool is_write = (trace->lsu.type == LsuType::STORE);

  // duplicates detection: only attempted when thread 0 is active; the access
  // is a duplicate when every active thread targets the same 4-byte-aligned
  // address as thread 0.
  bool is_dup = false;
  if (trace->tmask.test(0)) {
    uint64_t addr_mask = sizeof(uint32_t)-1;
    // keep the full width of the masked address: narrowing it to 32 bits
    // would let addresses that differ only in their upper bits compare equal
    auto addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
    uint32_t matches = 1;
    for (uint32_t t = 1; t < num_threads_; ++t) {
      if (!trace->tmask.test(t))
        continue;
      auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
      matches += (addr0 == mem_addr);
    }
    is_dup = (matches == trace->tmask.count());
  }

  // number of responses to wait for before the trace completes
  uint32_t valid_addrs = 0;
  if (is_dup) {
    valid_addrs = 1;
  } else {
    for (auto& mem_addr : trace->mem_addrs) {
      valid_addrs += mem_addr.size();
    }
  }

  auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});

  for (uint32_t t = 0; t < num_threads_; ++t) {
    if (!trace->tmask.test(t))
      continue;

    auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
    auto mem_addr = trace->mem_addrs.at(t).at(0);
    auto type = get_addr_type(mem_addr.addr, mem_addr.size);

    MemReq mem_req;
    mem_req.addr = mem_addr.addr;
    mem_req.write = is_write;
    mem_req.non_cacheable = (type == AddrType::IO);
    mem_req.tag = tag;
    mem_req.core_id = trace->cid;
    mem_req.uuid = trace->uuid;

    if (type == AddrType::Shared) {
      core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
      DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
        << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
    } else {
      dcache_req_port.send(mem_req, 2);
      DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
        << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
    }

    // a duplicate access is served by a single request
    if (is_dup)
      break;
  }

  // do not wait on writes
  if (is_write) {
    pending_rd_reqs_.release(tag);
    Output.send(trace, 1);
  }

  // remove input
  auto time = Input.pop();
  core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
// Builds the ALU unit; the ExeUnit base is given the unit name "ALU".
AluUnit::AluUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "ALU")
{}
// Processes the trace at the head of Input, if any: picks an output delay
// from the ALU operation type, forwards the trace, clears the warp's bit in
// stalled_warps_ when the trace carries fetch_stall, and accounts the value
// derived from Input.pop() into alu_stalls. Aborts on an unknown type.
void AluUnit::tick() {
  if (Input.empty())
    return;

  auto trace = Input.front();

  // delay handed to Output.send() for this operation
  int latency = 0;
  switch (trace->alu.type) {
  case AluType::ARITH:
  case AluType::BRANCH:
  case AluType::SYSCALL:
  case AluType::CMOV:
    latency = 1;
    break;
  case AluType::IMUL:
    latency = LATENCY_IMUL+1;
    break;
  case AluType::IDIV:
    latency = XLEN+1;
    break;
  default:
    std::abort();
  }
  Output.send(trace, latency);

  DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);

  if (trace->fetch_stall) {
    core_->stalled_warps_.reset(trace->wid);
  }

  auto popped = Input.pop();
  auto now = SimPlatform::instance().cycles();
  core_->perf_stats_.alu_stalls += (now - popped);
}
///////////////////////////////////////////////////////////////////////////////
// Builds the CSR unit; the ExeUnit base is given the unit name "CSR".
CsrUnit::CsrUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "CSR")
{}
// Forwards the trace at the head of Input to Output with a delay of 1,
// removes it from Input and adds (current cycle - value returned by pop())
// to the csr_stalls counter. Does nothing when Input is empty.
void CsrUnit::tick() {
  if (!Input.empty()) {
    auto csr_trace = Input.front();
    Output.send(csr_trace, 1);
    auto popped = Input.pop();
    auto now = SimPlatform::instance().cycles();
    core_->perf_stats_.csr_stalls += (now - popped);
    DT(3, "pipeline-execute: op=CSR, " << *csr_trace);
  }
}
///////////////////////////////////////////////////////////////////////////////
// Builds the FPU unit; the ExeUnit base is given the unit name "FPU".
FpuUnit::FpuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "FPU")
{}
// Processes the trace at the head of Input, if any: picks an output delay
// from the FPU operation type, forwards the trace and accounts the value
// derived from Input.pop() into fpu_stalls. Aborts on an unknown type.
void FpuUnit::tick() {
  if (Input.empty())
    return;

  auto trace = Input.front();

  // delay handed to Output.send() for this operation
  int latency = 0;
  switch (trace->fpu.type) {
  case FpuType::FNCP:
    latency = 2;
    break;
  case FpuType::FMA:
    latency = LATENCY_FMA+1;
    break;
  case FpuType::FDIV:
    latency = LATENCY_FDIV+1;
    break;
  case FpuType::FSQRT:
    latency = LATENCY_FSQRT+1;
    break;
  case FpuType::FCVT:
    latency = LATENCY_FCVT+1;
    break;
  default:
    std::abort();
  }
  Output.send(trace, latency);

  DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);

  auto popped = Input.pop();
  auto now = SimPlatform::instance().cycles();
  core_->perf_stats_.fpu_stalls += (now - popped);
}
///////////////////////////////////////////////////////////////////////////////
// Builds the GPU unit: the ExeUnit base is given the unit name "GPU", the
// thread count comes from the core's architecture description and the
// pending texture-request table is sized with TEXQ_SIZE.
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
  : ExeUnit(ctx, core, "GPU"),
    num_threads_(core->arch().num_threads()),
    pending_tex_reqs_(TEXQ_SIZE) {
}
void GpuUnit::reset() {
pending_tex_reqs_.clear();
}
// Advances the GPU unit by one cycle.
// With EXT_TEX_ENABLE defined, first consumes at most one response per thread
// from response port 1 of the data-cache switches and completes a texture
// trace once its remaining-response counter reaches zero.
// Then handles the trace at the head of Input according to its GPU operation
// type; the trace is popped only when it was issued (a TEX request may be
// refused by processTexRequest and retried on a later tick).
void GpuUnit::tick() {
#ifdef EXT_TEX_ENABLE
  // handle memory response
  for (uint32_t tid = 0; tid < num_threads_; ++tid) {
    auto& rsp_port = core_->dcache_switch_.at(tid)->RspOut.at(1);
    if (!rsp_port.empty()) {
      auto& rsp = rsp_port.front();
      auto& pending = pending_tex_reqs_.at(rsp.tag);
      auto tex_trace = pending.first;
      DT(3, "tex-rsp: tag=" << rsp.tag << ", tid=" << tid << ", " << *tex_trace);
      assert(pending.second);
      --pending.second; // track remaining blocks
      if (0 == pending.second) {
        Output.send(tex_trace, 1);
        pending_tex_reqs_.release(rsp.tag);
      }
      rsp_port.pop();
    }
  }
#endif

  // check input queue
  if (Input.empty())
    return;

  auto trace = Input.front();

  // every operation except TEX is always issued
  bool issued = true;

  switch (trace->gpu.type) {
  case GpuType::TMC:
    Output.send(trace, 1);
    core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
    break;
  case GpuType::WSPAWN:
    Output.send(trace, 1);
    core_->active_warps_ = trace->gpu.active_warps;
    break;
  case GpuType::SPLIT:
  case GpuType::JOIN:
    Output.send(trace, 1);
    break;
  case GpuType::BAR:
    Output.send(trace, 1);
    if (trace->gpu.active_warps != 0) {
      core_->active_warps_ |= trace->gpu.active_warps;
    } else {
      core_->active_warps_.reset(trace->wid);
    }
    break;
  case GpuType::TEX:
    issued = this->processTexRequest(trace);
    break;
  default:
    std::abort();
  }

  if (!issued)
    return;

  DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
  if (trace->fetch_stall) {
    core_->stalled_warps_.reset(trace->wid);
  }
  auto popped = Input.pop();
  // NOTE(review): this accumulates into fpu_stalls although this is the GPU
  // unit; looks like a copy/paste slip - confirm which counter is intended.
  core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - popped);
}
// Tries to issue the memory requests of a texture trace.
// Returns false (and marks the trace suspended) when the pending-request
// table is full; otherwise allocates one tag for the trace, sends one request
// per recorded address of every active thread to request port 1 of that
// thread's data-cache switch, and returns true.
bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {
  // check pending queue capacity
  if (pending_tex_reqs_.full()) {
    if (!trace->suspend()) {
      DT(3, "*** tex-queue-stall: " << *trace);
    }
    return false;
  } else {
    trace->resume();
  }

  // send memory request

  // number of responses to wait for before the trace completes
  uint32_t valid_addrs = 0;
  for (auto& mem_addr : trace->mem_addrs) {
    valid_addrs += mem_addr.size();
  }

  auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});

  for (uint32_t t = 0; t < num_threads_; ++t) {
    if (!trace->tmask.test(t))
      continue;

    auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
    for (auto& mem_addr : trace->mem_addrs.at(t)) {
      MemReq mem_req;
      mem_req.addr = mem_addr.addr;
      // NOTE(review): reads the lsu member of the trace union for a GPU
      // trace - confirm this is intended.
      mem_req.write = (trace->lsu.type == LsuType::STORE);
      mem_req.tag = tag;
      mem_req.core_id = core_->id();
      mem_req.uuid = trace->uuid;
      dcache_req_port.send(mem_req, 3);
      // log the trace itself (not its address), like the other messages
      DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
        << ", tid=" << t << ", " << *trace);
      ++core_->perf_stats_.tex_reads;
      // accumulate the queue occupancy only; the former "++x += n" form
      // added one extra unit per request
      core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
    }
  }

  return true;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,10 +19,6 @@
namespace vortex {
class IBuffer {
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
public:
IBuffer(uint32_t size)
: capacity_(size)
@@ -39,6 +48,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(entries_, empty );
}
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "types.h"
@@ -7,7 +20,7 @@ namespace vortex {
class Warp;
enum Opcode {
NOP = 0,
NONE = 0,
R_INST = 0x33,
L_INST = 0x3,
I_INST = 0x13,
@@ -19,6 +32,7 @@ enum Opcode {
JALR_INST = 0x67,
SYS_INST = 0x73,
FENCE = 0x0f,
AMO = 0x2f,
// F Extension
FL = 0x7,
FS = 0x27,
@@ -26,19 +40,20 @@ enum Opcode {
FMADD = 0x43,
FMSUB = 0x47,
FMNMSUB = 0x4b,
FMNMADD = 0x4f,
// Vector Extension
VSET = 0x57,
// GPGPU Extension
GPGPU = 0x6b,
GPU = 0x5b,
// RV64 Standard Extensions
FMNMADD = 0x4f,
// RV64 Standard Extension
R_INST_W = 0x3b,
I_INST_W = 0x1b,
// Vector Extension
VSET = 0x57,
// Custom Extensions
EXT1 = 0x0b,
EXT2 = 0x2b,
EXT3 = 0x5b,
EXT4 = 0x7b
};
enum InstType {
N_TYPE,
enum InstType {
R_TYPE,
I_TYPE,
S_TYPE,
@@ -52,25 +67,45 @@ enum InstType {
class Instr {
public:
Instr()
: opcode_(Opcode::NOP)
: opcode_(Opcode::NONE)
, num_rsrcs_(0)
, has_imm_(false)
, rdest_type_(RegType::None)
, imm_(0)
, rdest_(0)
, func2_(0)
, func3_(0)
, func6_(0)
, func7_(0) {
, func7_(0)
, vmask_(0)
, vlsWidth_(0)
, vMop_(0)
, vNf_(0)
, vs3_(0)
, vlmul_(0)
, vsew_(0)
, vediv_(0) {
for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
rsrc_type_[i] = RegType::None;
rsrc_[i] = 0;
}
}
void setOpcode(Opcode opcode) { opcode_ = opcode; }
void setDestReg(uint32_t destReg, RegType type) { rdest_type_ = type; rdest_ = destReg; }
void setSrcReg(uint32_t srcReg, RegType type) { rsrc_type_[num_rsrcs_] = type; rsrc_[num_rsrcs_++] = srcReg; }
void setDestVReg(uint32_t destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
void setSrcVReg(uint32_t srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
// Records the destination register number together with its register type.
void setDestReg(uint32_t destReg, RegType type) {
  rdest_ = destReg;
  rdest_type_ = type;
}
// Appends a source register: stores it in the next free slot and bumps the
// source-register count.
// NOTE(review): no check that the slot stays within the source arrays.
void addSrcReg(uint32_t srcReg, RegType type) {
  const auto slot = num_rsrcs_;
  rsrc_[slot] = srcReg;
  rsrc_type_[slot] = type;
  num_rsrcs_ = slot + 1;
}
// Stores a source register at an explicit slot and grows the source-register
// count so that it covers that slot (the count never shrinks).
// NOTE(review): index is not validated against the size of the source arrays.
void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) {
  rsrc_[index] = srcReg;
  rsrc_type_[index] = type;
  const uint32_t needed = index+1;
  num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, needed);
}
void setFunc2(uint32_t func2) { func2_ = func2; }
void setFunc3(uint32_t func3) { func3_ = func3; }
void setFunc7(uint32_t func7) { func7_ = func7; }
@@ -85,17 +120,17 @@ public:
void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; }
void setFunc6(uint32_t func6) { func6_ = func6; }
Opcode getOpcode() const { return opcode_; }
Opcode getOpcode() const { return opcode_; }
uint32_t getFunc2() const { return func2_; }
uint32_t getFunc3() const { return func3_; }
uint32_t getFunc6() const { return func6_; }
uint32_t getFunc7() const { return func7_; }
uint32_t getNRSrc() const { return num_rsrcs_; }
uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
uint32_t getRDest() const { return rdest_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
uint32_t getImm() const { return imm_; }
uint32_t getVlsWidth() const { return vlsWidth_; }
uint32_t getVmop() const { return vMop_; }

View File

@@ -1,98 +1,132 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <fstream>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include "processor.h"
#include "archdef.h"
#include "mem.h"
#include "constants.h"
#include <util.h>
#include "args.h"
#include "core.h"
using namespace vortex;
// Prints the command-line synopsis to standard output. Every option accepted
// by parse_args is listed, including -g (number of clusters).
static void show_usage() {
  std::cout << "Usage: [-c <cores>] [-g <clusters>] [-w <warps>] [-t <threads>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
}
// Command-line configurable settings, initialised to the build-time defaults
// and overwritten by parse_args.
uint32_t num_threads = NUM_THREADS;   // -t
uint32_t num_warps = NUM_WARPS;       // -w
uint32_t num_cores = NUM_CORES;       // -c
uint32_t num_clusters = NUM_CLUSTERS; // -g
bool showStats = false;               // -s
bool riscv_test = false;              // -r
const char* program = nullptr;        // first non-option argument
// Parses the command line with getopt and stores the results in the global
// settings. Numeric options are converted with atoi. -h / '?' print the usage
// and exit with status 0; any other unexpected getopt result prints the usage
// and exits with -1. The first non-option argument is the program image; the
// usage is printed and the process exits with -1 when it is missing.
// NOTE(review): getopt conventionally also reports unknown options as '?',
// which would then exit with status 0 - confirm this is intended.
static void parse_args(int argc, char **argv) {
  for (;;) {
    const int opt = getopt(argc, argv, "t:w:c:g:rsh?");
    if (opt == -1)
      break;
    switch (opt) {
    case 't':
      num_threads = atoi(optarg);
      break;
    case 'w':
      num_warps = atoi(optarg);
      break;
    case 'c':
      num_cores = atoi(optarg);
      break;
    case 'g':
      num_clusters = atoi(optarg);
      break;
    case 'r':
      riscv_test = true;
      break;
    case 's':
      showStats = true;
      break;
    case 'h':
    case '?':
      show_usage();
      exit(0);
      break;
    default:
      show_usage();
      exit(-1);
    }
  }

  // the program image is mandatory
  if (optind >= argc) {
    show_usage();
    exit(-1);
  }

  program = argv[optind];
  std::cout << "Running " << program << "..." << std::endl;
}
int main(int argc, char **argv) {
int exitcode = 0;
std::string imgFileName;
int num_cores(NUM_CORES * NUM_CLUSTERS);
int num_warps(NUM_WARPS);
int num_threads(NUM_THREADS);
bool showHelp(false);
bool showStats(false);
bool riscv_test(false);
parse_args(argc, argv);
// parse the command line arguments
CommandLineArgFlag fh("-h", "--help", "show command line options", showHelp);
CommandLineArgSetter<std::string> fi("-i", "--image", "program binary", imgFileName);
CommandLineArgSetter<int> fc("-c", "--cores", "number of cores", num_cores);
CommandLineArgSetter<int> fw("-w", "--warps", "number of warps", num_warps);
CommandLineArgSetter<int> ft("-t", "--threads", "number of threads", num_threads);
CommandLineArgFlag fr("-r", "--riscv", "enable riscv tests", riscv_test);
CommandLineArgFlag fs("-s", "--stats", "show stats", showStats);
CommandLineArg::readArgs(argc - 1, argv + 1);
if (showHelp || imgFileName.empty()) {
std::cout << "Vortex emulator command line arguments:\n"
" -i, --image <filename> Program RAM image\n"
" -c, --cores <num> Number of cores\n"
" -w, --warps <num> Number of warps\n"
" -t, --threads <num> Number of threads\n"
" -r, --riscv riscv test\n"
" -s, --stats Print stats on exit.\n";
return 0;
}
std::cout << "Running " << imgFileName << "..." << std::endl;
{
// create processor configuration
ArchDef arch(num_cores, num_warps, num_threads);
Arch arch(num_threads, num_warps, num_cores, num_clusters);
// create memory module
RAM ram(RAM_PAGE_SIZE);
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
std::string program_ext(fileExtension(imgFileName.c_str()));
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
ram.loadBinImage(program, startup_addr);
} else if (program_ext == "hex") {
ram.loadHexImage(imgFileName.c_str());
ram.loadHexImage(program);
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
}
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// run simulation
exitcode = processor.run();
exitcode = processor.run(riscv_test);
}
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed." << std::endl;
exitcode = 0;
} else {
std::cout << "Failed." << std::endl;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}

View File

@@ -1,4 +1,17 @@
#include "memsim.h"
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mem_sim.h"
#include <vector>
#include <queue>
#include <stdlib.h>
@@ -83,7 +96,7 @@ public:
mem_req.addr,
mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
mem_req.core_id
mem_req.cid
);
if (!dram_->send(dram_req))

View File

@@ -1,8 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
#include <vector>
namespace vortex {

61
sim/simx/operand.h Normal file
View File

@@ -0,0 +1,61 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
// Pipeline stage that forwards traces from Input to Output with a delay that
// depends on how many registers the trace uses.
class Operand : public SimObject<Operand> {
public:
  SimPort<pipeline_trace_t*> Input;
  SimPort<pipeline_trace_t*> Output;

  Operand(const SimContext& ctx)
    : SimObject<Operand>(ctx, "Operand")
    , Input(this)
    , Output(this)
  {}

  virtual ~Operand() {}

  virtual void reset() {}

  // Handles the trace at the head of Input, if any. The delay starts at 1 and
  // grows by one for every register index flagged in used_iregs, used_fregs
  // or used_vregs; index 0 is skipped whenever used_iregs flags it.
  // NOTE(review): that skip also ignores index 0 of used_fregs / used_vregs
  // when used_iregs has bit 0 set - confirm this is intended.
  virtual void tick() {
    if (Input.empty())
      return;

    auto trace = Input.front();

    int delay = 1;
    for (int reg = 0; reg < MAX_NUM_REGS; ++reg) {
      const bool int_used = trace->used_iregs.test(reg);
      const bool fp_used  = trace->used_fregs.test(reg);
      const bool vec_used = trace->used_vregs.test(reg);
      const bool any_used = int_used || fp_used || vec_used;
      const bool int_zero = int_used && (reg == 0);
      if (any_used && !int_zero) {
        ++delay;
      }
    }

    Output.send(trace, delay);
    DT(3, "pipeline-operands: " << *trace);
    Input.pop();
  }
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
@@ -5,14 +18,38 @@
#include <iostream>
#include <util.h>
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "debug.h"
namespace vortex {
// Polymorphic base for per-unit payloads attached to a pipeline trace.
// Ptr is the shared-ownership handle used to hold such payloads.
class ITraceData {
public:
  using Ptr = std::shared_ptr<ITraceData>;

  ITraceData() = default;

  // virtual so that derived payloads are destroyed correctly through the base
  virtual ~ITraceData() = default;
};
// Trace payload holding one mem_addr_size_t entry per thread.
struct LsuTraceData : public ITraceData {
  using Ptr = std::shared_ptr<LsuTraceData>;

  // sized to num_threads at construction
  std::vector<mem_addr_size_t> mem_addrs;

  LsuTraceData(uint32_t num_threads)
    : mem_addrs(num_threads)
  {}
};
// Trace payload carrying a barrier descriptor (id and count).
struct SFUTraceData : public ITraceData {
  using Ptr = std::shared_ptr<SFUTraceData>;

  struct {
    uint32_t id;
    uint32_t count;
  } bar;

  SFUTraceData(uint32_t bar_id, uint32_t bar_count)
    : bar{bar_id, bar_count}
  {}
};
struct pipeline_trace_t {
public:
//--
uint64_t uuid;
const uint64_t uuid;
const Arch& arch;
//--
uint32_t cid;
@@ -21,12 +58,9 @@ struct pipeline_trace_t {
Word PC;
//--
bool fetch_stall;
//--
bool wb;
RegType rdest_type;
uint32_t rdest;
RegType rdest_type;
bool wb;
//--
RegMask used_iregs;
@@ -36,73 +70,104 @@ struct pipeline_trace_t {
//-
ExeType exe_type;
//--
std::vector<std::vector<mem_addr_size_t>> mem_addrs;
//--
union {
struct {
LsuType type;
} lsu;
struct {
AluType type;
} alu;
struct {
FpuType type;
} fpu;
struct {
GpuType type;
WarpMask active_warps;
} gpu;
uint32_t unit_type;
LsuType lsu_type;
AluType alu_type;
FpuType fpu_type;
SfuType sfu_type;
};
bool stalled;
ITraceData::Ptr data;
pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
uuid = uuid_;
cid = 0;
wid = 0;
tmask.reset();
PC = 0;
fetch_stall = false;
wb = false;
rdest = 0;
rdest_type = RegType::None;
used_iregs.reset();
used_fregs.reset();
used_vregs.reset();
exe_type = ExeType::NOP;
mem_addrs.resize(arch.num_threads());
stalled = false;
}
int pid;
bool sop;
bool eop;
bool suspend() {
bool old = stalled;
stalled = true;
bool fetch_stall;
pipeline_trace_t(uint64_t uuid, const Arch& arch)
: uuid(uuid)
, arch(arch)
, cid(0)
, wid(0)
, tmask(0)
, PC(0)
, rdest(0)
, rdest_type(RegType::None)
, wb(false)
, used_iregs(0)
, used_fregs(0)
, used_vregs(0)
, exe_type(ExeType::ALU)
, unit_type(0)
, data(nullptr)
, pid(-1)
, sop(true)
, eop(true)
, fetch_stall(false)
, log_once_(false)
{}
pipeline_trace_t(const pipeline_trace_t& rhs)
: uuid(rhs.uuid)
, arch(rhs.arch)
, cid(rhs.cid)
, wid(rhs.wid)
, tmask(rhs.tmask)
, PC(rhs.PC)
, rdest(rhs.rdest)
, rdest_type(rhs.rdest_type)
, wb(rhs.wb)
, used_iregs(rhs.used_iregs)
, used_fregs(rhs.used_fregs)
, used_vregs(rhs.used_vregs)
, exe_type(rhs.exe_type)
, unit_type(rhs.unit_type)
, data(rhs.data)
, pid(rhs.pid)
, sop(rhs.sop)
, eop(rhs.eop)
, fetch_stall(rhs.fetch_stall)
, log_once_(false)
{}
~pipeline_trace_t() {}
bool log_once(bool enable) {
bool old = log_once_;
log_once_ = enable;
return old;
}
void resume() {
stalled = false;
}
private:
bool log_once_;
};
inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
os << "cid=" << state.cid;
os << ", wid=" << state.wid;
os << ", tmask=";
for (uint32_t i = 0, n = state.arch.num_threads(); i < n; ++i) {
os << state.tmask.test(i);
}
os << ", PC=0x" << std::hex << state.PC;
os << ", wb=" << state.wb;
if (state.wb) {
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
}
os << ", ex=" << state.exe_type;
if (state.pid != -1) {
os << ", pid=" << state.pid;
os << ", sop=" << state.sop;
os << ", eop=" << state.eop;
}
os << " (#" << std::dec << state.uuid << ")";
return os;
}
class PipelineLatch {
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
public:
PipelineLatch(const char* name = nullptr)
: name_(name)
@@ -132,6 +197,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(queue_, empty );
}
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
};
}

View File

@@ -1,168 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "processor.h"
#include "core.h"
#include "constants.h"
#include "processor_impl.h"
using namespace vortex;
class Processor::Impl {
private:
std::vector<Core::Ptr> cores_;
std::vector<Cache::Ptr> l2caches_;
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
Cache::Ptr l3cache_;
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
ProcessorImpl::ProcessorImpl(const Arch& arch)
: arch_(arch)
, clusters_(arch.num_clusters())
{
SimPlatform::instance().initialize();
public:
Impl(const ArchDef& arch)
: cores_(arch.num_cores())
, l2caches_(NUM_CLUSTERS)
, l2_mem_switches_(NUM_CLUSTERS)
{
SimPlatform::instance().initialize();
// create memory simulator
memsim_ = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
uint32_t(arch.num_cores()) * arch.num_clusters()
});
uint32_t num_cores = arch.num_cores();
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
cores_.at(i) = Core::Create(arch, i);
// create L3 cache
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
!L3_ENABLED,
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L3_NUM_WAYS), // W
0, // A
XLEN, // address bits
L3_NUM_BANKS, // number of banks
1, // number of ports
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
// connect L3 memory ports
l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
memsim_->MemRspPort.bind(&l3cache_->MemRspPort);
// setup memory simulator
auto memsim = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
arch.num_cores()
});
std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", Cache::Config{
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
l3cache_->MemReqPort.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
}
} else if (NUM_CLUSTERS > 1) {
l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
}
}
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster);
std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
if (L2_ENABLE) {
auto& l2cache = l2caches_.at(i);
l2cache = Cache::Create("l2cache", Cache::Config{
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L2_NUM_BANKS, // number of banks
L2_NUM_PORTS, // number of ports
(uint8_t)cores_per_cluster, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
l2cache->MemReqPort.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
}
} else {
auto& l2_mem_switch = l2_mem_switches_.at(i);
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
}
}
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
auto& core = cores_.at((i * cores_per_cluster) + j);
core->MemReqPort.bind(cluster_mem_req_ports.at(j));
cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
}
}
// create clusters
for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
// connect L3 core ports
clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i));
l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
}
~Impl() {
SimPlatform::instance().finalize();
}
// set up memory perf recording
memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
__unused (cycle);
perf_mem_reads_ += !req.write;
perf_mem_writes_ += req.write;
perf_mem_pending_reads_ += !req.write;
});
memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
__unused (cycle);
--perf_mem_pending_reads_;
});
void attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
}
}
this->reset();
}
int run() {
SimPlatform::instance().reset();
bool running;
int exitcode = 0;
do {
SimPlatform::instance().tick();
running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
}
if (core->check_exit()) {
exitcode = core->getIRegValue(3);
running = false;
break;
// Finalizes the simulation platform singleton when the processor goes away.
ProcessorImpl::~ProcessorImpl() {
  auto& platform = SimPlatform::instance();
  platform.finalize();
}
// Hands the RAM pointer to every cluster.
void ProcessorImpl::attach_ram(RAM* ram) {
  for (const auto& each : clusters_) {
    each->attach_ram(ram);
  }
}
int ProcessorImpl::run(bool riscv_test) {
SimPlatform::instance().reset();
this->reset();
bool done;
Word exitcode = 0;
do {
SimPlatform::instance().tick();
done = true;
for (auto cluster : clusters_) {
if (cluster->running()) {
Word ec;
if (cluster->check_exit(&ec, riscv_test)) {
exitcode |= ec;
} else {
done = false;
}
}
} while (running);
}
perf_mem_latency_ += perf_mem_pending_reads_;
} while (!done);
return exitcode;
}
};
return exitcode;
}
// Zeroes every memory performance counter (reads, writes, accumulated
// latency and the number of reads currently in flight).
void ProcessorImpl::reset() {
  perf_mem_pending_reads_ = 0;
  perf_mem_latency_       = 0;
  perf_mem_writes_        = 0;
  perf_mem_reads_         = 0;
}
// Stores `value` at DCR address `addr` by forwarding to dcrs_.write().
// NOTE(review): address validation, if any, happens inside DCRS::write,
// which is not visible here.
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
dcrs_.write(addr, value);
}
// Builds a snapshot of the processor-level performance counters.
// Returns a PerfStats holding the memory read/write/latency counters, the
// L3 cache statistics, and the sum of the statistics of all clusters.
ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
  ProcessorImpl::PerfStats perf;
  perf.mem_reads   = perf_mem_reads_;
  perf.mem_writes  = perf_mem_writes_;
  perf.mem_latency = perf_mem_latency_;
  perf.l3cache     = l3cache_->perf_stats();
  // Iterate by const reference to avoid copying a std::shared_ptr (and
  // touching its reference count) for every cluster.
  for (const auto& cluster : clusters_) {
    perf.clusters += cluster->perf_stats();
  }
  return perf;
}
///////////////////////////////////////////////////////////////////////////////
Processor::Processor(const ArchDef& arch)
: impl_(new Impl(arch))
Processor::Processor(const Arch& arch)
: impl_(new ProcessorImpl(arch))
{}
Processor::~Processor() {
@@ -173,6 +146,10 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
int Processor::run() {
return impl_->run();
int Processor::run(bool riscv_test) {
return impl_->run(riscv_test);
}
// Public entry point for DCR writes: forwards addr/value to the
// implementation object (impl_). The `return` of a void expression is legal
// C++ and simply forwards the call.
void Processor::write_dcr(uint32_t addr, uint32_t value) {
return impl_->write_dcr(addr, value);
}

View File

@@ -1,22 +1,39 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class ArchDef;
class Arch;
class RAM;
class ProcessorImpl;
class Processor {
public:
Processor(const ArchDef& arch);
Processor(const Arch& arch);
~Processor();
void attach_ram(RAM* mem);
int run();
int run(bool riscv_test);
void write_dcr(uint32_t addr, uint32_t value);
private:
class Impl;
Impl* impl_;
ProcessorImpl* impl_;
};
}
}

66
sim/simx/processor_impl.h Normal file
View File

@@ -0,0 +1,66 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "mem_sim.h"
#include "cache_sim.h"
#include "constants.h"
#include "dcrs.h"
#include "cluster.h"
namespace vortex {
// Implementation object behind vortex::Processor. It owns the clusters, the
// DCR store, the memory simulator and the L3 cache, and tracks
// processor-level memory performance counters.
class ProcessorImpl {
public:
  // Snapshot of the processor-level performance counters.
  struct PerfStats {
    uint64_t mem_reads;            // copy of the memory read counter
    uint64_t mem_writes;           // copy of the memory write counter
    uint64_t mem_latency;          // copy of the accumulated latency counter
    CacheSim::PerfStats l3cache;   // statistics reported by the L3 cache
    Cluster::PerfStats clusters;   // sum of the statistics of all clusters

    PerfStats()
      : mem_reads(0)
      , mem_writes(0)
      , mem_latency(0)
    {}
  };

  ProcessorImpl(const Arch& arch);
  ~ProcessorImpl();

  // Attaches `mem` to every cluster.
  void attach_ram(RAM* mem);

  // Runs the simulation and returns the exit code; `riscv_test` is passed
  // on to the clusters' exit check.
  int run(bool riscv_test);

  // Writes `value` to the DCR at address `addr`.
  void write_dcr(uint32_t addr, uint32_t value);

  // Returns a snapshot of the current performance counters.
  ProcessorImpl::PerfStats perf_stats() const;

private:
  // Zeroes the perf_mem_* counters.
  void reset();

  const Arch& arch_;
  std::vector<std::shared_ptr<Cluster>> clusters_;
  DCRS dcrs_;
  MemSim::Ptr memsim_;
  CacheSim::Ptr l3cache_;

  // Default-initialized so perf_stats() never reads indeterminate values,
  // even if it is called before reset() has run.
  uint64_t perf_mem_reads_ = 0;
  uint64_t perf_mem_writes_ = 0;
  uint64_t perf_mem_latency_ = 0;
  uint64_t perf_mem_pending_reads_ = 0;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,20 +19,15 @@
namespace vortex {
class Scoreboard {
private:
public:
struct reg_use_t {
RegType type;
uint32_t reg;
uint64_t owner;
};
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
public:
Scoreboard(const ArchDef &arch)
Scoreboard(const Arch &arch)
: in_use_iregs_(arch.num_warps())
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
@@ -84,8 +92,7 @@ public:
}
void reserve(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).set(state->rdest);
@@ -105,8 +112,7 @@ public:
}
void release(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).reset(state->rdest);
@@ -123,6 +129,13 @@ public:
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
owners_.erase(tag);
}
private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
};
}

138
sim/simx/shared_mem.cpp Normal file
View File

@@ -0,0 +1,138 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "shared_mem.h"
#include "core.h"
#include <bitmanip.h>
#include <vector>
#include "types.h"
using namespace vortex;
// Private implementation of SharedMem (pimpl). Holds the backing RAM, the
// bank-selection bit range and the performance counters, and arbitrates the
// per-request input ports once per tick.
class SharedMem::Impl {
protected:
  SharedMem* simobject_;          // owning SimObject; gives access to the ports
  Config     config_;
  RAM        ram_;
  uint32_t   bank_sel_addr_start_; // lowest address bit used to select a bank
  uint32_t   bank_sel_addr_end_;   // highest address bit used to select a bank
  PerfStats  perf_stats_;

  // Maps an incoming address to an offset inside ram_ by keeping only the
  // low log2ceil(capacity / line_size) bits.
  // NOTE(review): the mask width is derived from the number of lines, not
  // from the capacity in bytes - confirm this is the intended address range.
  uint64_t to_local_addr(uint64_t addr) {
    uint32_t total_lines = config_.capacity / config_.line_size;
    uint32_t line_bits = log2ceil(total_lines);
    if (line_bits == 0) {
      // A single line needs no offset bits; without this guard
      // `line_bits - 1` would wrap around to UINT32_MAX.
      return 0;
    }
    uint32_t offset = bit_getw(addr, 0, line_bits-1);
    return offset;
  }

public:
  Impl(SharedMem* simobject, const Config& config)
    : simobject_(simobject)
    , config_(config)
    , ram_(config.capacity, config.capacity)
    , bank_sel_addr_start_(0)
    // For num_banks == 1 this unsigned expression wraps around; tick()
    // therefore checks the bank count instead of comparing start and end.
    , bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1)
  {}

  virtual ~Impl() {}

  // Clears the performance counters.
  void reset() {
    perf_stats_ = PerfStats();
  }

  // Copies `size` bytes from the backing RAM at the local offset of `addr`
  // into `data`.
  void read(void* data, uint64_t addr, uint32_t size) {
    auto s_addr = to_local_addr(addr);
    DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
    ram_.read(data, s_addr, size);
  }

  // Copies `size` bytes from `data` into the backing RAM at the local offset
  // of `addr`.
  void write(const void* data, uint64_t addr, uint32_t size) {
    auto s_addr = to_local_addr(addr);
    DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
    ram_.write(data, s_addr, size);
  }

  // Services at most one request per bank: walks the input ports in order,
  // grants the first request that targets each bank, counts the others as
  // bank stalls and leaves them queued for a later tick.
  void tick() {
    std::vector<bool> in_used_banks(config_.num_banks);
    for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
      auto& core_req_port = simobject_->Inputs.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      // With a single bank every request maps to bank 0. The stored bit
      // range cannot be used to detect that case because its end index is
      // computed as `log2ceil(num_banks) - 1` in unsigned arithmetic.
      uint32_t bank_id = 0;
      if (config_.num_banks > 1) {
        bank_id = static_cast<uint32_t>(
          bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_));
      }

      // bank conflict check
      if (in_used_banks.at(bank_id)) {
        ++perf_stats_.bank_stalls;
        continue;
      }

      in_used_banks.at(bank_id) = true;

      if (!core_req.write || config_.write_reponse) {
        // send response
        MemRsp core_rsp{core_req.tag, core_req.cid};
        simobject_->Outputs.at(req_id).send(core_rsp, 1);
      }

      // update perf counters
      perf_stats_.reads += !core_req.write;
      perf_stats_.writes += core_req.write;

      // remove input
      core_req_port.pop();
    }
  }

  const PerfStats& perf_stats() const {
    return perf_stats_;
  }
};
///////////////////////////////////////////////////////////////////////////////
// Creates config.num_reqs input and output ports and allocates the private
// implementation object with `new`; it is released in the destructor.
SharedMem::SharedMem(const SimContext& ctx, const char* name, const Config& config)
: SimObject<SharedMem>(ctx, name)
, Inputs(config.num_reqs, this)
, Outputs(config.num_reqs, this)
, impl_(new Impl(this, config))
{}
// Releases the implementation object allocated by the constructor.
SharedMem::~SharedMem() {
delete impl_;
}
// Forwards to Impl::reset(), which clears the performance counters.
void SharedMem::reset() {
impl_->reset();
}
// Forwards to Impl::read(): copies `size` bytes at `addr` into `data`.
void SharedMem::read(void* data, uint64_t addr, uint32_t size) {
impl_->read(data, addr, size);
}
// Forwards to Impl::write(): copies `size` bytes from `data` to `addr`.
void SharedMem::write(const void* data, uint64_t addr, uint32_t size) {
impl_->write(data, addr, size);
}
// Forwards to Impl::tick(), which arbitrates the pending requests.
void SharedMem::tick() {
impl_->tick();
}
// Returns a reference to the counters owned by the implementation object;
// the reference is valid only while this SharedMem is alive.
const SharedMem::PerfStats& SharedMem::perf_stats() const {
return impl_->perf_stats();
}

72
sim/simx/shared_mem.h Normal file
View File

@@ -0,0 +1,72 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
namespace vortex {
// Simulation object for a banked shared memory. It exposes one request port
// and one response port per requester and delegates all behaviour to a
// private implementation object (pimpl).
class SharedMem : public SimObject<SharedMem> {
public:
  // Construction parameters.
  struct Config {
    uint32_t capacity;      // size passed to the backing RAM
    uint32_t line_size;     // divisor used to derive the local address width
    uint32_t num_reqs;      // number of input/output port pairs
    uint32_t num_banks;     // number of banks arbitrated per tick
    bool     write_reponse; // also send a response for writes (spelling kept
                            // for source compatibility)
  };

  // Performance counters.
  struct PerfStats {
    uint64_t reads;
    uint64_t writes;
    uint64_t bank_stalls;

    PerfStats()
      : reads(0)
      , writes(0)
      , bank_stalls(0)
    {}

    // Adds the counters of `rhs` to this object.
    PerfStats& operator+=(const PerfStats& rhs) {
      this->reads += rhs.reads;
      this->writes += rhs.writes;
      this->bank_stalls += rhs.bank_stalls;
      return *this;
    }
  };

  std::vector<SimPort<MemReq>> Inputs;
  std::vector<SimPort<MemRsp>> Outputs;

  SharedMem(const SimContext& ctx, const char* name, const Config& config);
  virtual ~SharedMem();

  // impl_ is an owning raw pointer released in the destructor; an implicit
  // copy would leave two objects deleting the same Impl, so copying is
  // disabled.
  SharedMem(const SharedMem&) = delete;
  SharedMem& operator=(const SharedMem&) = delete;

  void reset();

  void read(void* data, uint64_t addr, uint32_t size);

  void write(const void* data, uint64_t addr, uint32_t size);

  void tick();

  const PerfStats& perf_stats() const;

protected:
  class Impl;
  Impl* impl_;
};
}

View File

@@ -1,96 +0,0 @@
#pragma once
#include <simobject.h>
#include <bitmanip.h>
#include <vector>
#include "types.h"
namespace vortex {
class Core;
// Header-only simulation object for a banked shared memory. Each tick it
// grants at most one request per bank, answers granted requests on the
// matching output port and keeps conflicting requests queued.
class SharedMem : public SimObject<SharedMem> {
public:
  // Construction parameters.
  struct Config {
    uint32_t num_reqs;      // number of input/output port pairs
    uint32_t num_banks;     // number of banks arbitrated per tick
    uint32_t bank_offset;   // lowest address bit used for bank selection
    uint32_t latency;       // NOTE(review): not read anywhere in this class
    bool     write_reponse; // also send a response for writes
  };

  // Performance counters.
  struct PerfStats {
    uint64_t reads;
    uint64_t writes;
    uint64_t bank_stalls;

    PerfStats()
      : reads(0)
      , writes(0)
      , bank_stalls(0)
    {}
  };

  std::vector<SimPort<MemReq>> Inputs;
  std::vector<SimPort<MemRsp>> Outputs;

  SharedMem(const SimContext& ctx, const char* name, const Config& config)
    : SimObject<SharedMem>(ctx, name)
    , Inputs(config.num_reqs, this)
    , Outputs(config.num_reqs, this)
    , config_(config)
    , bank_sel_addr_start_(config.bank_offset)
    , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
  {}

  virtual ~SharedMem() {}

  // Clears the performance counters.
  void reset() {
    perf_stats_ = PerfStats();
  }

  // Walks the input ports in order and services the first request seen for
  // each bank; later requests to the same bank stay queued.
  void tick() {
    std::vector<bool> in_used_banks(config_.num_banks);
    for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
      auto& core_req_port = this->Inputs.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      // With a single bank every request maps to bank 0; extracting address
      // bits in that case could produce an index outside in_used_banks.
      uint32_t bank_id = 0;
      if (config_.num_banks > 1) {
        bank_id = static_cast<uint32_t>(bit_getw(
          core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_));
      }

      // bank conflict check: record the stall so that
      // PerfStats::bank_stalls reflects the conflicts that occurred.
      if (in_used_banks.at(bank_id)) {
        ++perf_stats_.bank_stalls;
        continue;
      }

      in_used_banks.at(bank_id) = true;

      if (!core_req.write || config_.write_reponse) {
        // send response
        MemRsp core_rsp{core_req.tag, core_req.core_id};
        this->Outputs.at(req_id).send(core_rsp, 1);
      }

      // update perf counters
      perf_stats_.reads += !core_req.write;
      perf_stats_.writes += core_req.write;

      // remove input
      core_req_port.pop();
    }
  }

  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

protected:
  Config    config_;
  uint32_t  bank_sel_addr_start_;
  uint32_t  bank_sel_addr_end_;
  PerfStats perf_stats_;
};
}

View File

@@ -1,100 +0,0 @@
#include "tex_unit.h"
#include "core.h"
#include <texturing.h>
#include <VX_config.h>
using namespace vortex;
using namespace cocogfx;
// Texture filtering modes; the value is read from the TEX_STATE_FILTER state.
// TexUnit::read() in this file handles Point and Bilinear only; any other
// value (including Trilinear) reaches its default case and aborts.
enum class FilterMode {
Point,
Bilinear,
Trilinear,
};
// Stores the (non-owning) core pointer used later for data-cache reads.
// The state array is not initialized here; clear() zeroes it.
TexUnit::TexUnit(Core* core) : core_(core) {}
// Nothing to release: core_ is not deleted by this class.
TexUnit::~TexUnit() {}
void TexUnit::clear() {
for (auto& state : states_) {
state = 0;
}
}
// Returns the value of texture state register `state`.
// Uses bounds-checked access, so an out-of-range index throws
// std::out_of_range.
uint32_t TexUnit::get_state(uint32_t state) {
return states_.at(state);
}
// Writes `value` into texture state register `state`.
// Uses bounds-checked access, so an out-of-range index throws
// std::out_of_range.
void TexUnit::set_state(uint32_t state, uint32_t value) {
states_.at(state) = value;
}
// Samples the texture described by the current state registers.
//   u, v      : texture coordinates, converted with Fixed<TEX_FXD_FRAC>::make
//   lod       : mip level; selects the mip offset and shrinks the log2 sizes
//   mem_addrs : receives one {address, stride} entry per texel fetched
// Returns the filtered texel value. Point filtering fetches one texel and
// bilinear filtering fetches four; every other filter mode aborts.
uint32_t TexUnit::read(int32_t u,
                       int32_t v,
                       int32_t lod,
                       std::vector<mem_addr_size_t>* mem_addrs) {
  // snapshot of the texture state
  auto xu = Fixed<TEX_FXD_FRAC>::make(u);
  auto xv = Fixed<TEX_FXD_FRAC>::make(v);
  uint32_t base_addr  = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
  uint32_t log_width  = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
  uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
  auto format = (TexFormat)states_.at(TEX_STATE_FORMAT);
  auto filter = (FilterMode)states_.at(TEX_STATE_FILTER);
  auto wrapu  = (WrapMode)states_.at(TEX_STATE_WRAPU);
  auto wrapv  = (WrapMode)states_.at(TEX_STATE_WRAPV);
  auto stride = Stride(format);

  if (filter == FilterMode::Bilinear) {
    // addressing: the four neighbours in the order 00, 01, 10, 11
    uint32_t offsets[4];
    uint32_t alpha, beta;
    TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
                     &offsets[0], &offsets[1], &offsets[2], &offsets[3],
                     &alpha, &beta);

    uint32_t addrs[4];
    for (int k = 0; k < 4; ++k) {
      addrs[k] = base_addr + offsets[k] * stride;
    }

    // memory lookup: all reads first, then the address bookkeeping
    uint32_t texels[4] = {0, 0, 0, 0};
    for (int k = 0; k < 4; ++k) {
      core_->dcache_read(&texels[k], addrs[k], stride);
    }
    for (int k = 0; k < 4; ++k) {
      mem_addrs->push_back({addrs[k], stride});
    }

    // filtering
    auto filtered = TexFilterLinear(
      format, texels[0], texels[1], texels[2], texels[3], alpha, beta);
    return filtered;
  }

  if (filter == FilterMode::Point) {
    // addressing
    uint32_t texel_offset;
    TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &texel_offset);
    uint32_t texel_addr = base_addr + texel_offset * stride;

    // memory lookup
    uint32_t texel_data(0);
    core_->dcache_read(&texel_data, texel_addr, stride);
    mem_addrs->push_back({texel_addr, stride});

    // filtering
    auto filtered = TexFilterPoint(format, texel_data);
    return filtered;
  }

  // unsupported filter mode
  std::abort();
  return 0;
}

View File

@@ -1,28 +0,0 @@
#pragma once
#include "types.h"
namespace vortex {
class Core;
// Functional model of a texture unit: a bank of NUM_TEX_STATES 32-bit state
// registers plus a sampling routine that reads texels through the core.
class TexUnit {
public:
// `core` is stored as-is; this header gives no indication that it is owned.
TexUnit(Core* core);
~TexUnit();
// Resets the state registers.
void clear();
// Reads / writes a single state register by index.
uint32_t get_state(uint32_t state);
void set_state(uint32_t state, uint32_t value);
// Samples the texture at (u, v) for mip level `lod`; the addresses that were
// accessed are appended to `mem_addrs`.
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
private:
std::array<uint32_t, NUM_TEX_STATES> states_;
Core* core_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
@@ -5,31 +18,42 @@
#include <queue>
#include <unordered_map>
#include <util.h>
#include <stringutil.h>
#include <VX_config.h>
#include <simobject.h>
#include "uuid_gen.h"
#include "debug.h"
namespace vortex {
typedef uint8_t Byte;
#if XLEN == 32
#if (XLEN == 32)
typedef uint32_t Word;
typedef int32_t WordI;
typedef uint64_t DWord;
typedef int64_t DWordI;
#elif XLEN == 64
typedef uint32_t WordF;
#elif (XLEN == 64)
typedef uint64_t Word;
typedef int64_t WordI;
typedef __uint128_t DWord;
typedef __int128_t DWordI;
typedef uint64_t WordF;
#else
#error unsupported XLEN
#endif
typedef uint64_t FWord;
#define MAX_NUM_CORES 1024
#define MAX_NUM_THREADS 32
#define MAX_NUM_WARPS 32
#define MAX_NUM_REGS 32
typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
typedef std::bitset<MAX_NUM_CORES> CoreMask;
typedef std::bitset<MAX_NUM_REGS> RegMask;
typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
typedef std::bitset<MAX_NUM_WARPS> WarpMask;
typedef std::unordered_map<uint32_t, uint32_t> CSRs;
///////////////////////////////////////////////////////////////////////////////
@@ -40,8 +64,8 @@ enum class RegType {
Vector
};
inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
switch (clss) {
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
switch (type) {
case RegType::None: break;
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
///////////////////////////////////////////////////////////////////////////////
enum class ExeType {
NOP,
ALU,
LSU,
CSR,
FPU,
GPU,
SFU,
MAX,
};
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
switch (type) {
case ExeType::NOP: os << "NOP"; break;
case ExeType::ALU: os << "ALU"; break;
case ExeType::LSU: os << "LSU"; break;
case ExeType::CSR: os << "CSR"; break;
case ExeType::FPU: os << "FPU"; break;
case ExeType::GPU: os << "GPU"; break;
case ExeType::SFU: os << "SFU"; break;
case ExeType::MAX: break;
}
return os;
@@ -82,8 +102,7 @@ enum class AluType {
BRANCH,
SYSCALL,
IMUL,
IDIV,
CMOV,
IDIV
};
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::SYSCALL: os << "SYSCALL"; break;
case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break;
case AluType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
enum class LsuType {
LOAD,
STORE,
FENCE,
PREFETCH,
FENCE
};
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
switch (type) {
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
case LsuType::PREFETCH: os << "PREFETCH"; break;
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
}
return os;
}
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
uint32_t size;
};
inline AddrType get_addr_type(Word addr, uint32_t size) {
__unused (size);
if (SM_ENABLE) {
if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
&& addr < SMEM_BASE_ADDR) {
assert((addr + size) <= SMEM_BASE_ADDR);
return AddrType::Shared;
}
}
if (addr >= IO_BASE_ADDR) {
return AddrType::IO;
}
return AddrType::Global;
}
///////////////////////////////////////////////////////////////////////////////
enum class FpuType {
@@ -179,23 +180,31 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
///////////////////////////////////////////////////////////////////////////////
enum class GpuType {
enum class SfuType {
TMC,
WSPAWN,
SPLIT,
JOIN,
BAR,
TEX,
PRED,
CSRRW,
CSRRS,
CSRRC,
CMOV
};
inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
switch (type) {
case GpuType::TMC: os << "TMC"; break;
case GpuType::WSPAWN: os << "WSPAWN"; break;
case GpuType::SPLIT: os << "SPLIT"; break;
case GpuType::JOIN: os << "JOIN"; break;
case GpuType::BAR: os << "BAR"; break;
case GpuType::TEX: os << "TEX"; break;
case SfuType::TMC: os << "TMC"; break;
case SfuType::WSPAWN: os << "WSPAWN"; break;
case SfuType::SPLIT: os << "SPLIT"; break;
case SfuType::JOIN: os << "JOIN"; break;
case SfuType::BAR: os << "BAR"; break;
case SfuType::PRED: os << "PRED"; break;
case SfuType::CSRRW: os << "CSRRW"; break;
case SfuType::CSRRS: os << "CSRRS"; break;
case SfuType::CSRRC: os << "CSRRC"; break;
case SfuType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -218,31 +227,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
///////////////////////////////////////////////////////////////////////////////
struct MemReq {
uint64_t addr;
bool write;
bool non_cacheable;
uint32_t tag;
uint32_t core_id;
uint64_t uuid;
uint64_t addr;
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
bool _write = false,
bool _non_cacheable = false,
uint64_t _tag = 0,
uint32_t _core_id = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, non_cacheable(_non_cacheable)
, tag(_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
MemReq(uint64_t _addr = 0,
bool _write = false,
AddrType _type = AddrType::Global,
uint64_t _tag = 0,
uint32_t _cid = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, type(_type)
, tag(_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
os << " (#" << std::dec << req.uuid << ")";
return os;
}
@@ -250,18 +260,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint32_t core_id;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
: tag (_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
uint64_t tag;
uint32_t cid;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
: tag (_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid;
os << " (#" << std::dec << rsp.uuid << ")";
return os;
}
@@ -270,10 +281,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T>
class HashTable {
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
public:
HashTable(uint32_t capacity)
: entries_(capacity)
@@ -336,92 +343,180 @@ public:
}
size_ = 0;
}
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
template <typename Req, typename Rsp>
class Switch : public SimObject<Switch<Req, Rsp>> {
private:
ArbiterType type_;
uint32_t delay_;
uint32_t cursor_;
uint32_t tag_shift_;
public:
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
std::vector<SimPort<Req>> ReqOut;
std::vector<SimPort<Rsp>> RspOut;
Switch(
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
uint32_t num_inputs = 1,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
, delay_(delay)
, cursor_(0)
, tag_shift_(log2ceil(num_inputs))
, ReqIn(num_inputs, this)
, ReqOut(this)
, RspIn(this)
, RspOut(num_inputs, this)
, cursors_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay_ != 0);
assert(num_inputs <= MaxInputs);
if (num_inputs == 1) {
// bypass
ReqIn.at(0).bind(&ReqOut);
RspIn.bind(&RspOut.at(0));
assert(delay != 0);
assert(num_inputs <= 32);
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
if (num_inputs == num_outputs) {
// bypass mode
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
RspOut.at(i).bind(&RspIn.at(i));
}
}
}
void reset() {
cursor_ = 0;
for (auto& cursor : cursors_) {
cursor = 0;
}
}
void tick() {
if (ReqIn.size() == 1)
void tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
uint32_t R = 1 << lg_num_reqs_;
// skip bypass mode
if (I == O)
return;
// process incoming requests
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
uint32_t j = (cursor_ + i) % n;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
// process incoming requests
for (uint32_t o = 0; o < O; ++o) {
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (cursors_.at(o) + r) & (R-1);
uint32_t j = o * R + i;
if (j >= I)
continue;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | i;
}
DT(4, this->name() << "-" << req);
ReqOut.at(o).send(req, delay_);
req_in.pop();
this->update_cursor(o, i);
break;
}
ReqOut.send(req, delay_);
req_in.pop();
this->update_cursor(j);
break;
}
}
// process incoming responses
if (!RspIn.empty()) {
auto& rsp = RspIn.front();
uint32_t port_id = 0;
if (tag_shift_) {
port_id = rsp.tag & ((1 << tag_shift_)-1);
rsp.tag >>= tag_shift_;
}
RspOut.at(port_id).send(rsp, 1);
RspIn.pop();
// process incoming responses
if (!RspOut.at(o).empty()) {
auto& rsp = RspOut.at(o).front();
uint32_t i = 0;
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).send(rsp, 1);
RspOut.at(o).pop();
}
}
}
void update_cursor(uint32_t grant) {
void update_cursor(uint32_t index, uint32_t grant) {
if (type_ == ArbiterType::RoundRobin) {
cursor_ = grant + 1;
cursors_.at(index) = grant + 1;
}
}
std::vector<SimPort<Req>> ReqIn;
SimPort<Req> ReqOut;
SimPort<Rsp> RspIn;
std::vector<SimPort<Rsp>> RspOut;
private:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> cursors_;
uint32_t lg_num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
// Demultiplexer between one requester and two memory targets: requests whose
// type is AddrType::Shared go out on ReqSm, all others on ReqDc; responses
// arriving on RspSm and RspDc are both forwarded to RspIn.
class SMemDemux : public SimObject<SMemDemux> {
public:
  SimPort<MemReq> ReqIn;
  SimPort<MemRsp> RspIn;

  SimPort<MemReq> ReqSm;
  SimPort<MemRsp> RspSm;

  SimPort<MemReq> ReqDc;
  SimPort<MemRsp> RspDc;

  SMemDemux(
    const SimContext& ctx,
    const char* name,
    uint32_t delay = 1
  ) : SimObject<SMemDemux>(ctx, name)
    , ReqIn(this)
    , RspIn(this)
    , ReqSm(this)
    , RspSm(this)
    , ReqDc(this)
    , RspDc(this)
    , delay_(delay)
  {}

  void reset() {}

  // Handles at most one request and at most one response per source port.
  void tick() {
    // route the pending request by address type
    if (!ReqIn.empty()) {
      auto& pending = ReqIn.front();
      DT(4, this->name() << "-" << pending);
      auto& target = (pending.type == AddrType::Shared) ? ReqSm : ReqDc;
      target.send(pending, delay_);
      ReqIn.pop();
    }

    // forward responses, shared-memory side first
    this->forward_response(RspSm);
    this->forward_response(RspDc);
  }

private:
  // Moves the response at the head of `source`, if any, to RspIn with a
  // delay of one.
  void forward_response(SimPort<MemRsp>& source) {
    if (source.empty())
      return;
    auto& head = source.front();
    DT(4, this->name() << "-" << head);
    RspIn.send(head, 1);
    source.pop();
  }

  uint32_t delay_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
@@ -10,21 +23,25 @@
using namespace vortex;
Warp::Warp(Core *core, uint32_t id)
: id_(id)
Warp::Warp(Core *core, uint32_t warp_id)
: warp_id_(warp_id)
, arch_(core->arch())
, core_(core)
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
{
this->clear();
this->reset();
}
void Warp::clear() {
active_ = false;
PC_ = STARTUP_ADDR;
void Warp::reset() {
PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
#endif
tmask_.reset();
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i) {
issued_instrs_ = 0;
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
for (auto& reg : ireg_file_.at(i)) {
reg = 0;
}
@@ -35,31 +52,44 @@ void Warp::clear() {
reg = 0;
}
}
uui_gen_.reset();
}
void Warp::eval(pipeline_trace_t *trace) {
pipeline_trace_t* Warp::eval() {
assert(tmask_.any());
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i)
DPN(2, tmask_.test(n-i-1));
DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
/* Fetch and decode. */
#ifndef NDEBUG
uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
#else
uint64_t uuid = 0;
#endif
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
DPN(1, tmask_.test(i));
DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);
// Fetch
uint32_t instr_code = 0;
core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
auto instr = core_->decoder().decode(instr_code);
// Decode
auto instr = core_->decoder_.decode(instr_code);
if (!instr) {
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
std::abort();
}
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Update trace
// Create trace
auto trace = new pipeline_trace_t(uuid, arch_);
trace->cid = core_->id();
trace->wid = id_;
trace->wid = warp_id_;
trace->PC = PC_;
trace->tmask = tmask_;
trace->rdest = instr->getRDest();
@@ -68,18 +98,20 @@ void Warp::eval(pipeline_trace_t *trace) {
// Execute
this->execute(*instr, trace);
DP(4, "Register state:");
for (uint32_t i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
DP(5, "Register state:");
for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
// Integer register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, '|');
DPN(5, '|');
// Floating point register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
DPN(5, std::endl);
}
return trace;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __WARP_H
#define __WARP_H
@@ -7,28 +20,26 @@
namespace vortex {
class Arch;
class Core;
class Instr;
class pipeline_trace_t;
// One entry of the warp's IPDOM stack (see Warp::ipdom_stack_).
// Stores a thread mask and, when one is supplied, a PC.
struct DomStackEntry {
  // Entry carrying an explicit PC; fallthrough is cleared.
  DomStackEntry(const ThreadMask &tmask, Word PC)
    : tmask(tmask)
    , PC(PC)
    , fallthrough(false)
  {}

  // Entry built from a thread mask alone; fallthrough is set.
  // PC is zero-initialized so it never holds an indeterminate value if read.
  DomStackEntry(const ThreadMask &tmask)
    : tmask(tmask)
    , PC(0)
    , fallthrough(true)
  {}

  ThreadMask tmask;  // thread mask stored with this entry
  Word PC;           // NOTE(review): presumably only meaningful when fallthrough is false — confirm against the users of ipdom_stack_
  bool fallthrough;  // true when the entry was created without a PC
};
struct vtype {
@@ -40,72 +51,58 @@ struct vtype {
class Warp {
public:
Warp(Core *core, uint32_t id);
Warp(Core *core, uint32_t warp_id);
void clear();
bool active() const {
return active_;
}
void suspend() {
active_ = false;
}
void activate() {
active_ = true;
}
std::size_t getActiveThreads() const {
if (active_)
return tmask_.count();
return 0;
}
void reset();
uint32_t id() const {
return id_;
return warp_id_;
}
uint32_t getPC() const {
Word getPC() const {
return PC_;
}
void setPC(uint32_t PC) {
void setPC(Word PC) {
PC_ = PC;
}
void setTmask(size_t index, bool value) {
tmask_.set(index, value);
active_ = tmask_.any();
}
uint32_t getTmask() const {
if (active_)
return tmask_.to_ulong();
return 0;
uint64_t getTmask() const {
return tmask_.to_ulong();
}
uint32_t getIRegValue(uint32_t reg) const {
Word getIRegValue(uint32_t reg) const {
return ireg_file_.at(0).at(reg);
}
void eval(pipeline_trace_t *);
uint64_t incr_instrs() {
return issued_instrs_++;
}
pipeline_trace_t* eval();
private:
void execute(const Instr &instr, pipeline_trace_t *trace);
UUIDGenerator uui_gen_;
uint32_t id_;
uint32_t warp_id_;
const Arch& arch_;
Core *core_;
bool active_;
uint64_t issued_instrs_;
Word PC_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<FWord>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> dom_stack_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<uint64_t>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> ipdom_stack_;
struct vtype vtype_;
uint32_t vl_;