+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
702 lines
23 KiB
C++
702 lines
23 KiB
C++
// Copyright © 2019-2023
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <util.h>
|
|
#include "types.h"
|
|
#include "arch.h"
|
|
#include "mem.h"
|
|
#include "decode.h"
|
|
#include "core.h"
|
|
#include "debug.h"
|
|
#include "constants.h"
|
|
#include "processor_impl.h"
|
|
|
|
using namespace vortex;
|
|
|
|
// Constructs a core: sets up the cache request/response ports, allocates the
// per-warp state, creates the operand collectors, dispatchers and execute
// units, and finally brings the core into its reset state.
Core::Core(const SimContext& ctx,
           uint32_t core_id,
           Cluster* cluster,
           const Arch &arch,
           const DCRS &dcrs,
           SharedMem::Ptr sharedmem)
  : SimObject(ctx, "core")
  , icache_req_ports(1, this)
  , icache_rsp_ports(1, this)
  , dcache_req_ports(NUM_LSU_LANES, this)
  , dcache_rsp_ports(NUM_LSU_LANES, this)
  , core_id_(core_id)
  , arch_(arch)
  , dcrs_(dcrs)
  , decoder_(arch)
  , warps_(arch.num_warps())
  , barriers_(arch.num_barriers(), 0)
  , fcsrs_(arch.num_warps(), 0)
  , ibuffers_(ISSUE_WIDTH, IBUF_SIZE)
  , scoreboard_(arch_)
  , operands_(ISSUE_WIDTH)
  , dispatchers_((uint32_t)ExeType::MAX)
  , exe_units_((uint32_t)ExeType::MAX)
  , sharedmem_(sharedmem)
  , fetch_latch_("fetch")
  , decode_latch_("decode")
  , pending_icache_(arch_.num_warps())
  , committed_traces_(ISSUE_WIDTH, nullptr)
  , csrs_(arch.num_warps())
  , cluster_(cluster)
{
  // csrs_ holds one entry per warp; give each entry one slot per thread
  for (auto& warp_csrs : csrs_) {
    warp_csrs.resize(arch.num_threads());
  }

  // create the warps, each one knows its owning core and its index
  for (uint32_t wid = 0; wid < arch_.num_warps(); ++wid) {
    warps_.at(wid) = std::make_shared<Warp>(this, wid);
  }

  auto& platform = SimPlatform::instance();

  // one operand stage per issue slot
  for (auto& operand : operands_) {
    operand = platform.create_object<Operand>();
  }

  // initialize dispatchers (one per execution type)
  dispatchers_.at(static_cast<int>(ExeType::ALU)) = platform.create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
  dispatchers_.at(static_cast<int>(ExeType::FPU)) = platform.create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
  dispatchers_.at(static_cast<int>(ExeType::LSU)) = platform.create_object<Dispatcher>(arch, 2, 1, NUM_LSU_LANES);
  dispatchers_.at(static_cast<int>(ExeType::SFU)) = platform.create_object<Dispatcher>(arch, 2, 1, NUM_SFU_LANES);

  // initialize execute units (one per execution type)
  exe_units_.at(static_cast<int>(ExeType::ALU)) = platform.create_object<AluUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::FPU)) = platform.create_object<FpuUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::LSU)) = platform.create_object<LsuUnit>(this);
  exe_units_.at(static_cast<int>(ExeType::SFU)) = platform.create_object<SfuUnit>(this);

  this->reset();
}
|
|
|
|
// Emits any console text that is still buffered before the core goes away.
Core::~Core() {
  cout_flush();
}
|
|
|
|
void Core::reset() {
|
|
for (auto& warp : warps_) {
|
|
warp->reset();
|
|
}
|
|
warps_.at(0)->setTmask(0, true);
|
|
active_warps_ = 1;
|
|
|
|
for (auto& exe_unit : exe_units_) {
|
|
exe_unit->reset();
|
|
}
|
|
|
|
for ( auto& barrier : barriers_) {
|
|
barrier.reset();
|
|
}
|
|
|
|
for (auto& fcsr : fcsrs_) {
|
|
fcsr = 0;
|
|
}
|
|
|
|
for (auto& ibuf : ibuffers_) {
|
|
ibuf.clear();
|
|
}
|
|
|
|
commit_exe_= 0;
|
|
|
|
scoreboard_.clear();
|
|
fetch_latch_.clear();
|
|
decode_latch_.clear();
|
|
pending_icache_.clear();
|
|
stalled_warps_.reset();
|
|
issued_instrs_ = 0;
|
|
committed_instrs_ = 0;
|
|
exited_ = false;
|
|
perf_stats_ = PerfStats();
|
|
pending_ifetches_ = 0;
|
|
}
|
|
|
|
void Core::tick() {
|
|
this->commit();
|
|
this->execute();
|
|
this->issue();
|
|
this->decode();
|
|
this->fetch();
|
|
this->schedule();
|
|
|
|
++perf_stats_.cycles;
|
|
DPN(2, std::flush);
|
|
}
|
|
|
|
void Core::schedule() {
|
|
int scheduled_warp = -1;
|
|
|
|
// find next ready warp
|
|
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
|
|
bool warp_active = active_warps_.test(wid);
|
|
bool warp_stalled = stalled_warps_.test(wid);
|
|
if (warp_active && !warp_stalled) {
|
|
scheduled_warp = wid;
|
|
break;
|
|
}
|
|
}
|
|
if (scheduled_warp == -1)
|
|
return;
|
|
|
|
// suspend warp until decode
|
|
stalled_warps_.set(scheduled_warp);
|
|
|
|
// evaluate scheduled warp
|
|
auto& warp = warps_.at(scheduled_warp);
|
|
auto trace = warp->eval();
|
|
|
|
DT(3, "pipeline-schedule: " << *trace);
|
|
|
|
// advance to fetch stage
|
|
fetch_latch_.push(trace);
|
|
++issued_instrs_;
|
|
}
|
|
|
|
void Core::fetch() {
|
|
perf_stats_.ifetch_latency += pending_ifetches_;
|
|
|
|
// handle icache reponse
|
|
auto& icache_rsp_port = icache_rsp_ports.at(0);
|
|
if (!icache_rsp_port.empty()){
|
|
auto& mem_rsp = icache_rsp_port.front();
|
|
auto trace = pending_icache_.at(mem_rsp.tag);
|
|
decode_latch_.push(trace);
|
|
DT(3, "icache-rsp: addr=0x" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
|
pending_icache_.release(mem_rsp.tag);
|
|
icache_rsp_port.pop();
|
|
--pending_ifetches_;
|
|
}
|
|
|
|
// send icache request
|
|
if (fetch_latch_.empty())
|
|
return;
|
|
auto trace = fetch_latch_.front();
|
|
MemReq mem_req;
|
|
mem_req.addr = trace->PC;
|
|
mem_req.write = false;
|
|
mem_req.tag = pending_icache_.allocate(trace);
|
|
mem_req.cid = trace->cid;
|
|
mem_req.uuid = trace->uuid;
|
|
icache_req_ports.at(0).send(mem_req, 1);
|
|
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
|
fetch_latch_.pop();
|
|
++pending_ifetches_;
|
|
++perf_stats_.ifetches;
|
|
}
|
|
|
|
void Core::decode() {
|
|
if (decode_latch_.empty())
|
|
return;
|
|
|
|
auto trace = decode_latch_.front();
|
|
|
|
// check ibuffer capacity
|
|
auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH);
|
|
if (ibuffer.full()) {
|
|
if (!trace->log_once(true)) {
|
|
DT(3, "*** ibuffer-stall: " << *trace);
|
|
}
|
|
++perf_stats_.ibuf_stalls;
|
|
return;
|
|
} else {
|
|
trace->log_once(false);
|
|
}
|
|
|
|
// release warp
|
|
if (!trace->fetch_stall) {
|
|
assert(stalled_warps_.test(trace->wid));
|
|
stalled_warps_.reset(trace->wid);
|
|
}
|
|
|
|
// update perf counters
|
|
uint32_t active_threads = trace->tmask.count();
|
|
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
|
|
perf_stats_.loads += active_threads;
|
|
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
|
|
perf_stats_.stores += active_threads;
|
|
|
|
DT(3, "pipeline-decode: " << *trace);
|
|
|
|
// insert to ibuffer
|
|
ibuffer.push(trace);
|
|
|
|
decode_latch_.pop();
|
|
}
|
|
|
|
void Core::issue() {
|
|
// operands to dispatch
|
|
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
|
auto& operand = operands_.at(i);
|
|
if (operand->Output.empty())
|
|
continue;
|
|
auto trace = operand->Output.front();
|
|
if (dispatchers_.at((int)trace->exe_type)->push(i, trace)) {
|
|
operand->Output.pop();
|
|
trace->log_once(false);
|
|
} else {
|
|
if (!trace->log_once(true)) {
|
|
DT(3, "*** dispatch-stall: " << *trace);
|
|
}
|
|
}
|
|
}
|
|
|
|
// issue ibuffer instructions
|
|
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
|
auto& ibuffer = ibuffers_.at(i);
|
|
if (ibuffer.empty())
|
|
continue;
|
|
|
|
auto trace = ibuffer.top();
|
|
|
|
// check scoreboard
|
|
if (scoreboard_.in_use(trace)) {
|
|
if (!trace->log_once(true)) {
|
|
DTH(3, "*** scoreboard-stall: dependents={");
|
|
auto uses = scoreboard_.get_uses(trace);
|
|
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
|
auto& use = uses.at(j);
|
|
__unused (use);
|
|
if (j) DTN(3, ", ");
|
|
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
|
|
}
|
|
DTN(3, "}, " << *trace << std::endl);
|
|
}
|
|
++perf_stats_.scrb_stalls;
|
|
continue;
|
|
} else {
|
|
trace->log_once(false);
|
|
}
|
|
|
|
// update scoreboard
|
|
if (trace->wb) {
|
|
scoreboard_.reserve(trace);
|
|
}
|
|
|
|
DT(3, "pipeline-scoreboard: " << *trace);
|
|
|
|
// to operand stage
|
|
operands_.at(i)->Input.send(trace, 1);
|
|
|
|
ibuffer.pop();
|
|
}
|
|
}
|
|
|
|
void Core::execute() {
|
|
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
|
|
auto& dispatch = dispatchers_.at(i);
|
|
auto& exe_unit = exe_units_.at(i);
|
|
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
|
|
if (dispatch->Outputs.at(j).empty())
|
|
continue;
|
|
auto trace = dispatch->Outputs.at(j).front();
|
|
exe_unit->Inputs.at(j).send(trace, 1);
|
|
dispatch->Outputs.at(j).pop();
|
|
}
|
|
}
|
|
}
|
|
|
|
void Core::commit() {
|
|
// process completed instructions
|
|
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
|
auto trace = committed_traces_.at(i);
|
|
if (!trace)
|
|
continue;
|
|
committed_traces_.at(i) = nullptr;
|
|
|
|
// advance to commit stage
|
|
DT(3, "pipeline-commit: " << *trace);
|
|
assert(trace->cid == core_id_);
|
|
|
|
// update scoreboard
|
|
if (trace->eop) {
|
|
if (trace->wb) {
|
|
scoreboard_.release(trace);
|
|
}
|
|
|
|
assert(committed_instrs_ <= issued_instrs_);
|
|
++committed_instrs_;
|
|
|
|
perf_stats_.instrs += trace->tmask.count();
|
|
}
|
|
|
|
// delete the trace
|
|
delete trace;
|
|
}
|
|
|
|
// select completed instructions
|
|
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
|
|
uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX;
|
|
auto& exe_unit = exe_units_.at(ii);
|
|
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
|
|
auto committed_trace = committed_traces_.at(j);
|
|
if (committed_trace)
|
|
continue;
|
|
auto& output = exe_unit->Outputs.at(j);
|
|
if (output.empty())
|
|
continue;
|
|
auto trace = output.front();
|
|
committed_traces_.at(j) = trace;
|
|
output.pop();
|
|
}
|
|
}
|
|
++commit_exe_;
|
|
}
|
|
|
|
void Core::wspawn(uint32_t num_warps, Word nextPC) {
|
|
uint32_t active_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
|
|
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
|
|
for (uint32_t i = 1; i < active_warps; ++i) {
|
|
auto warp = warps_.at(i);
|
|
warp->setPC(nextPC);
|
|
warp->setTmask(0, true);
|
|
active_warps_.set(i);
|
|
}
|
|
}
|
|
|
|
void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
|
|
uint32_t bar_idx = bar_id & 0x7fffffff;
|
|
bool is_global = (bar_id >> 31);
|
|
|
|
auto& barrier = barriers_.at(bar_idx);
|
|
barrier.set(warp_id);
|
|
DP(3, "*** Suspend core #" << core_id_ << ", warp #" << warp_id << " at barrier #" << bar_idx);
|
|
|
|
if (is_global) {
|
|
// global barrier handling
|
|
if (barrier.count() == active_warps_.count()) {
|
|
cluster_->barrier(bar_idx, count, core_id_);
|
|
barrier.reset();
|
|
}
|
|
} else {
|
|
// local barrier handling
|
|
if (barrier.count() == (size_t)count) {
|
|
// resume suspended warps
|
|
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
|
if (barrier.test(i)) {
|
|
DP(3, "*** Resume core #" << core_id_ << ", warp #" << i << " at barrier #" << bar_idx);
|
|
stalled_warps_.reset(i);
|
|
}
|
|
}
|
|
barrier.reset();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Reads `size` bytes at `addr` into `data` through the memory unit.
void Core::icache_read(void *data, uint64_t addr, uint32_t size) {
  mmu_.read(data, addr, size, 0);
}
|
|
|
|
// Classifies an address.
//   - Shared: shared memory is enabled and addr lies inside
//             [SMEM_BASE_ADDR, SMEM_BASE_ADDR + 2^SMEM_LOG_SIZE)
//   - IO:     addr is at or above IO_BASE_ADDR
//   - Global: everything else
AddrType Core::get_addr_type(uint64_t addr) {
  if (SM_ENABLED) {
    // Compute the window in 64-bit arithmetic: a plain `1 << SMEM_LOG_SIZE`
    // is an int shift (overflows for sizes >= 31) and the end bound could
    // wrap around if the sum were evaluated in 32 bits.
    const uint64_t smem_begin = SMEM_BASE_ADDR;
    const uint64_t smem_end = smem_begin + (uint64_t(1) << SMEM_LOG_SIZE);
    if (addr >= smem_begin && addr < smem_end) {
      return AddrType::Shared;
    }
  }
  if (addr >= IO_BASE_ADDR) {
    return AddrType::IO;
  }
  return AddrType::Global;
}
|
|
|
|
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
|
auto type = this->get_addr_type(addr);
|
|
if (type == AddrType::Shared) {
|
|
sharedmem_->read(data, addr, size);
|
|
} else {
|
|
mmu_.read(data, addr, size, 0);
|
|
}
|
|
|
|
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
|
|
}
|
|
|
|
void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
|
|
auto type = this->get_addr_type(addr);
|
|
if (addr >= uint64_t(IO_COUT_ADDR)
|
|
&& addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
|
this->writeToStdOut(data, addr, size);
|
|
} else {
|
|
if (type == AddrType::Shared) {
|
|
sharedmem_->write(data, addr, size);
|
|
} else {
|
|
mmu_.write(data, addr, size, 0);
|
|
}
|
|
}
|
|
DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
|
|
}
|
|
|
|
void Core::dcache_amo_reserve(uint64_t addr) {
|
|
auto type = this->get_addr_type(addr);
|
|
if (type == AddrType::Global) {
|
|
mmu_.amo_reserve(addr);
|
|
}
|
|
}
|
|
|
|
bool Core::dcache_amo_check(uint64_t addr) {
|
|
auto type = this->get_addr_type(addr);
|
|
if (type == AddrType::Global) {
|
|
return mmu_.amo_check(addr);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void Core::writeToStdOut(const void* data, uint64_t addr, uint32_t size) {
|
|
if (size != 1)
|
|
std::abort();
|
|
uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
|
|
auto& ss_buf = print_bufs_[tid];
|
|
char c = *(char*)data;
|
|
ss_buf << c;
|
|
if (c == '\n') {
|
|
std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush;
|
|
ss_buf.str("");
|
|
}
|
|
}
|
|
|
|
void Core::cout_flush() {
|
|
for (auto& buf : print_bufs_) {
|
|
auto str = buf.second.str();
|
|
if (!str.empty()) {
|
|
std::cout << "#" << buf.first << ": " << str << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|
switch (addr) {
|
|
case VX_CSR_SATP:
|
|
case VX_CSR_PMPCFG0:
|
|
case VX_CSR_PMPADDR0:
|
|
case VX_CSR_MSTATUS:
|
|
case VX_CSR_MISA:
|
|
case VX_CSR_MEDELEG:
|
|
case VX_CSR_MIDELEG:
|
|
case VX_CSR_MIE:
|
|
case VX_CSR_MTVEC:
|
|
case VX_CSR_MEPC:
|
|
case VX_CSR_MNSTATUS:
|
|
return 0;
|
|
|
|
case VX_CSR_FFLAGS:
|
|
return fcsrs_.at(wid) & 0x1F;
|
|
case VX_CSR_FRM:
|
|
return (fcsrs_.at(wid) >> 5);
|
|
case VX_CSR_FCSR:
|
|
return fcsrs_.at(wid);
|
|
case VX_CSR_MHARTID: // global thread ID
|
|
return (core_id_ * arch_.num_warps() + wid) * arch_.num_threads() + tid;
|
|
case VX_CSR_THREAD_ID: // thread ID
|
|
return tid;
|
|
case VX_CSR_WARP_ID: // warp ID
|
|
return wid;
|
|
case VX_CSR_CORE_ID: // core ID
|
|
return core_id_;
|
|
case VX_CSR_THREAD_MASK: // thread mask
|
|
return warps_.at(wid)->getTmask();
|
|
case VX_CSR_WARP_MASK: // active warps
|
|
return active_warps_.to_ulong();
|
|
case VX_CSR_NUM_THREADS: // Number of threads per warp
|
|
return arch_.num_threads();
|
|
case VX_CSR_NUM_WARPS: // Number of warps per core
|
|
return arch_.num_warps();
|
|
case VX_CSR_NUM_CORES: // Number of cores per cluster
|
|
return uint32_t(arch_.num_cores()) * arch_.num_clusters();
|
|
case VX_CSR_MCYCLE: // NumCycles
|
|
return perf_stats_.cycles & 0xffffffff;
|
|
case VX_CSR_MCYCLE_H: // NumCycles
|
|
return (uint32_t)(perf_stats_.cycles >> 32);
|
|
case VX_CSR_MINSTRET: // NumInsts
|
|
return perf_stats_.instrs & 0xffffffff;
|
|
case VX_CSR_MINSTRET_H: // NumInsts
|
|
return (uint32_t)(perf_stats_.instrs >> 32);
|
|
default:
|
|
if ((addr >= VX_CSR_MPM_BASE && addr < (VX_CSR_MPM_BASE + 32))
|
|
|| (addr >= VX_CSR_MPM_BASE_H && addr < (VX_CSR_MPM_BASE_H + 32))) {
|
|
// user-defined MPM CSRs
|
|
auto perf_class = dcrs_.base_dcrs.read(VX_DCR_BASE_MPM_CLASS);
|
|
switch (perf_class) {
|
|
case VX_DCR_MPM_CLASS_NONE:
|
|
break;
|
|
case VX_DCR_MPM_CLASS_CORE: {
|
|
switch (addr) {
|
|
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
|
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
|
|
case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32;
|
|
case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32;
|
|
case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32;
|
|
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32;
|
|
|
|
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
|
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
|
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
|
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
|
|
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
|
|
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
|
|
case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff;
|
|
case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32;
|
|
case VX_CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff;
|
|
case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
|
|
}
|
|
} break;
|
|
case VX_DCR_MPM_CLASS_MEM: {
|
|
auto proc_perf = cluster_->processor()->perf_stats();
|
|
switch (addr) {
|
|
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
|
|
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
|
|
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
|
|
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
|
|
|
|
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
|
|
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
|
|
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
|
|
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
|
|
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
|
|
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
|
|
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
|
|
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
|
|
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32;
|
|
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32;
|
|
|
|
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
|
|
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
|
|
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
|
|
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
|
|
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
|
|
|
|
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
|
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
|
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
|
|
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
|
|
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
|
|
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
|
|
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
|
|
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
|
|
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
|
|
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
|
|
|
|
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
|
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
|
case VX_CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
|
|
case VX_CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
|
|
case VX_CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
|
|
case VX_CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
|
|
case VX_CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
|
|
case VX_CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
|
|
case VX_CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_L3CACHE_BANK_ST_H:return proc_perf.l3cache.bank_stalls >> 32;
|
|
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
|
|
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
|
|
|
|
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
|
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
|
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
|
case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32;
|
|
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
|
|
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
|
|
}
|
|
} break;
|
|
}
|
|
} else {
|
|
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
|
std::abort();
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void Core::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid) {
|
|
__unused (tid);
|
|
switch (addr) {
|
|
case VX_CSR_FFLAGS:
|
|
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
|
|
break;
|
|
case VX_CSR_FRM:
|
|
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5);
|
|
break;
|
|
case VX_CSR_FCSR:
|
|
fcsrs_.at(wid) = value & 0xff;
|
|
break;
|
|
case VX_CSR_SATP:
|
|
case VX_CSR_MSTATUS:
|
|
case VX_CSR_MEDELEG:
|
|
case VX_CSR_MIDELEG:
|
|
case VX_CSR_MIE:
|
|
case VX_CSR_MTVEC:
|
|
case VX_CSR_MEPC:
|
|
case VX_CSR_PMPCFG0:
|
|
case VX_CSR_PMPADDR0:
|
|
case VX_CSR_MNSTATUS:
|
|
break;
|
|
default:
|
|
{
|
|
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
|
|
std::abort();
|
|
}
|
|
}
|
|
}
|
|
|
|
// ecall handler: flags the core as exited and deactivates every warp.
void Core::trigger_ecall() {
  exited_ = true;
  active_warps_.reset();
}
|
|
|
|
// ebreak handler: flags the core as exited and deactivates every warp.
void Core::trigger_ebreak() {
  exited_ = true;
  active_warps_.reset();
}
|
|
|
|
// Reports whether the core has exited. When it has, *exitcode is set from
// integer register 3 of warp 0: the raw value normally, or (1 - value) when
// riscv_test is set. *exitcode is left untouched while the core is running.
bool Core::check_exit(Word* exitcode, bool riscv_test) const {
  if (!exited_)
    return false;

  const Word ec = warps_.at(0)->getIRegValue(3);
  *exitcode = riscv_test ? (1 - ec) : ec;
  return true;
}
|
|
|
|
bool Core::running() const {
|
|
return (committed_instrs_ != issued_instrs_);
|
|
}
|
|
|
|
// Clears the stalled flag of every warp.
void Core::resume() {
  stalled_warps_.reset();
}
|
|
|
|
// Binds the RAM to the memory unit over the address range starting at 0 and
// ending at the largest address representable for the configured XLEN.
void Core::attach_ram(RAM* ram) {
#if (XLEN == 64)
  const uint64_t last_addr = 0xFFFFFFFFFFFFFFFF;
#else
  const uint64_t last_addr = 0xFFFFFFFF;
#endif
  // bind RAM to memory unit
  mmu_.attach(*ram, 0, last_addr);
}
|