cumulative fixes, RTL uuid trace, texture unit fixes, simx timing fixes

This commit is contained in:
Blaise Tine
2021-11-30 07:08:15 -05:00
parent b995843a5b
commit 41d7e6c63a
79 changed files with 2148 additions and 1372 deletions

View File

@@ -10,64 +10,78 @@
using namespace vortex;
// NOTE(review): this listing looks like a diff whose +/- markers were stripped, so
// statements appear twice; presumably the inputs_/schedule_output() lines are the
// removed revision and the Input/Output lines the added one — confirm against the
// repository before treating this span as compilable code.
// NopUnit constructors: both forward the unit name "NOP" to the ExeUnit base.
NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
// Pass-through step: returns immediately when the input is empty; otherwise takes
// the trace at the head of the input, forwards it to the output with a second
// argument of 1 (presumably a delay in cycles), and removes it from the input.
void NopUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
if (Input.empty())
return;
auto trace = inputs_.top();
this->schedule_output(trace, 1);
inputs_.pop();
auto trace = Input.front();
Output.send(trace, 1);
Input.pop();
}
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): two constructor headers are interleaved here (this looks like a
// diff with its +/- markers stripped); presumably the (Core*) form is the removed
// revision and the (const SimContext&, Core*) form the added one — confirm.
// Constructs the load/store unit named "LSU": reads the thread count from
// core->arch().num_threads(), sizes pending_dcache_ with LSUQ_SIZE, and starts
// with fence_lock_ cleared.
LsuUnit::LsuUnit(Core* core)
: ExeUnit("LSU")
, core_(core)
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, num_threads_(core->arch().num_threads())
, pending_dcache_(LSUQ_SIZE)
, fence_lock_(false)
{}
// One step of the load/store unit. In order: drain at most one dcache response
// and one shared-memory response per thread, release a pending fence once no
// memory operation is outstanding, then try to issue the trace at the head of the
// input (fence, or memory request fan-out across active threads).
// NOTE(review): this listing looks like a diff whose +/- markers were stripped, so
// old and new statements are interleaved (inputs_/schedule_output vs Input/Output);
// it is not compilable as shown — confirm against the repository.
void LsuUnit::step(uint64_t cycle) {
__unused (cycle);
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.top();
auto& mem_rsp = dcache_rsp_port.front();
// entry is a (trace, outstanding-response-count) pair looked up by the response tag
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
trace->dcache_latency = latency;
this->schedule_output(trace, 1);
// last outstanding response: forward the trace and free its tag
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
// handle shared memory response
// (same bookkeeping as the dcache loop; shared-memory requests share pending_dcache_ tags)
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_dcache_.empty())
return;
this->schedule_output(fence_state_, 1);
Output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, cycle, "fence-unlock: " << fence_state_);
}
// check input queue
if (inputs_.empty())
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
if (trace->lsu.type == LsuType::FENCE) {
// schedule fence lock
// NOTE(review): the "@@" line below is a diff hunk header; any source lines between
// the two hunks (presumably where fence_state_ is assigned) are not shown here.
@@ -75,179 +89,188 @@ void LsuUnit::step(uint64_t cycle) {
fence_lock_ = true;
DT(3, cycle, "fence-lock: " << *trace);
// remove input
inputs_.pop();
// Input.pop() returns a value that is subtracted from the current cycle and
// accumulated into lsu_stalls (presumably the cycle the trace was enqueued — confirm)
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
return;
}
// check pending queue capacity
if (!trace->check_stalled(pending_dcache_.full())) {
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
}
if (pending_dcache_.full())
// check pending queue capacity
if (pending_dcache_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
}
return;
// send memory request
bool has_shared_memory = false;
bool mem_rsp_pending = false;
} else {
trace->resume();
}
bool is_write = (trace->lsu.type == LsuType::STORE);
uint32_t valid_addrs = 0;
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
// duplicates detection
// Only attempted when thread 0 is active: compares the first address of every
// active thread, masked down to a Word-aligned address, against thread 0's.
// is_dup is set when all active threads match.
bool is_dup = false;
if (trace->tmask.test(0)) {
uint64_t addr_mask = sizeof(Word)-1;
Word addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
// Number of responses to wait for: 1 when all threads hit the same word,
// otherwise the total number of addresses across all per-thread lists.
// NOTE(review): the .at(0) request path below issues one request per active thread,
// while this sum counts every address in every list; if any list holds more than
// one address the pending count would never reach zero — confirm lists are size <= 1.
uint32_t valid_addrs = 0;
if (is_dup) {
valid_addrs = 1;
} else {
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
}
trace->dcache_latency = SimPlatform::instance().cycles();
auto tag = pending_dcache_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
auto mem_addr = trace->mem_addrs.at(t).at(0);
auto type = get_addr_type(mem_addr.addr, mem_addr.size);
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
for (auto mem_addr : trace->mem_addrs.at(t)) {
// check shared memory address
if (SM_ENABLE) {
if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE))
&& (mem_addr < SMEM_BASE_ADDR)) {
DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
has_shared_memory = true;
continue;
}
}
bool is_io = (mem_addr >= IO_BASE_ADDR);
MemReq mem_req;
mem_req.addr = mem_addr;
mem_req.write = is_write;
mem_req.tag = tag;
mem_req.is_io = is_io;
dcache_req_port.send(mem_req, 1);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace);
// do not wait on writes
mem_rsp_pending = !is_write;
}
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.tag = tag;
mem_req.is_io = (type == AddrType::IO);
// route by address type: Shared goes to the shared-memory input port of this
// thread, everything else to the thread's dcache request port
if (type == AddrType::Shared) {
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
} else {
dcache_req_port.send(mem_req, 2);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace);
}
// a duplicate access needs only the single request just sent
if (is_dup)
break;
}
// do not wait
if (!mem_rsp_pending) {
// do not wait on writes
// NOTE(review): the tag is released immediately although requests carrying it were
// just sent; presumably stores never produce a response that looks the tag up — confirm.
if (is_write) {
pending_dcache_.release(tag);
uint32_t delay = 1;
if (has_shared_memory) {
// all threads accessed shared memory
delay += Constants::SMEM_DELAY;
}
this->schedule_output(trace, delay);
Output.send(trace, 1);
}
// remove input
inputs_.pop();
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
}
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): this listing looks like a diff whose +/- markers were stripped, so
// two revisions are interleaved (inputs_/schedule_output vs Input/Output, and two
// step() headers); it is not compilable as shown — confirm against the repository.
// AluUnit constructors: both forward the unit name "ALU" to the ExeUnit base.
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
// Issues the trace at the head of the input to the output, choosing the second
// send argument by trace->alu.type:
//   ARITH / BRANCH / CMOV -> 1
//   IMUL                  -> LATENCY_IMUL (LATENCY_IMUL+1 in the Output.send form)
//   IDIV                  -> XLEN (XLEN+1 in the Output.send form)
// Any other type aborts the process.
void AluUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
void AluUnit::step(uint64_t cycle) {
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
switch (trace->alu.type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::CMOV:
this->schedule_output(trace, 1);
inputs_.pop();
Output.send(trace, 1);
break;
case AluType::IMUL:
this->schedule_output(trace, LATENCY_IMUL);
inputs_.pop();
Output.send(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
this->schedule_output(trace, XLEN);
inputs_.pop();
Output.send(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
// a trace flagged fetch_stall clears its warp's bit in the core's stalled_warps_
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
// Input.pop() returns a value that is subtracted from the current cycle and
// accumulated into alu_stalls (presumably the enqueue cycle — confirm)
auto time = Input.pop();
core_->perf_stats_.alu_stalls += (cycle - time);
}
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): this listing looks like a diff whose +/- markers were stripped, so
// two revisions are interleaved (inputs_/schedule_output vs Input/Output, and two
// step() headers); it is not compilable as shown — confirm against the repository.
// CsrUnit constructors: both forward the unit name "CSR" to the ExeUnit base.
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
// Forwards the trace at the head of the input to the output with a second
// argument of 1, removes it from the input, and (in the Input/Output form) adds
// cycle minus the value returned by Input.pop() to csr_stalls before logging.
void CsrUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
void CsrUnit::step(uint64_t cycle) {
if (Input.empty())
return;
auto trace = inputs_.top();
this->schedule_output(trace, 1);
inputs_.pop();
auto trace = Input.front();
Output.send(trace, 1);
auto time = Input.pop();
core_->perf_stats_.csr_stalls += (cycle - time);
DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
}
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): this listing looks like a diff whose +/- markers were stripped, so
// two revisions are interleaved (inputs_/schedule_output vs Input/Output, two
// step() headers, and two closing braces); it is not compilable as shown — confirm
// against the repository.
// FpuUnit constructors: both forward the unit name "FPU" to the ExeUnit base.
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
// Issues the trace at the head of the input to the output, choosing the second
// send argument by trace->fpu.type:
//   FNCP  -> 1 (2 in the Output.send form)
//   FMA   -> LATENCY_FMA   (+1 in the Output.send form)
//   FDIV  -> LATENCY_FDIV  (+1 in the Output.send form)
//   FSQRT -> LATENCY_FSQRT (+1 in the Output.send form)
//   FCVT  -> LATENCY_FCVT  (+1 in the Output.send form)
// Any other type aborts the process.
void FpuUnit::step(uint64_t /*cycle*/) {
if (inputs_.empty())
void FpuUnit::step(uint64_t cycle) {
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
switch (trace->fpu.type) {
case FpuType::FNCP:
this->schedule_output(trace, 1);
inputs_.pop();
Output.send(trace, 2);
break;
case FpuType::FMA:
this->schedule_output(trace, LATENCY_FMA);
inputs_.pop();
Output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
this->schedule_output(trace, LATENCY_FDIV);
inputs_.pop();
Output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
this->schedule_output(trace, LATENCY_FSQRT);
inputs_.pop();
Output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
this->schedule_output(trace, LATENCY_FCVT);
inputs_.pop();
Output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
}
DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
// Input.pop() returns a value that is subtracted from the current cycle and
// accumulated into fpu_stalls (presumably the enqueue cycle — confirm)
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (cycle - time);
}
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): two constructor headers are interleaved here (this looks like a
// diff with its +/- markers stripped); presumably the (Core*) form is the removed
// revision and the (const SimContext&, Core*) form the added one — confirm.
// Constructs the unit named "GPU": reads the thread count from
// core->arch().num_threads() and sizes pending_tex_reqs_ with TEXQ_SIZE.
GpuUnit::GpuUnit(Core* core)
: ExeUnit("GPU")
, core_(core)
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "GPU")
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
// One step of the GPU unit. When EXT_TEX_ENABLE is defined it first drains at
// most one texture memory response per thread (dcache switch response port 1);
// then it dispatches the trace at the head of the input by trace->gpu.type.
// NOTE(review): this listing looks like a diff whose +/- markers were stripped, so
// old and new statements are interleaved (inputs_/schedule_output vs Input/Output,
// duplicated case labels); it is not compilable as shown — confirm against the
// repository.
void GpuUnit::step(uint64_t cycle) {
__unused (cycle);
#ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.top();
auto& mem_rsp = dcache_rsp_port.front();
// entry is a (trace, outstanding-response-count) pair looked up by the response tag
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
trace->dcache_latency = latency;
this->schedule_output(trace, 1);
// last outstanding response: forward the trace and free its tag
if (0 == entry.second) {
Output.send(trace, 1);
pending_tex_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
// NOTE(review): the "@@" line below is a diff hunk header; any source lines between
// the two hunks (presumably the closing brace of the loop above) are not shown here.
@@ -255,38 +278,67 @@ void GpuUnit::step(uint64_t cycle) {
#endif
// check input queue
if (inputs_.empty())
if (Input.empty())
return;
auto trace = inputs_.top();
auto trace = Input.front();
// set when the trace was sent on and may be removed from the input
bool issued = false;
switch (trace->gpu.type) {
case GpuType::TMC:
Output.send(trace, 1);
// copy this warp's bit from the trace's warp mask into the core's active mask
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
issued = true;
break;
case GpuType::WSPAWN:
Output.send(trace, 1);
// replace the core's whole active-warp mask with the trace's mask
core_->active_warps_ = trace->gpu.active_warps;
issued = true;
break;
case GpuType::SPLIT:
case GpuType::JOIN:
case GpuType::BAR:
this->schedule_output(trace, 1);
inputs_.pop();
Output.send(trace, 1);
issued = true;
break;
case GpuType::TEX: {
case GpuType::BAR:
Output.send(trace, 1);
// non-zero mask: OR those warps into the active set; zero mask: clear this warp's bit
if (trace->gpu.active_warps != 0)
core_->active_warps_ |= trace->gpu.active_warps;
else
core_->active_warps_.reset(trace->wid);
issued = true;
break;
case GpuType::TEX:
// the trace stays at the head of the input until processTexRequest() returns true
if (this->processTexRequest(cycle, trace))
inputs_.pop();
} break;
issued = true;
break;
default:
std::abort();
}
if (issued) {
DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
// NOTE(review): the GPU unit accumulates into fpu_stalls, the same counter FpuUnit
// uses, while ALU/CSR/LSU each use their own; this looks like a copy-paste —
// confirm whether a GPU-specific stall counter was intended.
core_->perf_stats_.fpu_stalls += (cycle - time);
}
}
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
__unused (cycle);
// check pending queue capacity
if (!trace->check_stalled(pending_tex_reqs_.full())) {
DT(3, cycle, "*** tex-queue-stall: " << *trace);
}
if (pending_tex_reqs_.full())
// check pending queue capacity
if (pending_tex_reqs_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** tex-queue-stall: " << *trace);
}
return false;
} else {
trace->resume();
}
// send memory request
@@ -295,7 +347,6 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
valid_addrs += mem_addr.size();
}
trace->tex_latency = SimPlatform::instance().cycles();
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
@@ -305,12 +356,14 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
for (auto mem_addr : trace->mem_addrs.at(t)) {
MemReq mem_req;
mem_req.addr = mem_addr;
mem_req.addr = mem_addr.addr;
mem_req.write = (trace->lsu.type == LsuType::STORE);
mem_req.tag = tag;
dcache_req_port.send(mem_req, 1);
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag
dcache_req_port.send(mem_req, 3);
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", tid=" << t << ", "<< trace);
++ core_->perf_stats_.tex_reads;
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
}
}