fixes: texture unit mem access sometimes going to smem, bilinear texture filtering; new: cache req_id,
This commit is contained in:
@@ -6,16 +6,18 @@
|
||||
#include <util.h>
|
||||
#include "debug.h"
|
||||
#include "core.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
|
||||
|
||||
void NopUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
this->schedule_output(state, 1);
|
||||
auto trace = inputs_.top();
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -33,19 +35,23 @@ void LsuUnit::step(uint64_t cycle) {
|
||||
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
MemRsp mem_rsp;
|
||||
if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp))
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first);
|
||||
assert(entry.second.test(t));
|
||||
entry.second.reset(t); // track remaining blocks
|
||||
if (!entry.second.any()) {
|
||||
auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
|
||||
entry.first.dcache_latency = latency;
|
||||
this->schedule_output(entry.first, 1);
|
||||
auto& mem_rsp = dcache_rsp_port.top();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
|
||||
trace->dcache_latency = latency;
|
||||
this->schedule_output(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
}
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
|
||||
if (fence_lock_) {
|
||||
@@ -61,36 +67,83 @@ void LsuUnit::step(uint64_t cycle) {
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
|
||||
auto state = inputs_.top();
|
||||
auto trace = inputs_.top();
|
||||
|
||||
if (state.lsu.type == LsuType::FENCE) {
|
||||
if (trace->lsu.type == LsuType::FENCE) {
|
||||
// schedule fence lock
|
||||
fence_state_ = state;
|
||||
fence_lock_ = true;
|
||||
inputs_.pop();
|
||||
DT(3, cycle, "fence-lock: " << state);
|
||||
fence_state_ = trace;
|
||||
fence_lock_ = true;
|
||||
DT(3, cycle, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
inputs_.pop();
|
||||
return;
|
||||
}
|
||||
|
||||
// check pending queue capacity
|
||||
if (pending_dcache_.full()) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << state);
|
||||
if (!trace->check_stalled(pending_dcache_.full())) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
|
||||
}
|
||||
if (pending_dcache_.full())
|
||||
return;
|
||||
|
||||
// send memory request
|
||||
|
||||
bool has_shared_memory = false;
|
||||
bool mem_rsp_pending = false;
|
||||
bool is_write = (trace->lsu.type == LsuType::STORE);
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
|
||||
trace->dcache_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_dcache_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
// check shared memory address
|
||||
if (SM_ENABLE) {
|
||||
if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE))
|
||||
&& (mem_addr < SMEM_BASE_ADDR)) {
|
||||
DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
|
||||
has_shared_memory = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_io = (mem_addr >= IO_BASE_ADDR);
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.tag = tag;
|
||||
mem_req.is_io = is_io;
|
||||
dcache_req_port.send(mem_req, 1);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace);
|
||||
// do not wait on writes
|
||||
mem_rsp_pending = !is_write;
|
||||
}
|
||||
}
|
||||
|
||||
// send dcache request
|
||||
state.dcache_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_dcache_.allocate({state, state.tmask});
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!state.tmask.test(t))
|
||||
continue;
|
||||
MemReq mem_req;
|
||||
mem_req.addr = state.mem_addrs.at(t);
|
||||
mem_req.write = (state.lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state);
|
||||
}
|
||||
// do not wait
|
||||
if (!mem_rsp_pending) {
|
||||
pending_dcache_.release(tag);
|
||||
uint32_t delay = 1;
|
||||
if (has_shared_memory) {
|
||||
// all threads accessed shared memory
|
||||
delay += Constants::SMEM_DELAY;
|
||||
}
|
||||
this->schedule_output(trace, delay);
|
||||
}
|
||||
|
||||
// remove input
|
||||
inputs_.pop();
|
||||
}
|
||||
|
||||
@@ -98,23 +151,27 @@ void LsuUnit::step(uint64_t cycle) {
|
||||
|
||||
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
|
||||
|
||||
void AluUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
void AluUnit::step(uint64_t /*cycle*/) {
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
switch (state.alu.type) {
|
||||
case AluType::ARITH:
|
||||
this->schedule_output(state, 1);
|
||||
break;
|
||||
auto trace = inputs_.top();
|
||||
switch (trace->alu.type) {
|
||||
case AluType::ARITH:
|
||||
case AluType::BRANCH:
|
||||
this->schedule_output(state, 1);
|
||||
case AluType::CMOV:
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case AluType::IMUL:
|
||||
this->schedule_output(state, LATENCY_IMUL);
|
||||
this->schedule_output(trace, LATENCY_IMUL);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case AluType::IDIV:
|
||||
this->schedule_output(state, XLEN);
|
||||
this->schedule_output(trace, XLEN);
|
||||
inputs_.pop();
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,10 +180,11 @@ void AluUnit::step(uint64_t /*cycle*/) {
|
||||
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
|
||||
|
||||
void CsrUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
this->schedule_output(state, 1);
|
||||
auto trace = inputs_.top();
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -134,46 +192,127 @@ void CsrUnit::step(uint64_t /*cycle*/) {
|
||||
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
|
||||
|
||||
void FpuUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
switch (state.fpu.type) {
|
||||
auto trace = inputs_.top();
|
||||
switch (trace->fpu.type) {
|
||||
case FpuType::FNCP:
|
||||
this->schedule_output(state, 1);
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FMA:
|
||||
this->schedule_output(state, LATENCY_FMA);
|
||||
this->schedule_output(trace, LATENCY_FMA);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FDIV:
|
||||
this->schedule_output(state, LATENCY_FDIV);
|
||||
this->schedule_output(trace, LATENCY_FDIV);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FSQRT:
|
||||
this->schedule_output(state, LATENCY_FSQRT);
|
||||
this->schedule_output(trace, LATENCY_FSQRT);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FCVT:
|
||||
this->schedule_output(state, LATENCY_FCVT);
|
||||
this->schedule_output(trace, LATENCY_FCVT);
|
||||
inputs_.pop();
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
|
||||
GpuUnit::GpuUnit(Core* core)
|
||||
: ExeUnit("GPU")
|
||||
, core_(core)
|
||||
, num_threads_(core->arch().num_threads())
|
||||
, pending_tex_reqs_(TEXQ_SIZE)
|
||||
{}
|
||||
|
||||
void GpuUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
void GpuUnit::step(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// handle memory response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.top();
|
||||
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
|
||||
trace->dcache_latency = latency;
|
||||
this->schedule_output(trace, 1);
|
||||
pending_tex_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
#endif
|
||||
|
||||
// check input queue
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
switch (state.gpu.type) {
|
||||
|
||||
auto trace = inputs_.top();
|
||||
|
||||
switch (trace->gpu.type) {
|
||||
case GpuType::TMC:
|
||||
case GpuType::WSPAWN:
|
||||
case GpuType::SPLIT:
|
||||
case GpuType::JOIN:
|
||||
case GpuType::BAR:
|
||||
this->schedule_output(state, 1);
|
||||
break;
|
||||
case GpuType::TEX:
|
||||
/* TODO */
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case GpuType::TEX: {
|
||||
if (this->processTexRequest(cycle, trace))
|
||||
inputs_.pop();
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
||||
__unused (cycle);
|
||||
|
||||
// check pending queue capacity
|
||||
if (!trace->check_stalled(pending_tex_reqs_.full())) {
|
||||
DT(3, cycle, "*** tex-queue-stall: " << *trace);
|
||||
}
|
||||
if (pending_tex_reqs_.full())
|
||||
return false;
|
||||
|
||||
// send memory request
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
|
||||
trace->tex_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr;
|
||||
mem_req.write = (trace->lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
dcache_req_port.send(mem_req, 1);
|
||||
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", tid=" << t << ", "<< trace);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
Reference in New Issue
Block a user