cumulative fixes, RTL uuid trace, texture unit fixes, simx timing fixes

2021-11-30 07:08:15 -05:00
parent b995843a5b
commit 41d7e6c63a
79 changed files with 2148 additions and 1372 deletions
--- a/sim/simX/exeunit.cpp
+++ b/sim/simX/exeunit.cpp
@@ -10,64 +10,78 @@

 using namespace vortex;

-NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
+NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
    
 void NopUnit::step(uint64_t /*cycle*/) {
-    if (inputs_.empty()) 
+    if (Input.empty()) 
        return;
-    auto trace = inputs_.top();
-    this->schedule_output(trace, 1);
-    inputs_.pop();
+    auto trace = Input.front();
+    Output.send(trace, 1);
+    Input.pop();
 }

 ///////////////////////////////////////////////////////////////////////////////

-LsuUnit::LsuUnit(Core* core) 
-    : ExeUnit("LSU")
-    , core_(core)
+LsuUnit::LsuUnit(const SimContext& ctx, Core* core) 
+    : ExeUnit(ctx, core, "LSU")
    , num_threads_(core->arch().num_threads()) 
    , pending_dcache_(LSUQ_SIZE)
    , fence_lock_(false)
 {}

 void LsuUnit::step(uint64_t cycle) {
-    __unused (cycle);
-
    // handle dcache response
    for (uint32_t t = 0; t < num_threads_; ++t) {
        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
        if (dcache_rsp_port.empty())
            continue;
-        auto& mem_rsp = dcache_rsp_port.top();
+        auto& mem_rsp = dcache_rsp_port.front();
        auto& entry = pending_dcache_.at(mem_rsp.tag);          
        auto trace = entry.first;
        DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
            << ", tid=" << t << ", " << *trace);  
        assert(entry.second);
        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {        
-            auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
-            trace->dcache_latency = latency;
-            this->schedule_output(trace, 1);
+        if (0 == entry.second) {
+            Output.send(trace, 1);
            pending_dcache_.release(mem_rsp.tag);
        } 
        dcache_rsp_port.pop();  
    }

+    // handle shared memory response
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
+        if (smem_rsp_port.empty())
+            continue;
+        auto& mem_rsp = smem_rsp_port.front();
+        auto& entry = pending_dcache_.at(mem_rsp.tag);          
+        auto trace = entry.first;
+        DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
+            << ", tid=" << t << ", " << *trace);  
+        assert(entry.second);
+        --entry.second; // track remaining blocks 
+        if (0 == entry.second) {
+            Output.send(trace, 1);
+            pending_dcache_.release(mem_rsp.tag);
+        } 
+        smem_rsp_port.pop();  
+    }
+
    if (fence_lock_) {
        // wait for all pending memory operations to complete
        if (!pending_dcache_.empty())
            return;
-        this->schedule_output(fence_state_, 1);
+        Output.send(fence_state_, 1);
        fence_lock_ = false;
        DT(3, cycle, "fence-unlock: " << fence_state_);
    }

    // check input queue
-    if (inputs_.empty())
+    if (Input.empty())
        return;

-    auto trace = inputs_.top();
+    auto trace = Input.front();

    if (trace->lsu.type == LsuType::FENCE) {
        // schedule fence lock
@@ -75,179 +89,188 @@ void LsuUnit::step(uint64_t cycle) {
        fence_lock_ = true;        
        DT(3, cycle, "fence-lock: " << *trace);
        // remove input
-        inputs_.pop(); 
+        auto time = Input.pop(); 
+        core_->perf_stats_.lsu_stalls += (cycle - time);
        return;
    }

-    // check pending queue capacity
-    if (!trace->check_stalled(pending_dcache_.full())) {
-        DT(3, cycle, "*** lsu-queue-stall: " << *trace);
-    }
-    if (pending_dcache_.full())
+    // check pending queue capacity    
+    if (pending_dcache_.full()) {
+        if (!trace->suspend()) {
+            DT(3, cycle, "*** lsu-queue-stall: " << *trace);
+        }
        return;
-
-    // send memory request
-
-    bool has_shared_memory = false;
-    bool mem_rsp_pending = false;    
+    } else {
+        trace->resume();
+    }
+    
    bool is_write = (trace->lsu.type == LsuType::STORE);

-    uint32_t valid_addrs = 0;
-    for (auto& mem_addr : trace->mem_addrs) {
-        valid_addrs += mem_addr.size();
-    }    
+    // duplicates detection
+    bool is_dup = false;
+    if (trace->tmask.test(0)) {
+        uint64_t addr_mask = sizeof(Word)-1;
+        Word addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
+        uint32_t matches = 1;
+        for (uint32_t t = 1; t < num_threads_; ++t) {
+            if (!trace->tmask.test(t))
+                continue;
+            auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
+            matches += (addr0 == mem_addr);
+        }
+        is_dup = (matches == trace->tmask.count());
+    }
+
+    uint32_t valid_addrs = 0;
+    if (is_dup) {
+        valid_addrs = 1;
+    } else {
+        for (auto& mem_addr : trace->mem_addrs) {
+            valid_addrs += mem_addr.size();
+        }
+    }

-    trace->dcache_latency = SimPlatform::instance().cycles();
    auto tag = pending_dcache_.allocate({trace, valid_addrs});

    for (uint32_t t = 0; t < num_threads_; ++t) {
        if (!trace->tmask.test(t))
            continue;
+        
+        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);        
+        auto mem_addr = trace->mem_addrs.at(t).at(0);
+        auto type = get_addr_type(mem_addr.addr, mem_addr.size);

-        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
-        for (auto mem_addr : trace->mem_addrs.at(t)) {
-            // check shared memory address
-            if (SM_ENABLE) {
-                if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE))
-                && (mem_addr < SMEM_BASE_ADDR)) {
-                    DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag 
-                        << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
-                    has_shared_memory = true;
-                    continue;
-                }
-            }
-
-            bool is_io = (mem_addr >= IO_BASE_ADDR);
-
-            MemReq mem_req;
-            mem_req.addr  = mem_addr;
-            mem_req.write = is_write;
-            mem_req.tag   = tag;
-            mem_req.is_io = is_io; 
-            dcache_req_port.send(mem_req, 1);
-            DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace);            
-            // do not wait on writes
-            mem_rsp_pending = !is_write;
-        }
+        MemReq mem_req;
+        mem_req.addr  = mem_addr.addr;
+        mem_req.write = is_write;
+        mem_req.tag   = tag;
+        mem_req.is_io = (type == AddrType::IO); 
+        
+        if (type == AddrType::Shared) {
+            core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
+            DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
+                << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
+        } else {            
+            dcache_req_port.send(mem_req, 2);
+            DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
+                << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace);
+        }        
+        
+        if (is_dup)
+            break;
    }

-    // do not wait 
-    if (!mem_rsp_pending) {        
+    // do not wait on writes
+    if (is_write) {        
        pending_dcache_.release(tag);
-        uint32_t delay = 1;
-        if (has_shared_memory) {
-            // all threads accessed shared memory
-            delay += Constants::SMEM_DELAY;
-        }
-        this->schedule_output(trace, delay);
+        Output.send(trace, 1);
    }

    // remove input
-    inputs_.pop();
+    auto time = Input.pop();
+    core_->perf_stats_.lsu_stalls += (cycle - time);
 }

 ///////////////////////////////////////////////////////////////////////////////

-AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
+AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
    
-void AluUnit::step(uint64_t /*cycle*/) {    
-    if (inputs_.empty())
+void AluUnit::step(uint64_t cycle) {    
+    if (Input.empty())
        return;
-    auto trace = inputs_.top();    
+    auto trace = Input.front();    
    switch (trace->alu.type) {
    case AluType::ARITH:        
    case AluType::BRANCH:
    case AluType::CMOV:
-        this->schedule_output(trace, 1);
-        inputs_.pop();
+        Output.send(trace, 1);
        break;
    case AluType::IMUL:
-        this->schedule_output(trace, LATENCY_IMUL);
-        inputs_.pop();
+        Output.send(trace, LATENCY_IMUL+1);
        break;
    case AluType::IDIV:
-        this->schedule_output(trace, XLEN);
-        inputs_.pop();
+        Output.send(trace, XLEN+1);
        break;
    default:
        std::abort();
    }
+    DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
+    if (trace->fetch_stall) {
+        core_->stalled_warps_.reset(trace->wid);
+    }
+    auto time = Input.pop();
+    core_->perf_stats_.alu_stalls += (cycle - time);
 }

 ///////////////////////////////////////////////////////////////////////////////

-CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
+CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
    
-void CsrUnit::step(uint64_t /*cycle*/) {
-    if (inputs_.empty()) 
+void CsrUnit::step(uint64_t cycle) {
+    if (Input.empty()) 
        return;
-    auto trace = inputs_.top();
-    this->schedule_output(trace, 1);
-    inputs_.pop();
+    auto trace = Input.front();
+    Output.send(trace, 1);
+    auto time = Input.pop();
+    core_->perf_stats_.csr_stalls += (cycle - time);
+    DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
 }

 ///////////////////////////////////////////////////////////////////////////////

-FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
+FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
    
-void FpuUnit::step(uint64_t /*cycle*/) {
-    if (inputs_.empty()) 
+void FpuUnit::step(uint64_t cycle) {
+    if (Input.empty()) 
        return;
-    auto trace = inputs_.top();
+    auto trace = Input.front();
    switch (trace->fpu.type) {
    case FpuType::FNCP:
-        this->schedule_output(trace, 1);
-        inputs_.pop();
+        Output.send(trace, 2);
        break;
    case FpuType::FMA:
-        this->schedule_output(trace, LATENCY_FMA);
-        inputs_.pop();
+        Output.send(trace, LATENCY_FMA+1);
        break;
    case FpuType::FDIV:
-        this->schedule_output(trace, LATENCY_FDIV);
-        inputs_.pop();
+        Output.send(trace, LATENCY_FDIV+1);
        break;
    case FpuType::FSQRT:
-        this->schedule_output(trace, LATENCY_FSQRT);
-        inputs_.pop();
+        Output.send(trace, LATENCY_FSQRT+1);
        break;
    case FpuType::FCVT:
-        this->schedule_output(trace, LATENCY_FCVT);
-        inputs_.pop();
+        Output.send(trace, LATENCY_FCVT+1);
        break;
    default:
        std::abort();
-    }
+    }    
+    DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
+    auto time = Input.pop();
+    core_->perf_stats_.fpu_stalls += (cycle - time);
 }

 ///////////////////////////////////////////////////////////////////////////////

-GpuUnit::GpuUnit(Core* core) 
-    : ExeUnit("GPU") 
-    , core_(core)
+GpuUnit::GpuUnit(const SimContext& ctx, Core* core) 
+    : ExeUnit(ctx, core, "GPU")
    , num_threads_(core->arch().num_threads()) 
    , pending_tex_reqs_(TEXQ_SIZE)
 {}
    
 void GpuUnit::step(uint64_t cycle) {
-    __unused (cycle);
 #ifdef EXT_TEX_ENABLE
    // handle memory response
    for (uint32_t t = 0; t < num_threads_; ++t) {
        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
        if (dcache_rsp_port.empty())
            continue;
-        auto& mem_rsp = dcache_rsp_port.top();
+        auto& mem_rsp = dcache_rsp_port.front();
        auto& entry = pending_tex_reqs_.at(mem_rsp.tag);  
        auto trace = entry.first;
        DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);  
        assert(entry.second);
        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {             
-            auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
-            trace->dcache_latency = latency;
-            this->schedule_output(trace, 1);
+        if (0 == entry.second) {
+            Output.send(trace, 1);
            pending_tex_reqs_.release(mem_rsp.tag);
        }   
        dcache_rsp_port.pop();
@@ -255,38 +278,67 @@ void GpuUnit::step(uint64_t cycle) {
 #endif

    // check input queue
-    if (inputs_.empty())
+    if (Input.empty())
        return;

-    auto trace = inputs_.top();
+    auto trace = Input.front();
+
+    bool issued = false;

    switch  (trace->gpu.type) {
    case GpuType::TMC:
+        Output.send(trace, 1);
+        core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
+        issued = true;
+        break;
    case GpuType::WSPAWN:
+        Output.send(trace, 1);
+        core_->active_warps_ = trace->gpu.active_warps;        
+        issued = true;
+        break;
    case GpuType::SPLIT:
    case GpuType::JOIN:
-    case GpuType::BAR:
-        this->schedule_output(trace, 1);
-        inputs_.pop();
+        Output.send(trace, 1);
+        issued = true;
        break;
-    case GpuType::TEX: {
+    case GpuType::BAR:
+        Output.send(trace, 1);
+        if (trace->gpu.active_warps != 0) 
+            core_->active_warps_ |= trace->gpu.active_warps;
+        else
+            core_->active_warps_.reset(trace->wid);
+        issued = true;
+        break;
+    case GpuType::TEX:
        if (this->processTexRequest(cycle, trace))
-            inputs_.pop();
-    }   break;
+           issued = true;
+        break;
    default:
        std::abort();
    }
+
+    if (issued) {    
+        DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
+        if (trace->fetch_stall)  {
+            core_->stalled_warps_.reset(trace->wid);
+        }
+        auto time = Input.pop();
+        core_->perf_stats_.fpu_stalls += (cycle - time);
+    }
 }

 bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
    __unused (cycle);
    
-    // check pending queue capacity
-    if (!trace->check_stalled(pending_tex_reqs_.full())) {
-        DT(3, cycle, "*** tex-queue-stall: " << *trace);
-    }
-    if (pending_tex_reqs_.full())
+    // check pending queue capacity    
+    if (pending_tex_reqs_.full()) {
+        if (!trace->suspend()) {
+            DT(3, cycle, "*** tex-queue-stall: " << *trace);
+        }
        return false;
+    } else {
+        trace->resume();
+    }

    // send memory request

@@ -295,7 +347,6 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
        valid_addrs += mem_addr.size();
    }

-    trace->tex_latency = SimPlatform::instance().cycles();
    auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});

    for (uint32_t t = 0; t < num_threads_; ++t) {
@@ -305,12 +356,14 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
        for (auto mem_addr : trace->mem_addrs.at(t)) {
            MemReq mem_req;
-            mem_req.addr  = mem_addr;
+            mem_req.addr  = mem_addr.addr;
            mem_req.write = (trace->lsu.type == LsuType::STORE);
            mem_req.tag   = tag;
-            dcache_req_port.send(mem_req, 1);
-            DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag 
+            dcache_req_port.send(mem_req, 3);
+            DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
                << ", tid=" << t << ", "<< trace);
+            ++ core_->perf_stats_.tex_reads;
+            ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
        }
    }