SimX timing simulation

2021-11-13 01:41:12 -05:00
parent 009e897cab
commit c2721fd545
26 changed files with 3690 additions and 1639 deletions
--- a/sim/simX/core.cpp
+++ b/sim/simX/core.cpp
@@ -12,34 +12,92 @@

 using namespace vortex;

-Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
-    : id_(id)
+Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
+    : SimObject(ctx, "Core")
+    , id_(id)
    , arch_(arch)
    , decoder_(decoder)
    , mem_(mem)
    , shared_mem_(1, SMEM_SIZE)
-    , inst_in_schedule_("schedule")
-    , inst_in_fetch_("fetch")
-    , inst_in_decode_("decode")
-    , inst_in_issue_("issue")
-    , inst_in_execute_("execute")
-    , inst_in_writeback_("writeback") {
-  in_use_iregs_.resize(arch.num_warps(), 0);
-  in_use_fregs_.resize(arch.num_warps(), 0);
-  in_use_vregs_.reset();
-
-  csrs_.resize(arch_.num_csrs(), 0);
-
-  fcsrs_.resize(arch_.num_warps(), 0);
-
-  barriers_.resize(arch_.num_barriers(), 0);
-
-  warps_.resize(arch_.num_warps());
+    , warps_(arch.num_warps())
+    , barriers_(arch.num_barriers(), 0)
+    , csrs_(arch.num_csrs(), 0)
+    , fcsrs_(arch.num_warps(), 0)
+    , ibuffers_(arch.num_warps(), IBUF_SIZE)
+    , scoreboard_(arch_) 
+    , exe_units_((int)ExeType::MAX)
+    , icache_(Cache::Create("Icache", CacheConfig{
+        log2ceil(ICACHE_SIZE),  // C
+        log2ceil(L1_BLOCK_SIZE),// B
+        2,                      // W
+        0,                      // A
+        32,                     // address bits    
+        1,                      // number of banks
+        1,                      // number of ports
+        1,                      // request size   
+        true,                   // write-throught
+        0,                      // victim size
+        NUM_WARPS,              // mshr
+        2,                      // pipeline latency
+      }))
+    , dcache_(Cache::Create("Dcache", CacheConfig{
+        log2ceil(DCACHE_SIZE),  // C
+        log2ceil(L1_BLOCK_SIZE),// B
+        2,                      // W
+        0,                      // A
+        32,                     // address bits    
+        DCACHE_NUM_BANKS,       // number of banks
+        DCACHE_NUM_PORTS,       // number of ports
+        (uint8_t)arch.num_threads(), // request size   
+        true,                   // write-throught
+        0,                      // victim size
+        DCACHE_MSHR_SIZE,       // mshr
+        2,                      // pipeline latency
+      }))
+    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2)) 
+    , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse)
+    , dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast<LsuUnit*>(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse})
+    , fetch_stage_("fetch")
+    , decode_stage_("decode")
+    , issue_stage_("issue")
+    , execute_stage_("execute")
+    , commit_stage_("writeback")
+    , pending_icache_(arch_.num_warps())
+    , stalled_warps_(0)
+    , last_schedule_wid_(0)
+    , pending_instrs_(0)
+    , ebreak_(false)   
+    , stats_insts_(0)
+    , stats_loads_(0)
+    , stats_stores_(0)
+    , MemRspPort(this, &l1_mem_switch_->RspIn)
+    , MemReqPort(this, &l1_mem_switch_->ReqOut)    
+{  
  for (int i = 0; i < arch_.num_warps(); ++i) {
-    warps_[i] = std::make_shared<Warp>(this, i);
+    warps_.at(i) = std::make_shared<Warp>(this, i);
  }

-  this->clear();
+  // register execute units
+  exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
+  exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
+  exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
+  exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);  
+  exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
+
+  // connect l1 caches
+  icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_);
+  for (int i = 0; i < arch_.num_threads(); ++i) {
+    dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i));
+  }
+
+  // connect l1 switch
+  icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
+  dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]);
+  l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort);  
+  l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort);
+
+  // activate warp0
+  warps_.at(0)->setTmask(0, true);
 }

 Core::~Core() {
@@ -51,194 +109,164 @@ Core::~Core() {
  }
 }

-void Core::clear() {
-  for (int w = 0; w < arch_.num_warps(); ++w) {    
-    in_use_iregs_[w].reset();
-    in_use_fregs_[w].reset();    
-  }
-  stalled_warps_.reset();
-
-  in_use_vregs_.reset();
-  
-  for (auto& csr : csrs_) {
-    csr = 0;
-  }
-
-  for (auto& fcsr : fcsrs_) {
-    fcsr = 0;
-  }
-
-  for (auto& barrier : barriers_) {
-    barrier.reset();
-  }
-  
-  for (auto warp : warps_) {
-    warp->clear();
-  }  
-
-  inst_in_schedule_.clear();
-  inst_in_fetch_.clear();
-  inst_in_decode_.clear();
-  inst_in_issue_.clear();
-  inst_in_execute_.clear();
-  inst_in_writeback_.clear();
-  print_bufs_.clear();
-
-  steps_  = 0;
-  insts_  = 0;
-  loads_  = 0;
-  stores_ = 0;
-
-  inst_in_schedule_.valid = true;
-  warps_[0]->setTmask(0, true);
-
-  ebreak_ = false;
+void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) {
+  // advance to decode stage
+  uint32_t wid = response.tag;
+  pipeline_state_t state;
+  pending_icache_.remove(wid, &state);
+  auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
+  state.icache_latency = latency;
+  decode_stage_.push(state);
 }

-void Core::step() {
+void Core::step(uint64_t cycle) {
+    __unused (cycle);
  D(2, "###########################################################");
+  D(2, std::dec << "Core" << id_ << ": cycle: " << cycle);

-  steps_++;
-  D(2, std::dec << "Core" << id_ << ": cycle: " << steps_);
-
-  this->writeback();
+  this->commit();
  this->execute();
  this->issue();
  this->decode();
  this->fetch();
-  this->schedule();

  DPN(2, std::flush);
 }

-void Core::schedule() {
-  if (!inst_in_schedule_.enter(&inst_in_fetch_))
-    return;
-
+void Core::warp_scheduler() {
  bool foundSchedule = false;
-  int scheduled_warp = inst_in_schedule_.wid;
+  int scheduled_warp = last_schedule_wid_;

-  for (size_t wid = 0; wid < warps_.size(); ++wid) {
-    // round robin scheduling
+  // round robin scheduling
+  for (size_t wid = 0; wid < warps_.size(); ++wid) {    
    scheduled_warp = (scheduled_warp + 1) % warps_.size();
-    bool is_active = warps_[scheduled_warp]->active();
-    bool stalled = stalled_warps_[scheduled_warp];
-    if (is_active && !stalled) {
+    bool warp_active  = warps_.at(scheduled_warp)->active();
+    bool warp_stalled = stalled_warps_.test(scheduled_warp); 
+    if (warp_active && !warp_stalled) {      
+      last_schedule_wid_ = scheduled_warp;
      foundSchedule = true;
      break;
    }
  }

  if (!foundSchedule)
-    return;
+    return;  

-  D(2, "Schedule: wid=" << scheduled_warp);
-  inst_in_schedule_.wid = scheduled_warp;
+  // suspend warp until decode
+  stalled_warps_.set(scheduled_warp);

-  // advance pipeline
-  inst_in_schedule_.next(&inst_in_fetch_);
+  auto& warp = warps_.at(scheduled_warp);  
+  stats_insts_ += warp->getActiveThreads();
+  
+  pipeline_state_t state;
+  warp->eval(&state);
+
+  D(4, state);  
+
+  // advance to fetch stage
+  ++pending_instrs_;
+  fetch_stage_.push(state);
 }

 void Core::fetch() {
-  if (!inst_in_fetch_.enter(&inst_in_issue_))
-    return;
+  // schedule icache request
+  pipeline_state_t state;
+  if (fetch_stage_.try_pop(&state)) {
+    state.icache_latency = SimPlatform::instance().cycles();
+    MemReq mem_req;
+    mem_req.addr  = state.PC;
+    mem_req.write = false;
+    mem_req.tag   = pending_icache_.allocate(state);    
+    icache_->CoreReqPorts.at(0).send(mem_req, 1);
+  }  

-  int wid = inst_in_fetch_.wid;
-  
-  auto active_threads_b = warps_[wid]->getActiveThreads();    
-  warps_[wid]->step(&inst_in_fetch_);
-  auto active_threads_a = warps_[wid]->getActiveThreads();   
-
-  insts_ += active_threads_b;
-  if (active_threads_b != active_threads_a) {
-    D(3, "*** warp#" << wid << " active threads changed to " << active_threads_a);
-  }
-
-  if (inst_in_fetch_.stall_warp) {
-    D(3, "*** warp#" << wid << " fetch stalled");
-    stalled_warps_[wid] = true;
-  }
-  
-  D(4, inst_in_fetch_);
-
-  // advance pipeline
-  inst_in_fetch_.next(&inst_in_issue_);
+  // schedule next warp
+  this->warp_scheduler();  
 }

 void Core::decode() {
-  if (!inst_in_decode_.enter(&inst_in_issue_))
-    return;
+  pipeline_state_t state;
+  if (!decode_stage_.try_pop(&state))
+    return;    
  
-  // advance pipeline
-  inst_in_decode_.next(&inst_in_issue_);
+  if (state.stall_warp) {
+    D(3, "*** warp#" << state.wid << " fetch stalled");
+  } else {
+    // release warp
+    stalled_warps_.reset(state.wid);
+  }
+  
+  // advance to issue stage
+  issue_stage_.push(state);
 }

 void Core::issue() {
-  if (!inst_in_issue_.enter(&inst_in_execute_))
-    return;
+  if (!issue_stage_.empty()) {
+    // insert to ibuffer 
+    auto& state = issue_stage_.top();
+    auto& ibuffer = ibuffers_.at(state.wid);
+    if (!ibuffer.full()) {
+      ibuffer.push(state);
+      issue_stage_.pop();
+    }
+  }
+    
+  // issue ibuffer instructions
+  for (auto& ibuffer : ibuffers_) {
+    if (ibuffer.empty())
+      continue;

-  bool in_use_regs = (inst_in_issue_.used_iregs & in_use_iregs_[inst_in_issue_.wid]) != 0 
-                  || (inst_in_issue_.used_fregs & in_use_fregs_[inst_in_issue_.wid]) != 0 
-                  || (inst_in_issue_.used_vregs & in_use_vregs_) != 0;
+    auto& state = ibuffer.top();
+
+    // check scoreboard
+    if (scoreboard_.in_use(state))
+      continue;
+
+    // update scoreboard
+    scoreboard_.reserve(state);
+
+    // advance to execute stage
+    execute_stage_.push(state);
+
+    ibuffer.pop();
+    break;
+  }
+}
+
+void Core::execute() {
+  // process stage inputs
+  if (!execute_stage_.empty()) {
+    auto& state = execute_stage_.top();
+    auto& exe_unit = exe_units_.at((int)state.exe_type);
+    exe_unit->push_input(state);
+    execute_stage_.pop();
+  }
+
+  // advance execute units
+  for (auto& exe_unit : exe_units_) {
+    exe_unit->step();
+  }  
  
-  if (in_use_regs) {      
-    D(3, "*** Issue: registers not ready!");
-    inst_in_issue_.stalled = true;
-    return;
-  } 
-
-  switch (inst_in_issue_.rdest_type) {
-  case 1:
-    if (inst_in_issue_.rdest)
-      in_use_iregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1;
-    break;
-  case 2:
-    in_use_fregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1;
-    break;
-  case 3:
-    in_use_vregs_[inst_in_issue_.rdest] = 1;
-    break;
-  default:  
-    break;
+  // commit completed instructions
+  for (auto& exe_unit : exe_units_) {
+    pipeline_state_t state;
+    if (exe_unit->pop_output(&state)) {
+      if (state.stall_warp) {
+        stalled_warps_.reset(state.wid);
+      }
+      // advance to commit stage
+      commit_stage_.push(state);      
+    }
  }
-
-  // advance pipeline
-  inst_in_issue_.next(&inst_in_execute_);
 }

-void Core::execute() {  
-  if (!inst_in_execute_.enter(&inst_in_writeback_))
+void Core::commit() {
+  pipeline_state_t state;
+  if (!commit_stage_.try_pop(&state))
    return;

-  // advance pipeline
-  inst_in_execute_.next(&inst_in_writeback_);
-}
-
-void Core::writeback() {
-  if (!inst_in_writeback_.enter(NULL))
-    return;
-
-  switch (inst_in_writeback_.rdest_type) {
-  case 1:
-    in_use_iregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0;
-    break;
-  case 2:
-    in_use_fregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0;
-    break;
-  case 3:
-    in_use_vregs_[inst_in_writeback_.rdest] = 0;
-    break;
-  default:  
-    break;
-  }
-
-  if (inst_in_writeback_.stall_warp) {
-    stalled_warps_[inst_in_writeback_.wid] = false;
-    D(3, "*** warp#" << inst_in_writeback_.wid << " fetch released");
-  }
-
-  // advance pipeline
-  inst_in_writeback_.next(NULL);
+  // update scoreboard
+  scoreboard_.release(state);
 }

 Word Core::get_csr(Addr addr, int tid, int wid) {
@@ -281,16 +309,16 @@ Word Core::get_csr(Addr addr, int tid, int wid) {
    return arch_.num_cores();
  } else if (addr == CSR_MINSTRET) {
    // NumInsts
-    return insts_;
+    return stats_insts_;
  } else if (addr == CSR_MINSTRET_H) {
    // NumInsts
-    return (Word)(insts_ >> 32);
+    return (Word)(stats_insts_ >> 32);
  } else if (addr == CSR_MCYCLE) {
    // NumCycles
-    return (Word)steps_;
+    return (Word)SimPlatform::instance().cycles();
  } else if (addr == CSR_MCYCLE_H) {
    // NumCycles
-    return (Word)(steps_ >> 32);
+    return (Word)(SimPlatform::instance().cycles() >> 32);
  } else {
    return csrs_.at(addr);
  }
@@ -328,7 +356,7 @@ Word Core::icache_fetch(Addr addr) {
 }

 Word Core::dcache_read(Addr addr, Size size) {
-  ++loads_;
+  ++stats_loads_;
  Word data = 0;
 #ifdef SM_ENABLE
  if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
@@ -342,7 +370,7 @@ Word Core::dcache_read(Addr addr, Size size) {
 }

 void Core::dcache_write(Addr addr, Word data, Size size) {
-  ++stores_;
+  ++stats_stores_;
 #ifdef SM_ENABLE
  if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
   && ((addr + 3) < SMEM_BASE_ADDR)) {
@@ -359,23 +387,19 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
 }

 bool Core::running() const {
-  return inst_in_fetch_.valid 
-      || inst_in_decode_.valid 
-      || inst_in_issue_.valid 
-      || inst_in_execute_.valid 
-      || inst_in_writeback_.valid;
+  return pending_instrs_;
 }

 void Core::printStats() const {
-  std::cout << "Steps : " << steps_ << std::endl
-            << "Insts : " << insts_ << std::endl
-            << "Loads : " << loads_ << std::endl
-            << "Stores: " << stores_ << std::endl;
+  std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl
+            << "Insts : " << stats_insts_ << std::endl
+            << "Loads : " << stats_loads_ << std::endl
+            << "Stores: " << stats_stores_ << std::endl;
 }

 void Core::writeToStdOut(Addr addr, Word data) {
  uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
-  auto& ss_buf = print_bufs_[tid];
+  auto& ss_buf = print_bufs_.at(tid);
  char c = (char)data;
  ss_buf << c;
  if (c == '\n') {