// Copyright © 2019-2023
// 
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// 
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cluster.h"

using namespace vortex;

// Builds one cluster: an L2 cache, an instruction-cache group, a data-cache
// group, one shared-memory block per core, and the cores themselves, then
// binds their request/response ports together.
//
// ctx        - simulation context forwarded to the SimObject base.
// cluster_id - index of this cluster; embedded in component names and used to
//              derive each core's id (cluster_id * num_cores + local index).
// processor  - stored as-is and handed back by processor().
// arch       - supplies the core, warp and barrier counts.
// dcrs       - forwarded unchanged to every Core::Create call.
Cluster::Cluster(const SimContext& ctx,
                 uint32_t cluster_id,
                 ProcessorImpl* processor,
                 const Arch &arch,
                 const DCRS &dcrs)
  : SimObject(ctx, "cluster")
  , mem_req_port(this)
  , mem_rsp_port(this)
  , cluster_id_(cluster_id)
  , cores_(arch.num_cores())
  , barriers_(arch.num_barriers(), 0)
  , sharedmems_(arch.num_cores())
  , processor_(processor)
{
  auto num_cores = arch.num_cores();

  // Scratch buffer reused for every component name below. The ids are
  // unsigned, so they are formatted with %u.
  char sname[100];

  // L2 cache: its memory-side ports are bound to this cluster's own
  // mem_req_port / mem_rsp_port.
  // NOTE(review): both L1 configs below pass a word size in the slot labelled
  // W and the way count in the slot labelled A; here the way count sits in W
  // and A is 0 -- confirm against the CacheSim::Config field order.
  snprintf(sname, sizeof(sname), "cluster%u-l2cache", cluster_id);
  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
    !L2_ENABLED,
    log2ceil(L2_CACHE_SIZE), // C
    log2ceil(MEM_BLOCK_SIZE), // B
    log2ceil(L2_NUM_WAYS),  // W
    0,                      // A
    XLEN,                   // address bits
    L2_NUM_BANKS,           // number of banks
    1,                      // number of ports
    5,                      // request size
    true,                   // write-through
    false,                  // write response
    0,                      // victim size
    L2_MSHR_SIZE,           // mshr
    2,                      // pipeline latency
  });

  l2cache_->MemReqPort.bind(&this->mem_req_port);
  this->mem_rsp_port.bind(&l2cache_->MemRspPort);

  // Instruction caches: memory side is bound to L2 core port 0.
  snprintf(sname, sizeof(sname), "cluster%u-icaches", cluster_id);
  icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
    !ICACHE_ENABLED,
    log2ceil(ICACHE_SIZE),  // C
    log2ceil(L1_LINE_SIZE), // B
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(ICACHE_NUM_WAYS),// A
    XLEN,                   // address bits
    1,                      // number of banks
    1,                      // number of ports
    1,                      // number of inputs
    true,                   // write-through
    false,                  // write response
    0,                      // victim size
    static_cast<uint8_t>(arch.num_warps()), // mshr
    2,                      // pipeline latency
  });

  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);

  // Data caches: memory side is bound to L2 core port 1.
  snprintf(sname, sizeof(sname), "cluster%u-dcaches", cluster_id);
  dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
    !DCACHE_ENABLED,
    log2ceil(DCACHE_SIZE),  // C
    log2ceil(L1_LINE_SIZE), // B
    log2ceil(sizeof(Word)), // W
    log2ceil(DCACHE_NUM_WAYS),// A
    XLEN,                   // address bits
    DCACHE_NUM_BANKS,       // number of banks
    1,                      // number of ports
    DCACHE_NUM_BANKS,       // number of inputs
    true,                   // write-through
    false,                  // write response
    0,                      // victim size
    DCACHE_MSHR_SIZE,       // mshr
    4,                      // pipeline latency
  });

  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);

  ///////////////////////////////////////////////////////////////////////////

  // create shared memory blocks (one per core)
  for (uint32_t i = 0; i < num_cores; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-shared_mem%u", cluster_id, i);
    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
      (1 << SMEM_LOG_SIZE),
      sizeof(Word),
      NUM_LSU_LANES,
      NUM_LSU_LANES,
      false
    });
  }

  // create cores

  for (uint32_t i = 0; i < num_cores; ++i) {
    // Core ids are numbered consecutively across clusters.
    uint32_t core_id = cluster_id * num_cores + i;
    cores_.at(i) = Core::Create(core_id,
                                this,
                                arch,
                                dcrs,
                                sharedmems_.at(i));

    // Instruction fetch path: core port 0 <-> icache group port (i, 0).
    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));

    // Data path: every LSU lane goes through its own SMemDemux, whose Dc side
    // is bound to the data cache and whose Sm side is bound to this core's
    // shared-memory block.
    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
      snprintf(sname, sizeof(sname), "cluster%u-smem_demux%u_%u", cluster_id, i, j);
      auto smem_demux = SMemDemux::Create(sname);

      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));

      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);

      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
    }
  }
}

// Nothing to release explicitly here; members clean up through their own
// destructors.
Cluster::~Cluster() = default;

void Cluster::reset() {  
  for (auto& barrier : barriers_) {
    barrier.reset();
  }
}

// The cluster does no per-tick work of its own.
void Cluster::tick() {}

// Passes `ram` on to every core in this cluster via Core::attach_ram.
void Cluster::attach_ram(RAM* ram) {
  // Iterate by reference so the per-core handles are not copied each pass.
  for (auto& core : cores_) {
    core->attach_ram(ram);
  }
}

bool Cluster::running() const {
  for (auto& core : cores_) {
    if (core->running())
      return true;
  }
  return false;
}

// Asks every core whether it has exited.
//
// exitcode   - receives the bitwise OR of the exit codes of the cores whose
//              check_exit returned true (0 if none did); always written.
// riscv_test - forwarded unchanged to each core's check_exit.
//
// Returns true only when every core's check_exit returned true.
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
  Word combined = 0;
  bool all_exited = true;
  for (auto& core : cores_) {
    Word core_code;
    if (!core->check_exit(&core_code, riscv_test)) {
      // Keep polling the remaining cores so their codes are still merged.
      all_exited = false;
      continue;
    }
    combined |= core_code;
  }
  *exitcode = combined;
  return all_exited;
}

// Records that core `core_id` has arrived at barrier `bar_id`; once `count`
// cores are recorded, resumes every recorded core and clears the barrier.
//
// bar_id  - index into barriers_ (looked up with at()).
// count   - number of recorded arrivals that releases the barrier.
// core_id - id of the arriving core; reduced modulo cores_.size() to obtain
//           its slot within this cluster.
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  auto& barrier = barriers_.at(bar_id);

  uint32_t local_core_id = core_id % cores_.size();
  barrier.set(local_core_id);

  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);

  // Not everyone has arrived yet: leave the arrival recorded and wait.
  if (barrier.count() != static_cast<size_t>(count))
    return;

  // resume all suspended cores
  for (uint32_t i = 0; i < cores_.size(); ++i) {
    if (barrier.test(i)) {
      // Report the same kind of id as the suspend trace above:
      // (core_id - local_core_id) is the id of this cluster's first core.
      DP(3, "*** Resume core #" << (core_id - local_core_id + i) << " at barrier #" << bar_id);
      cores_.at(i)->resume();
    }
  }
  barrier.reset();
}

// Returns the processor pointer that was passed to the constructor.
ProcessorImpl* Cluster::processor() const {
  return this->processor_;
}

// Collects performance counters for this cluster: the icache, dcache and L2
// stats are copied from their components, and the shared-memory stats are
// accumulated with += over every shared-memory block.
Cluster::PerfStats Cluster::perf_stats() const {
  Cluster::PerfStats perf;
  perf.icache = icaches_->perf_stats();
  perf.dcache = dcaches_->perf_stats();
  perf.l2cache = l2cache_->perf_stats();

  // Iterate by reference so the per-block handles are not copied each pass.
  for (auto& sharedmem : sharedmems_) {
    perf.sharedmem += sharedmem->perf_stats();
  }

  return perf;
}