From 092ff42ab42bbf20aeaba4788c0ee73fd3d22fda Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 1 Dec 2021 00:12:16 -0500 Subject: [PATCH] simx multicore fix --- ci/regression.sh | 7 +++++++ driver/simx/vortex.cpp | 5 ++--- sim/simx/constants.h | 2 ++ sim/simx/core.cpp | 15 +++++++++++++-- sim/simx/core.h | 1 + sim/simx/main.cpp | 3 +-- sim/simx/processor.cpp | 38 +++++++++++++++++++++----------------- 7 files changed, 47 insertions(+), 24 deletions(-) diff --git a/ci/regression.sh b/ci/regression.sh index 2be58140..b6125ce1 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -43,15 +43,20 @@ echo "begin clustering tests..." # warp/threads configurations ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo +./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=demo # cores clustering ./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1" +./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=demo --args="-n1" # L2/L3 ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=demo --args="-n1" echo "clustering tests done!" } @@ -101,12 +106,14 @@ CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsi # test cache banking CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr +CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=io_addr # test cache multi-porting CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1" CONFIGS="-DL2_NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr +CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=io_addr # test 128-bit MEM block CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index d63005d6..2aaef1e9 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -9,11 +9,10 @@ #include #include #include +#include #include #include -#define RAM_PAGE_SIZE 4096 - using namespace vortex; /////////////////////////////////////////////////////////////////////////////// @@ -58,7 +57,7 @@ private: class vx_device { public: vx_device() - : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) + : arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS) , ram_(RAM_PAGE_SIZE) , mem_allocation_(ALLOC_BASE_ADDR) {} diff --git a/sim/simx/constants.h b/sim/simx/constants.h index b173a03f..7d8daed5 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -6,6 +6,8 @@ #define MEM_LATENCY 24 #endif +#define RAM_PAGE_SIZE 4096 + namespace vortex { enum Constants { diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 7c6cbffa..934ce1f8 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -21,6 +21,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , arch_(arch) , decoder_(arch) , mmu_(0, arch.wsize(), true) + , smem_(RAM_PAGE_SIZE) , tex_units_(NUM_TEX_UNITS, this) , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) @@ -380,7 +381,12 @@ Word Core::icache_read(Addr addr, Size size) { Word Core::dcache_read(Addr addr, Size size) { Word data; - mmu_.read(&data, addr, size, 0); + auto type = get_addr_type(addr, size); + if (type == AddrType::Shared) { + smem_.read(&data, addr & (SMEM_SIZE-1), size); + } else { + mmu_.read(&data, addr, size, 0); + } return data; } @@ -389,7 +395,12 @@ void Core::dcache_write(Addr addr, Word data, Size size) { && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { this->writeToStdOut(addr, data); } else { - mmu_.write(&data, addr, size, 0); + auto type = get_addr_type(addr, size); + if (type == AddrType::Shared) { + smem_.write(&data, addr & (SMEM_SIZE-1), size); + } else { + mmu_.write(&data, addr, size, 0); + } } } diff --git a/sim/simx/core.h b/sim/simx/core.h index e4a6034e..b9c01383 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -137,6 +137,7 @@ private: const ArchDef arch_; const Decoder decoder_; MemoryUnit mmu_; + RAM smem_; std::vector tex_units_; std::vector> warps_; diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index a0e07faf..86829f3a 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -6,11 +6,10 @@ #include #include #include "processor.h" +#include "constants.h" #include #include "args.h" -#define RAM_PAGE_SIZE 4096 - using namespace vortex; int main(int argc, char **argv) { diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index ca9d46a6..f069a6b7 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -18,8 +18,9 @@ Processor::Processor(const ArchDef& arch) // connect memory sub-systen memsim_ = MemSim::Create(1, MEM_LATENCY); - std::vector*> mem_req_ports(1); + std::vector*> mem_req_ports(1); std::vector*> mem_rsp_ports(1); + mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); @@ -46,6 +47,7 @@ Processor::Processor(const ArchDef& arch) mem_req_ports.resize(NUM_CLUSTERS); mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); @@ -57,13 +59,17 @@ Processor::Processor(const ArchDef& arch) mem_req_ports.resize(NUM_CLUSTERS); mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); } } - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + std::vector*> cluster_mem_req_ports(cores_per_cluster); + std::vector*> cluster_mem_rsp_ports(cores_per_cluster); + if (L2_ENABLE) { auto& l2cache = l2caches_.at(i); l2cache = Cache::Create("l2cache", Cache::Config{ @@ -74,40 +80,38 @@ Processor::Processor(const ArchDef& arch) 32, // address bits L2_NUM_BANKS, // number of banks L2_NUM_PORTS, // number of ports - NUM_CORES, // request size + (uint8_t)cores_per_cluster, // request size true, // write-through false, // write response 0, // victim size L2_MSHR_SIZE, // mshr 2, // pipeline latency }); + mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); l2cache->MemReqPort.bind(mem_req_ports.at(i)); - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); - mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); } - } else if (cores_per_cluster > 1) { + } else { auto& l2_mem_switch = l2_mem_switches_.at(i); - l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); - mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); - l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster); + + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); - mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); } } for (uint32_t j = 0; j < cores_per_cluster; ++j) { auto& core = cores_.at((i * NUM_CLUSTERS) + j); - mem_rsp_ports.at(i)->bind(&core->MemRspPort); - core->MemReqPort.bind(mem_req_ports.at(j)); + cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort); + core->MemReqPort.bind(cluster_mem_req_ports.at(j)); } } }