Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -1,45 +1,36 @@
+XLEN ?= 32
 DESTDIR ?= .
 RTL_DIR = ../hw/rtl
 THIRD_PARTY_DIR = ../../third_party

-CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
 CXXFLAGS += -I. -I../common -I../../hw
 CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
 CXXFLAGS += -I$(THIRD_PARTY_DIR)
+CXXFLAGS += -DXLEN_$(XLEN)
 CXXFLAGS += $(CONFIGS)

-LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a 
-LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx 
+LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
-SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp
-
-OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
-VPATH := $(sort $(dir $(SRCS)))
-
-#$(info OBJS is $(OBJS))
-#$(info VPATH is $(VPATH))
+SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp

 # Debugigng
 ifdef DEBUG
 	CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
+	#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
 else    
 	CXXFLAGS += -O2 -DNDEBUG
 endif

-# XLEN parameterization
-ifdef XLEN
-	CXXFLAGS += -DXLEN=$(XLEN)
-endif
-
 PROJECT = simx

 all: $(DESTDIR)/$(PROJECT)
 	
 $(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
-	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+	$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@

 $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@
@@ -48,4 +39,4 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
 	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

 clean:
-	rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
+	rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -0,0 +1,87 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+
+#include <cstdlib>
+#include <stdio.h>
+#include "types.h"
+
+namespace vortex {
+
+class Arch {  // Bundle of simulator configuration values, fixed in the constructor and exposed through const getters only.
+private:
+  uint16_t num_threads_;  // copied from the constructor argument
+  uint16_t num_warps_;    // copied from the constructor argument
+  uint16_t num_cores_;    // copied from the constructor argument
+  uint16_t num_clusters_; // copied from the constructor argument
+  uint16_t vsize_;        // not configurable: always initialized to 16
+  uint16_t num_regs_;     // not configurable: always initialized to 32
+  uint16_t num_csrs_;     // not configurable: always initialized to 4096
+  uint16_t num_barriers_; // initialized from the NUM_BARRIERS macro
+  uint16_t ipdom_size_;   // derived from num_threads in the constructor
+  
+public:
+  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)   // only these four values are caller-supplied
+    : num_threads_(num_threads)
+    , num_warps_(num_warps)
+    , num_cores_(num_cores)
+    , num_clusters_(num_clusters)
+    , vsize_(16)
+    , num_regs_(32)
+    , num_csrs_(4096)
+    , num_barriers_(NUM_BARRIERS) // NUM_BARRIERS is not defined in this header; presumably it comes from types.h or the build flags (confirm)
+    , ipdom_size_((num_threads-1) * 2) // NOTE(review): num_threads == 0 gives -2 in int arithmetic, which narrows to 65534; presumably callers never pass 0 (confirm)
+  {}
+
+  uint16_t vsize() const { // returns the fixed value 16 set in the constructor
+    return vsize_; 
+  }
+
+  uint16_t num_regs() const { // returns the fixed value 32 set in the constructor
+    return num_regs_;
+  }
+
+  uint16_t num_csrs() const { // returns the fixed value 4096 set in the constructor
+    return num_csrs_;
+  }
+
+  uint16_t num_barriers() const { // returns the NUM_BARRIERS value captured at construction
+    return num_barriers_;
+  }
+
+  uint16_t ipdom_size() const { // returns (num_threads-1) * 2 as computed at construction
+    return ipdom_size_;
+  }
+
+  uint16_t num_threads() const { // returns the num_threads constructor argument
+    return num_threads_;
+  }
+
+  uint16_t num_warps() const { // returns the num_warps constructor argument
+    return num_warps_;
+  }
+
+  uint16_t num_cores() const { // returns the num_cores constructor argument
+    return num_cores_;
+  }
+  
+  uint16_t num_clusters() const { // returns the num_clusters constructor argument
+    return num_clusters_;
+  }
+};
+
+}
--- a/sim/simx/archdef.h
+++ b/sim/simx/archdef.h
@@ -1,70 +0,0 @@
-#pragma once
-
-#include <string>
-#include <sstream>
-
-#include <cstdlib>
-#include <stdio.h>
-#include "types.h"
-
-namespace vortex {
-
-class ArchDef {  
-private:
-  uint16_t num_cores_;
-  uint16_t num_warps_;
-  uint16_t num_threads_;
-  uint16_t wsize_;
-  uint16_t vsize_;
-  uint16_t num_regs_;
-  uint16_t num_csrs_;
-  uint16_t num_barriers_;
-  
-public:
-  ArchDef(uint16_t num_cores, 
-          uint16_t num_warps, 
-          uint16_t num_threads)   
-    : num_cores_(num_cores)
-    , num_warps_(num_warps)
-    , num_threads_(num_threads)
-    , wsize_(4)
-    , vsize_(16)
-    , num_regs_(32)
-    , num_csrs_(4096)
-    , num_barriers_(NUM_BARRIERS)
-  {}
-
-  uint16_t wsize() const { 
-    return wsize_; 
-  }
-
-  uint16_t vsize() const { 
-    return vsize_; 
-  }
-
-  uint16_t num_regs() const {
-    return num_regs_;
-  }
-
-  uint16_t num_csrs() const {
-    return num_csrs_;
-  }
-
-  uint16_t num_barriers() const {
-    return num_barriers_;
-  }
-
-  uint16_t num_threads() const {
-    return num_threads_;
-  }
-
-  uint16_t num_warps() const {
-    return num_warps_;
-  }
-
-  uint16_t num_cores() const {
-    return num_cores_;
-  }
-};
-
-}
--- a/sim/simx/args.cpp
+++ b/sim/simx/args.cpp
@@ -1,47 +0,0 @@
-#include <iostream>
-#include <string>
-#include "args.h"
-
-using namespace vortex;
-using std::string;
-
-std::string CommandLineArg::helpString_;
-std::unordered_map<string, CommandLineArg *> CommandLineArg::longArgs_;
-std::unordered_map<string, CommandLineArg *> CommandLineArg::shortArgs_;
-
-CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
-  helpString_ += helpText;
-  longArgs_[l] = this;
-  shortArgs_[s] = this;
-}
-
-CommandLineArg::CommandLineArg(string l, const char *helpText) {
-  helpString_ += helpText;
-  longArgs_[l] = this;
-}
-
-void CommandLineArg::readArgs(int argc, char **argv) {
-  for (int i = 0; i < argc; i++) {
-    std::unordered_map<string, CommandLineArg *>::iterator 
-      s = shortArgs_.find(std::string(argv[i])), 
-      l = longArgs_.find(std::string(argv[i]));
-
-    if (s != shortArgs_.end()) {
-      i += s->second->read(argc - i, &argv[i]);
-    } else if (l != longArgs_.end()) {
-      i += l->second->read(argc - i, &argv[i]);
-    } else {
-      throw BadArg(string(argv[i]));
-    }
-  }
-}
-
-void CommandLineArg::clearArgs() {
-  shortArgs_.clear();
-  longArgs_.clear();
-  helpString_ = "";
-}
-
-void CommandLineArg::showHelp(std::ostream &os) {
-  os << helpString_;
-}
--- a/sim/simx/args.h
+++ b/sim/simx/args.h
@@ -1,64 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <unordered_map>
-#include <util.h>
-
-namespace vortex {
-
-struct BadArg { BadArg(std::string s) : arg(s) {} std::string arg; };
-
-class CommandLineArg {
-public:
-  CommandLineArg(std::string s, std::string l, const char *helpText);
-  CommandLineArg(std::string l, const char *helpText);
-  virtual int read(int argc, char** argv) = 0;
-
-  static void readArgs(int argc, char **argv);
-  static void clearArgs();
-  static void showHelp(std::ostream &os);
-
-private:
-  static std::string helpString_;
-  static std::unordered_map<std::string, CommandLineArg *> longArgs_;
-  static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
-};
-
-template <typename T> class CommandLineArgSetter : public CommandLineArg {
-public:
-  CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
-    CommandLineArg(s, l, ht), arg_(x) {}
-
-  CommandLineArgSetter(std::string l, const char *ht, T &x) :
-    CommandLineArg(l, ht), arg_(x) {}
-
-  int read(int argc, char **argv) {
-    __unused (argc);
-    std::istringstream iss(argv[1]);
-    iss >> arg_;
-    return 1;
-  }
-private:
-  T &arg_;
-};
-
-class CommandLineArgFlag : public CommandLineArg {
-public:
-  CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x) :
-    CommandLineArg(s, l, ht), arg_(x) { arg_ = false; }
-
-  CommandLineArgFlag(std::string l, const char *ht, bool &x) :
-    CommandLineArg(l, ht), arg_(x) { arg_ = false; }
-
-  int read(int argc, char **argv) { 
-    __unused (argc, argv);
-    arg_ = true; 
-    return 0; 
-  }
-private:
-  bool &arg_;
-};
-  
-}
--- a/sim/simx/cache.cpp
+++ b/sim/simx/cache.cpp
@@ -1,637 +0,0 @@
-#include "cache.h"
-#include "debug.h"
-#include "types.h"
-#include <util.h>
-#include <unordered_map>
-#include <vector>
-#include <list>
-#include <queue>
-
-using namespace vortex;
-
-struct params_t {
-    uint32_t sets_per_bank;
-    uint32_t blocks_per_set;    
-    uint32_t words_per_block;
-    uint32_t log2_num_inputs;
-
-    uint32_t word_select_addr_start;
-    uint32_t word_select_addr_end;
-
-    uint32_t bank_select_addr_start;
-    uint32_t bank_select_addr_end;
-
-    uint32_t set_select_addr_start;
-    uint32_t set_select_addr_end;
-
-    uint32_t tag_select_addr_start;
-    uint32_t tag_select_addr_end;
-
-    params_t(const Cache::Config& config) {
-        uint32_t bank_bits   = log2ceil(config.num_banks);
-        uint32_t offset_bits = config.B - config.W;
-        uint32_t log2_bank_size  = config.C - bank_bits;
-        uint32_t index_bits  = log2_bank_size - (config.B << config.A);        
-        assert(log2_bank_size >= config.B);   
-
-        this->log2_num_inputs = log2ceil(config.num_inputs);
-
-        this->words_per_block = 1 << offset_bits;
-        this->blocks_per_set  = 1 << config.A;
-        this->sets_per_bank   = 1 << index_bits;
-
-        assert(config.ports_per_bank <= this->words_per_block);
-                
-        // Word select
-        this->word_select_addr_start = config.W;
-        this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
-
-        // Bank select
-        this->bank_select_addr_start = (1+this->word_select_addr_end);
-        this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
-
-        // Set select
-        this->set_select_addr_start = (1+this->bank_select_addr_end);
-        this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
-
-        // Tag select
-        this->tag_select_addr_start = (1+this->set_select_addr_end);
-        this->tag_select_addr_end = (config.addr_width-1);
-    }
-
-    uint32_t addr_bank_id(uint64_t word_addr) const {
-        if (bank_select_addr_end >= bank_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
-        else    
-            return 0;
-    }
-
-    uint32_t addr_set_id(uint64_t word_addr) const {
-        if (set_select_addr_end >= set_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
-        else
-            return 0;
-    }
-
-    uint64_t addr_tag(uint64_t word_addr) const {
-        if (tag_select_addr_end >= tag_select_addr_start)
-            return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
-        else    
-            return 0;
-    }
-    
-    uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
-        uint64_t addr(0);
-        if (bank_select_addr_end >= bank_select_addr_start)            
-            addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
-        if (set_select_addr_end >= set_select_addr_start)
-            addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
-        if (tag_select_addr_end >= tag_select_addr_start)
-            addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
-        return addr;
-    }
-};
-
-struct block_t {
-    bool     valid;
-    bool     dirty;        
-    uint64_t tag;
-    uint32_t lru_ctr;
-};
-
-struct set_t {
-    std::vector<block_t> blocks;    
-    set_t(uint32_t size) : blocks(size) {}
-
-    void clear() {
-        for (auto& block : blocks) {
-            block.valid = false;
-        }
-    }
-};
-
-struct bank_req_info_t {
-    bool     valid;    
-    uint32_t req_id;
-    uint64_t req_tag;
-};
-
-struct bank_req_t {
-    bool valid;
-    bool write;
-    bool mshr_replay;
-    uint64_t tag;
-    uint32_t set_id;
-    uint32_t core_id;
-    uint64_t uuid;
-    std::vector<bank_req_info_t> infos;
-
-    bank_req_t(uint32_t size) 
-        : valid(false)
-        , write(false)
-        , mshr_replay(false)
-        , tag(0)
-        , set_id(0)
-        , core_id(0)
-        , uuid(0)
-        , infos(size)
-    {}
-};
-
-struct mshr_entry_t : public bank_req_t {
-    uint32_t block_id;
-
-    mshr_entry_t(uint32_t size = 0) 
-        : bank_req_t(size) 
-        , block_id(0)
-    {}
-};
-
-class MSHR {
-private:
-    std::vector<mshr_entry_t> entries_;
-    uint32_t size_;
-
-public:    
-    MSHR(uint32_t size)
-        : entries_(size)
-        , size_(0) 
-    {}
-
-    bool empty() const {
-        return (0 == size_);
-    }
-    
-    bool full() const {
-        return (size_ == entries_.size());
-    }
-
-    int lookup(const bank_req_t& bank_req) {
-         for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
-            auto& entry = entries_.at(i);
-            if (entry.valid 
-             && entry.set_id == bank_req.set_id 
-             && entry.tag == bank_req.tag) {
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    int allocate(const bank_req_t& bank_req, uint32_t block_id) {
-        for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
-            auto& entry = entries_.at(i);
-            if (!entry.valid) {
-                *(bank_req_t*)&entry = bank_req;
-                entry.valid = true;
-                entry.mshr_replay = false;
-                entry.block_id = block_id;  
-                ++size_;              
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    mshr_entry_t& replay(uint32_t id) {
-        auto& root_entry = entries_.at(id);
-        assert(root_entry.valid);
-        // make all related mshr entries for replay
-        for (auto& entry : entries_) {
-            if (entry.valid 
-             && entry.set_id == root_entry.set_id 
-             && entry.tag == root_entry.tag) {
-                entry.mshr_replay = true;
-            }
-        }
-        return root_entry;
-    }
-
-    bool pop(bank_req_t* out) {
-        for (auto& entry : entries_) {
-            if (entry.valid && entry.mshr_replay) {
-                *out = entry;
-                entry.valid = false;
-                --size_;
-                return true;
-            }
-        }
-        return false;
-    }
-
-    void clear() {
-        for (auto& entry : entries_) {
-            if (entry.valid && entry.mshr_replay) {
-                entry.valid = false;
-            }
-        }
-        size_ = 0;
-    }
-};
-
-struct bank_t {
-    std::vector<set_t>  sets;    
-    MSHR                mshr;
-
-    bank_t(const Cache::Config& config, 
-           const params_t& params) 
-        : sets(params.sets_per_bank, params.blocks_per_set)
-        , mshr(config.mshr_size)
-    {}
-
-    void clear() {
-        mshr.clear();
-        for (auto& set : sets) {
-            set.clear();
-        }
-    }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-class Cache::Impl {
-private:
-    Cache* const simobject_;
-    Config config_;
-    params_t params_;
-    std::vector<bank_t> banks_;
-    Switch<MemReq, MemRsp>::Ptr mem_switch_;    
-    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
-    std::vector<SimPort<MemReq>> mem_req_ports_;
-    std::vector<SimPort<MemRsp>>  mem_rsp_ports_;
-    uint32_t flush_cycles_;
-    PerfStats perf_stats_;
-    uint64_t pending_read_reqs_;
-    uint64_t pending_write_reqs_;
-    uint64_t pending_fill_reqs_;    
-
-public:
-    Impl(Cache* simobject, const Config& config) 
-        : simobject_(simobject)
-        , config_(config)
-        , params_(config)
-        , banks_(config.num_banks, {config, params_})
-        , mem_req_ports_(config.num_banks, simobject)
-        , mem_rsp_ports_(config.num_banks, simobject)
-    {
-        bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
-        bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
-        simobject->MemRspPort.bind(&bypass_switch_->RspIn);
-
-        if (config.num_banks > 1) {
-            mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
-            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
-                mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
-                mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
-            }    
-            mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
-            bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
-        } else {
-            mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
-            bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
-        }
-
-        // calculate tag flush cycles
-        flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
-    }
-
-    void reset() {
-        for (auto& bank : banks_) {
-            bank.clear();
-        }
-        perf_stats_ = PerfStats();
-        pending_read_reqs_ = 0;
-        pending_write_reqs_ = 0;
-        pending_fill_reqs_ = 0;
-    }
-
-    void tick() {
-        // wait on flush cycles
-        if (flush_cycles_ != 0) {
-            --flush_cycles_;
-            return;
-        }
-
-        // per-bank pipeline request
-        std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
-
-        // calculate memory latency
-        perf_stats_.mem_latency += pending_fill_reqs_;
-
-        // handle bypasss responses
-        auto& bypass_port = bypass_switch_->RspOut.at(1);            
-        if (!bypass_port.empty()) {
-            auto& mem_rsp = bypass_port.front();
-            uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);                
-            uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
-            MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
-            simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
-            DT(3, simobject_->name() << "-" << core_rsp);
-            bypass_port.pop();
-        }        
-
-        // handle MSHR replay
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& bank = banks_.at(bank_id);
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-            bank.mshr.pop(&pipeline_req);
-        }       
-
-        // handle memory fills
-        std::vector<bool> pending_fill_req(config_.num_banks, false);
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
-            if (!mem_rsp_port.empty()) {
-                auto& mem_rsp = mem_rsp_port.front();
-                this->processMemoryFill(bank_id, mem_rsp.tag);                
-                pending_fill_req.at(bank_id) = true;
-                mem_rsp_port.pop();
-            }
-        }
-        
-        // handle incoming core requests
-        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
-            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);            
-            if (core_req_port.empty())
-                continue;
-
-            auto& core_req = core_req_port.front();
-
-            // check cache bypassing
-            if (core_req.non_cacheable) {
-                // send IO request
-                this->processIORequest(core_req, req_id);
-
-                // remove request
-                core_req_port.pop();
-                continue;
-            }
-
-            auto bank_id = params_.addr_bank_id(core_req.addr);
-            auto set_id  = params_.addr_set_id(core_req.addr);
-            auto tag     = params_.addr_tag(core_req.addr);
-            auto port_id = req_id % config_.ports_per_bank;
-            
-            // create bank request
-            bank_req_t bank_req(config_.ports_per_bank);
-            bank_req.valid = true;
-            bank_req.write = core_req.write;
-            bank_req.mshr_replay = false;
-            bank_req.tag = tag;            
-            bank_req.set_id = set_id;       
-            bank_req.core_id = core_req.core_id;
-            bank_req.uuid = core_req.uuid;
-            bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
-
-            auto& bank = banks_.at(bank_id);            
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-
-            // check pending MSHR replay
-            if (pipeline_req.valid 
-             && pipeline_req.mshr_replay) {
-                 // stall
-                continue;
-            }    
-
-            // check pending fill request
-            if (pending_fill_req.at(bank_id)) {
-                // stall
-                continue;
-            }
-            
-            // check MSHR capacity if read or writeback
-            if ((!core_req.write || !config_.write_through)
-             && bank.mshr.full()) {
-                ++perf_stats_.mshr_stalls;
-                continue;
-            }    
-
-            // check bank conflicts
-            if (pipeline_req.valid) {
-                // check port conflict
-                if (pipeline_req.write != core_req.write
-                 || pipeline_req.set_id != set_id
-                 || pipeline_req.tag != tag
-                 || pipeline_req.infos[port_id].valid) {
-                    ++perf_stats_.bank_stalls;
-                    continue;
-                }
-                // update pending request infos
-                pipeline_req.infos[port_id] = bank_req.infos[port_id];
-            } else {
-                // schedule new request
-                pipeline_req = bank_req;
-            }
-
-            if (core_req.write)
-                ++perf_stats_.writes;
-            else
-                ++perf_stats_.reads;
-
-            // remove request
-            auto time = core_req_port.pop();
-            perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
-        }
-    
-        // process active request        
-        this->processBankRequest(pipeline_reqs);
-    } 
-
-    const PerfStats& perf_stats() const {
-        return perf_stats_;
-    }
-
-private:
-    
-    void processIORequest(const MemReq& core_req, uint32_t req_id) {
-        {
-            MemReq mem_req(core_req);
-            mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
-            bypass_switch_->ReqIn.at(1).send(mem_req, 1);
-            DT(3, simobject_->name() << "-" << mem_req);
-        }
-
-        if (core_req.write && config_.write_reponse) {
-            MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
-            simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);            
-            DT(3, simobject_->name() << "-" << core_rsp);
-        }
-    }
-
-    void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
-        // update block
-        auto& bank  = banks_.at(bank_id);
-        auto& entry = bank.mshr.replay(mshr_id);
-        auto& set   = bank.sets.at(entry.set_id);
-        auto& block = set.blocks.at(entry.block_id);
-        block.valid = true;
-        block.tag   = entry.tag;
-        --pending_fill_reqs_;
-    }
-
-    void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-            if (!pipeline_req.valid)
-                continue;
-
-            auto& bank = banks_.at(bank_id);
-            auto& set = bank.sets.at(pipeline_req.set_id);
-
-            if (pipeline_req.mshr_replay) {
-                // send core response
-                for (auto& info : pipeline_req.infos) {
-                    MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                    simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);  
-                    DT(3, simobject_->name() << "-" << core_rsp);         
-                }
-            } else {        
-                bool hit = false;
-                bool found_free_block = false;            
-                uint32_t hit_block_id = 0;
-                uint32_t repl_block_id = 0;            
-                uint32_t max_cnt = 0;
-                
-                for (uint32_t i = 0, n = set.blocks.size(); i < n; ++i) {
-                    auto& block = set.blocks.at(i);
-                    if (block.valid) {
-                        if (block.tag == pipeline_req.tag) {
-                            block.lru_ctr = 0;                        
-                            hit_block_id = i;
-                            hit = true;
-                        } else {
-                            ++block.lru_ctr;
-                        }
-                        if (max_cnt < block.lru_ctr) {
-                            max_cnt = block.lru_ctr;
-                            repl_block_id = i;
-                        }
-                    } else {                    
-                        found_free_block = true;
-                        repl_block_id = i;
-                    }
-                }
-
-                if (hit) {     
-                    //
-                    // Hit handling   
-                    //                
-                    if (pipeline_req.write) {
-                        // handle write hit
-                        auto& hit_block = set.blocks.at(hit_block_id);
-                        if (config_.write_through) {
-                            // forward write request to memory
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                        } else {
-                            // mark block as dirty
-                            hit_block.dirty = true;
-                        }
-                    }
-                    // send core response
-                    if (!pipeline_req.write || config_.write_reponse) {
-                        for (auto& info : pipeline_req.infos) {     
-                            MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                            simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
-                            DT(3, simobject_->name() << "-" << core_rsp);
-                        }
-                    }
-                } else {     
-                    //
-                    // Miss handling   
-                    //
-                    if (pipeline_req.write)
-                        ++perf_stats_.write_misses;
-                    else
-                        ++perf_stats_.read_misses;
-
-                    if (!found_free_block && !config_.write_through) {
-                        // write back dirty block
-                        auto& repl_block = set.blocks.at(repl_block_id);
-                        if (repl_block.dirty) {                       
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                            ++perf_stats_.evictions;
-                        }
-                    }
-
-                    if (pipeline_req.write && config_.write_through) {
-                        // forward write request to memory
-                        {
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                        }
-                        // send core response
-                        if (config_.write_reponse) {
-                            for (auto& info : pipeline_req.infos) {         
-                                MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                                simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
-                                DT(3, simobject_->name() << "-" << core_rsp);
-                            }
-                        }
-                    } else {
-                        // MSHR lookup
-                        int pending = bank.mshr.lookup(pipeline_req);
-
-                        // allocate MSHR
-                        int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
-                        
-                        // send fill request
-                        if (pending == -1) {
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
-                            mem_req.write = false;
-                            mem_req.tag   = mshr_id;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                            ++pending_fill_reqs_;
-                        }
-                    }
-                }
-            }
-        }
-    }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-Cache::Cache(const SimContext& ctx, const char* name, const Config& config) 
-    : SimObject<Cache>(ctx, name)    
-    , CoreReqPorts(config.num_inputs, this)
-    , CoreRspPorts(config.num_inputs, this)
-    , MemReqPort(this)
-    , MemRspPort(this)
-    , impl_(new Impl(this, config))
-{}
-
-Cache::~Cache() {
-    delete impl_;
-}
-
-void Cache::reset() {
-    impl_->reset();
-}
-
-void Cache::tick() {
-    impl_->tick();
-}
-
-const Cache::PerfStats& Cache::perf_stats() const {
-    return impl_->perf_stats();
-}
--- a/sim/simx/cache_cluster.h
+++ b/sim/simx/cache_cluster.h
@@ -0,0 +1,106 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cache_sim.h"
+
+namespace vortex {
+
+class CacheCluster : public SimObject<CacheCluster> {
+public:
+    std::vector<std::vector<SimPort<MemReq>>> CoreReqPorts; // request inputs, indexed [unit][request]
+    std::vector<std::vector<SimPort<MemRsp>>> CoreRspPorts; // response outputs, indexed [unit][request]
+    SimPort<MemReq> MemReqPort; // request port shared by all caches (output of the cache arbiter)
+    SimPort<MemRsp> MemRspPort; // response port shared by all caches (input of the cache arbiter)
+    // Creates the caches and wires: unit ports -> unit arbiters -> mem arbiters -> caches -> cache arbiter.
+    CacheCluster(const SimContext& ctx,
+                 const char* name,
+                 uint32_t num_units,
+                 uint32_t num_caches,
+                 uint32_t num_requests,
+                 const CacheSim::Config& config)
+        : SimObject(ctx, name)
+        , CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
+        , CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
+        , MemReqPort(this)
+        , MemRspPort(this)
+        , caches_(MAX(num_caches, 0x1)) {
+        // num_caches == 0 still instantiates one CacheSim, configured in bypass mode
+        CacheSim::Config config2(config);
+        if (0 == num_caches) {
+            num_caches = 1;
+            config2.bypass = true;
+        }
+
+        char sname[100];
+        // stage 1: one switch per unit, num_requests inputs -> config.num_inputs outputs
+        std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
+        for (uint32_t u = 0; u < num_units; ++u) {
+            snprintf(sname, 100, "%s-unit-arb-%u", name, u); // %u: the index is unsigned
+            unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
+            for (uint32_t i = 0; i < num_requests; ++i) {
+                this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
+                unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
+            }
+        }
+        // stage 2: one switch per cache input, num_units inputs -> num_caches outputs
+        std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
+        for (uint32_t i = 0; i < config.num_inputs; ++i) {
+            snprintf(sname, 100, "%s-mem-arb-%u", name, i);
+            mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
+            for (uint32_t u = 0; u < num_units; ++u) {
+                unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
+                mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
+            }
+        }
+        // stage 3: a single switch merging the memory ports of all caches
+        snprintf(sname, 100, "%s-cache-arb", name);
+        auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
+
+        for (uint32_t i = 0; i < num_caches; ++i) {
+            snprintf(sname, 100, "%s-cache%u", name, i);
+            caches_.at(i) = CacheSim::Create(sname, config2);
+
+            for (uint32_t j = 0; j < config.num_inputs; ++j) {
+                mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
+                caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
+            }
+
+            caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
+            cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
+        }
+
+        cache_arb->ReqOut.at(0).bind(&this->MemReqPort);
+        this->MemRspPort.bind(&cache_arb->RspOut.at(0));
+    }
+
+    ~CacheCluster() {}
+
+    void reset() {}
+
+    void tick() {}
+    // Returns the sum of the statistics of every cache in the cluster.
+    CacheSim::PerfStats perf_stats() const {
+        CacheSim::PerfStats perf;
+        for (const auto& cache : caches_) { // iterate by reference: no per-iteration copy of the Ptr handle
+            perf += cache->perf_stats();
+        }
+        return perf;
+    }
+
+private:
+    std::vector<CacheSim::Ptr> caches_; // at least one entry, see the constructor
+};
+
+}
--- a/sim/simx/cache_sim.cpp
+++ b/sim/simx/cache_sim.cpp
@@ -0,0 +1,707 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cache_sim.h"
+#include "debug.h"
+#include "types.h"
+#include <util.h>
+#include <unordered_map>
+#include <vector>
+#include <list>
+#include <queue>
+
+using namespace vortex;
+
+struct params_t { // geometry and address-field layout derived once from CacheSim::Config
+    uint32_t sets_per_bank;     // 1 << index_bits
+    uint32_t lines_per_set;     // 1 << config.A
+    uint32_t words_per_line;    // 1 << (config.B - config.W)
+    uint32_t log2_num_inputs;   // log2ceil(config.num_inputs)
+    // inclusive [start, end] bit positions of each address field, lowest field first
+    uint32_t word_select_addr_start;
+    uint32_t word_select_addr_end;
+
+    uint32_t bank_select_addr_start;
+    uint32_t bank_select_addr_end;
+
+    uint32_t set_select_addr_start;
+    uint32_t set_select_addr_end;
+
+    uint32_t tag_select_addr_start;
+    uint32_t tag_select_addr_end;
+    // Computes the geometry and the field positions; asserts that the configuration is consistent.
+    params_t(const CacheSim::Config& config) {
+        int32_t bank_bits = log2ceil(config.num_banks);
+        int32_t offset_bits = config.B - config.W; // log2 of words per line
+        int32_t log2_bank_size = config.C - bank_bits;
+        int32_t index_bits = log2_bank_size - (config.B + config.A); // log2 of sets per bank
+        assert(log2_bank_size > 0);
+        assert(offset_bits >= 0);
+        assert(index_bits >= 0);
+
+        this->log2_num_inputs = log2ceil(config.num_inputs);
+
+        this->words_per_line = 1 << offset_bits;
+        this->lines_per_set  = 1 << config.A;
+        this->sets_per_bank   = 1 << index_bits;
+
+        assert(config.ports_per_bank <= this->words_per_line);
+                
+        // Word select
+        this->word_select_addr_start = config.W;
+        this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
+
+        // Bank select
+        this->bank_select_addr_start = (1+this->word_select_addr_end);
+        this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
+
+        // Set select
+        this->set_select_addr_start = (1+this->bank_select_addr_end);
+        this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
+
+        // Tag select
+        this->tag_select_addr_start = (1+this->set_select_addr_end);
+        this->tag_select_addr_end = (config.addr_width-1);
+    }
+    // Extracts the bank index of an address; 0 when the bank field is empty (end < start).
+    uint32_t addr_bank_id(uint64_t word_addr) const {
+        if (bank_select_addr_end >= bank_select_addr_start)
+            return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
+        else    
+            return 0;
+    }
+    // Extracts the set index of an address; 0 when the set field is empty.
+    uint32_t addr_set_id(uint64_t word_addr) const {
+        if (set_select_addr_end >= set_select_addr_start)
+            return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
+        else
+            return 0;
+    }
+    // Extracts the tag of an address; 0 when the tag field is empty.
+    uint64_t addr_tag(uint64_t word_addr) const {
+        if (tag_select_addr_end >= tag_select_addr_start)
+            return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
+        else    
+            return 0;
+    }
+    // Rebuilds an address from bank, set and tag; the bits below the bank field are left at 0.
+    uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
+        uint64_t addr(0);
+        if (bank_select_addr_end >= bank_select_addr_start)            
+            addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
+        if (set_select_addr_end >= set_select_addr_start)
+            addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
+        if (tag_select_addr_end >= tag_select_addr_start)
+            addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
+        return addr;
+    }
+};
+
+struct line_t {  // state of one cache line (metadata only, no data payload is stored)
+    uint64_t tag;
+    uint32_t lru_ctr;  // reset to 0 on a hit, incremented on a lookup that matches another tag
+    bool     valid;
+    bool     dirty;
+    // Invalidates the line; tag and lru_ctr are left untouched.
+    void clear() {
+        valid = false;
+        dirty = false;
+    }
+};
+
+struct set_t { // one cache set: a fixed number of lines (ways)
+    std::vector<line_t> lines;
+    // Creates a set holding num_ways lines.
+    set_t(uint32_t num_ways) 
+        : lines(num_ways) 
+    {}
+    // Invalidates every line of the set.
+    void clear() {
+        for (auto& line : lines) {
+            line.clear();
+        }
+    }
+};
+
+struct bank_req_port_t { // one core request attached to a port of a bank request
+    uint32_t req_id;   // index of the core request/response port the request came from
+    uint64_t req_tag;  // tag of the core request, echoed in the core response
+    bool     valid;    // false when this port slot is unused
+    // Marks the slot as unused; req_id and req_tag are left untouched.
+    void clear() {
+        valid = false;   
+    }
+};
+
+struct bank_req_t { // request processed by one bank in one tick
+    // Kind of request occupying a bank pipeline slot or an MSHR entry (None = free/idle).
+    enum ReqType {
+        None   = 0,
+        Fill   = 1,
+        Replay = 2,        
+        Core   = 3
+    };
+
+    std::vector<bank_req_port_t> ports; // one slot per bank port
+    uint64_t tag    = 0;                // line tag (for Fill: the memory response tag)
+    uint32_t set_id = 0;
+    uint32_t cid    = 0;
+    uint64_t uuid   = 0;
+    ReqType  type   = ReqType::None;    // default None so a fresh request never reads as active
+    bool     write  = false;
+    // Creates an idle request with num_ports port slots; scalar members start from defined values.
+    bank_req_t(uint32_t num_ports)
+        : ports(num_ports) 
+    {}
+    // Releases all port slots and marks the request idle; the other fields keep their values.
+    void clear() {
+        for (auto& port : ports) {
+            port.clear();
+        }
+        type = ReqType::None;
+    }
+};
+
+struct mshr_entry_t { // one MSHR slot: the missing request plus the line chosen to receive the fill
+    bank_req_t bank_req;
+    uint32_t   line_id;  // index of the line within the set, as passed to MSHR::allocate
+    // Creates an entry whose request has num_ports port slots.
+    mshr_entry_t(uint32_t num_ports) 
+        : bank_req(num_ports) 
+    {}
+    // Frees the entry by clearing its request (type becomes None).
+    void clear() {
+        bank_req.clear();
+    }
+};
+
+class MSHR { // fixed-capacity table of outstanding misses; an entry is free when its type is None
+private:
+    std::vector<mshr_entry_t> entries_;
+    uint32_t size_; // number of allocated entries
+
+public:    
+    MSHR(uint32_t size, uint32_t num_ports)
+        : entries_(size, num_ports)
+        , size_(0) 
+    {}
+
+    bool empty() const {
+        return (0 == size_);
+    }
+    
+    bool full() const {
+        return (size_ == entries_.size());
+    }
+    // Returns true when an allocated entry already targets the same set and tag as bank_req.
+    bool lookup(const bank_req_t& bank_req) const {
+        for (const auto& entry : entries_) {
+            if (entry.bank_req.type != bank_req_t::None
+             && entry.bank_req.set_id == bank_req.set_id 
+             && entry.bank_req.tag == bank_req.tag) {
+                return true;
+            }
+        }
+        return false;
+    }
+    // Stores bank_req in the first free entry and returns its index, or -1 when no entry is free.
+    int allocate(const bank_req_t& bank_req, uint32_t line_id) {
+        for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
+            auto& entry = entries_.at(i);
+            if (entry.bank_req.type == bank_req_t::None) {
+                entry.bank_req = bank_req;
+                entry.line_id = line_id;  
+                ++size_;              
+                return i;
+            }
+        }
+        return -1;
+    }
+    // Marks entry id and every other Core entry with the same set and tag as Replay; returns entry id.
+    mshr_entry_t& replay(uint32_t id) {
+        auto& root_entry = entries_.at(id);
+        assert(root_entry.bank_req.type == bank_req_t::Core);
+        // mark all related mshr entries for replay
+        for (auto& entry : entries_) {
+            if (entry.bank_req.type == bank_req_t::Core 
+             && entry.bank_req.set_id == root_entry.bank_req.set_id 
+             && entry.bank_req.tag == root_entry.bank_req.tag) {
+                entry.bank_req.type = bank_req_t::Replay;
+            }
+        }
+        return root_entry;
+    }
+    // Copies the first Replay entry into *out (type still Replay) and frees it; false if there is none.
+    bool pop(bank_req_t* out) {
+        for (auto& entry : entries_) {
+            if (entry.bank_req.type == bank_req_t::Replay) {
+                *out = entry.bank_req;
+                entry.bank_req.type = bank_req_t::None;
+                --size_;
+                return true;
+            }
+        }
+        return false;
+    }
+    // Frees every entry.
+    void clear() {
+        for (auto& entry : entries_) {
+            entry.clear();
+        }
+        size_ = 0;
+    }
+};
+
+struct bank_t { // one cache bank: its sets plus its MSHR
+    std::vector<set_t> sets;    
+    MSHR               mshr;
+    // Sizes the bank: sets_per_bank sets of lines_per_set lines, MSHR of mshr_size entries.
+    bank_t(const CacheSim::Config& config, 
+           const params_t& params) 
+        : sets(params.sets_per_bank, params.lines_per_set)
+        , mshr(config.mshr_size, config.ports_per_bank)
+    {}
+    // Invalidates all lines and frees all MSHR entries.
+    void clear() {        
+        for (auto& set : sets) {
+            set.clear();
+        }
+        mshr.clear();
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class CacheSim::Impl { // private implementation of CacheSim
+private:
+    CacheSim* const simobject_;                   // the CacheSim whose core/memory ports are driven
+    Config config_;
+    params_t params_;                             // geometry and address layout derived from config_
+    std::vector<bank_t> banks_;
+    Switch<MemReq, MemRsp>::Ptr bank_switch_;     // merges the per-bank memory ports (created only if num_banks > 1)
+    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
+    std::vector<SimPort<MemReq>> mem_req_ports_;  // one per bank
+    std::vector<SimPort<MemRsp>> mem_rsp_ports_;  // one per bank
+    std::vector<bank_req_t> pipeline_reqs_;       // request selected for each bank in the current tick
+    uint32_t init_cycles_ = 0;                    // defined even on the bypass path, where the constructor returns early
+    PerfStats perf_stats_;
+    uint64_t pending_read_reqs_ = 0;              // counters start defined even if tick() runs before reset()
+    uint64_t pending_write_reqs_ = 0;
+    uint64_t pending_fill_reqs_ = 0;              // fill requests sent to memory and not yet answered
+
+public:
+    Impl(CacheSim* simobject, const Config& config) // sizes the banks and wires the internal switches
+        : simobject_(simobject)
+        , config_(config)
+        , params_(config)
+        , banks_(config.num_banks, {config, params_})
+        , mem_req_ports_(config.num_banks, simobject)
+        , mem_rsp_ports_(config.num_banks, simobject)
+        , pipeline_reqs_(config.num_banks, config.ports_per_bank)
+    {
+        char sname[100];
+        snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
+        // bypass mode: core ports are connected straight to the memory ports through one switch
+        if (config_.bypass) {            
+            bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);            
+            for (uint32_t i = 0; i < config_.num_inputs; ++i) {
+               simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
+               bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
+            }
+            bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
+            simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
+            return;
+        }
+        // normal mode: 2-input priority switch; input 0 = bank traffic, input 1 = IO bypass traffic
+        bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
+        bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
+        simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
+
+        if (config.num_banks > 1) {
+            snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
+            bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
+            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
+                mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
+                bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
+            }    
+            bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0));
+            bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0));
+        } else {
+            mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); // single bank: no bank switch needed
+            bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0));
+        }
+
+        // calculate cache initialization cycles
+        init_cycles_ = params_.sets_per_bank * params_.lines_per_set;
+    }
+
+    void reset() { // clears all banks, the statistics and the pending-request counters
+        if (config_.bypass)
+            return; // nothing to reset in bypass mode
+
+        for (auto& bank : banks_) {
+            bank.clear();
+        }
+        perf_stats_ = PerfStats();
+        pending_read_reqs_  = 0;
+        pending_write_reqs_ = 0;
+        pending_fill_reqs_  = 0;
+    }
+
+    void tick() { // one simulation cycle: select at most one request per bank, then process it
+        if (config_.bypass)
+            return; // bypass mode: the switch wired in the constructor carries all traffic
+
+        // wait on cache initialization cycles
+        if (init_cycles_ != 0) {
+            --init_cycles_;
+            return;
+        }
+
+        // handle cache bypass responses
+        {
+            auto& bypass_port = bypass_switch_->RspIn.at(1); // input 1 carries the IO bypass traffic
+            if (!bypass_port.empty()) {
+                auto& mem_rsp = bypass_port.front();
+                this->processBypassResponse(mem_rsp);
+                bypass_port.pop();
+            }
+        }
+
+        // initialize pipeline request
+        for (auto& pipeline_req : pipeline_reqs_) {
+            pipeline_req.clear();
+        }
+
+        // schedule MSHR replay
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& bank = banks_.at(bank_id);
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            bank.mshr.pop(&pipeline_req); // leaves pipeline_req untouched when nothing is ready to replay
+        }
+
+        // schedule memory fill
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
+            if (mem_rsp_port.empty())
+                continue;
+
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            if (pipeline_req.type != bank_req_t::None)
+                continue; // bank already taken by a replay; the response stays queued
+
+            auto& mem_rsp = mem_rsp_port.front();            
+            DT(3, simobject_->name() << "-dram-" << mem_rsp);
+            pipeline_req.type = bank_req_t::Fill;
+            pipeline_req.tag = mem_rsp.tag; // for a Fill the tag field carries the memory response tag
+            mem_rsp_port.pop();
+        }
+
+        // schedule core requests        
+        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
+            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
+            if (core_req_port.empty())
+                continue;
+
+            auto& core_req = core_req_port.front();
+
+            // check cache bypassing
+            if (core_req.type == AddrType::IO) {
+                // send bypass request
+                this->processBypassRequest(core_req, req_id);
+                // remove request
+                core_req_port.pop();
+                continue;
+            }
+
+            auto bank_id = params_.addr_bank_id(core_req.addr);
+            auto set_id  = params_.addr_set_id(core_req.addr);
+            auto tag     = params_.addr_tag(core_req.addr);
+            auto port_id = req_id % config_.ports_per_bank;
+
+            auto& bank = banks_.at(bank_id);
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+
+            // check MSHR capacity
+            if ((!core_req.write || !config_.write_through) // write-through writes never allocate an MSHR entry
+             && bank.mshr.full()) {
+                ++perf_stats_.mshr_stalls;
+                ++perf_stats_.bank_stalls;
+                continue;
+            }            
+
+            // check bank conflicts
+            if (pipeline_req.type == bank_req_t::Core) {
+                // check port conflict
+                if (pipeline_req.write != core_req.write
+                 || pipeline_req.set_id != set_id
+                 || pipeline_req.tag != tag
+                 || pipeline_req.ports.at(port_id).valid) {
+                    ++perf_stats_.bank_stalls;
+                    continue;
+                }
+                // extend request ports
+                pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
+            } else if (pipeline_req.type == bank_req_t::None) {
+                // schedule new request
+                bank_req_t bank_req(config_.ports_per_bank);
+                bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
+                bank_req.tag   = tag;            
+                bank_req.set_id = set_id;       
+                bank_req.cid   = core_req.cid;
+                bank_req.uuid  = core_req.uuid;
+                bank_req.type  = bank_req_t::Core;
+                bank_req.write = core_req.write;
+                pipeline_req   = bank_req;
+            } else {
+                // bank in use
+                ++perf_stats_.bank_stalls;
+                continue;
+            }
+
+            if (core_req.write)
+                ++perf_stats_.writes;
+            else
+                ++perf_stats_.reads;
+
+            // remove request
+            DT(3, simobject_->name() << "-core-" << core_req);
+            auto time = core_req_port.pop(); // NOTE(review): pop() presumably returns the cycle the request was queued - confirm
+            perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
+        }
+    
+        // process active request        
+        this->processBankRequests();
+    } 
+
+    const PerfStats& perf_stats() const { // read-only access to the accumulated statistics
+        return perf_stats_;
+    }
+
+private:
+    
+    void processBypassResponse(const MemRsp& mem_rsp) { // routes a bypass response back to its core port
+        uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); // low bits: originating input index
+        uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; // remaining bits: the original core tag
+        MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid};
+        simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
+        DT(3, simobject_->name() << "-core-" << core_rsp);
+    }
+
+    void processBypassRequest(const MemReq& core_req, uint32_t req_id) { // forwards a request around the cache
+        DT(3, simobject_->name() << "-core-" << core_req);
+
+        {
+            MemReq mem_req(core_req);
+            mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id; // pack the input index into the low tag bits
+            bypass_switch_->ReqIn.at(1).send(mem_req, 1);
+            DT(3, simobject_->name() << "-dram-" << mem_req);
+        }
+        // writes are acknowledged immediately when write responses are enabled
+        if (core_req.write && config_.write_reponse) {
+            MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
+            simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);            
+            DT(3, simobject_->name() << "-core-" << core_rsp);
+        }
+    }
+
+    void processBankRequests() { // executes the request that tick() selected for each bank
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& bank = banks_.at(bank_id);
+            const auto& pipeline_req = pipeline_reqs_.at(bank_id); // by reference: avoids copying the ports vector per bank per tick
+            // dispatch on the kind of request occupying the bank
+            switch (pipeline_req.type) {
+            case bank_req_t::None:
+                break;
+            case bank_req_t::Fill: {
+                // update cache line
+                // for a Fill, pipeline_req.tag is the memory response tag, i.e. the MSHR entry id
+                auto& entry = bank.mshr.replay(pipeline_req.tag);
+                auto& set   = bank.sets.at(entry.bank_req.set_id);
+                auto& line  = set.lines.at(entry.line_id);
+                line.valid  = true;
+                line.tag    = entry.bank_req.tag;
+                --pending_fill_reqs_;
+            } break;
+            case bank_req_t::Replay: {
+                // send core response
+                if (!pipeline_req.write || config_.write_reponse) {
+                    for (auto& info : pipeline_req.ports) {
+                        if (!info.valid)
+                            continue;
+                        MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                        simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
+                        DT(3, simobject_->name() << "-core-" << core_rsp);
+                    }
+                }
+            } break;
+            case bank_req_t::Core: {
+                bool hit = false;
+                bool found_free_line = false;
+                uint32_t hit_line_id = 0;
+                uint32_t repl_line_id = 0;
+                uint32_t max_cnt = 0;
+
+                auto& set = bank.sets.at(pipeline_req.set_id);
+
+                // tag lookup: also ages the other valid lines and selects the replacement candidate
+                for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
+                    auto& line = set.lines.at(i);
+                    if (line.valid) {
+                        if (line.tag == pipeline_req.tag) {
+                            line.lru_ctr = 0;
+                            hit_line_id = i;
+                            hit = true;
+                        } else {
+                            ++line.lru_ctr;
+                        }
+                        if (!found_free_line && max_cnt < line.lru_ctr) { // a free line always wins over evicting a valid one
+                            max_cnt = line.lru_ctr;
+                            repl_line_id = i;
+                        }
+                    } else {
+                        found_free_line = true;
+                        repl_line_id = i;
+                    }
+                }
+
+                if (hit) {
+                    //
+                    // Hit handling
+                    //
+                    if (pipeline_req.write) {
+                        // handle write hit
+                        auto& hit_line = set.lines.at(hit_line_id);
+                        if (config_.write_through) {
+                            // forward write request to memory
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                        } else {
+                            // mark line as dirty
+                            hit_line.dirty = true;
+                        }
+                    }
+                    // send core response
+                    if (!pipeline_req.write || config_.write_reponse) {
+                        for (auto& info : pipeline_req.ports) {
+                            if (!info.valid)
+                                continue;
+                            MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                            simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
+                            DT(3, simobject_->name() << "-core-" << core_rsp);
+                        }
+                    }
+                } else {
+                    //
+                    // Miss handling
+                    //
+                    if (pipeline_req.write)
+                        ++perf_stats_.write_misses;
+                    else
+                        ++perf_stats_.read_misses;
+
+                    if (!found_free_line && !config_.write_through) {
+                        // write back dirty line
+                        auto& repl_line = set.lines.at(repl_line_id);
+                        if (repl_line.dirty) {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                            ++perf_stats_.evictions;
+                        }
+                    }
+
+                    if (pipeline_req.write && config_.write_through) {
+                        // forward write request to memory
+                        {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                        }
+                        // send core response
+                        if (config_.write_reponse) {
+                            for (auto& info : pipeline_req.ports) {
+                                if (!info.valid)
+                                    continue;
+                                MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                                simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
+                                DT(3, simobject_->name() << "-core-" << core_rsp);
+                            }
+                        }
+                    } else {
+                        // MSHR lookup (must precede allocate, which would make the lookup match itself)
+                        auto mshr_pending = bank.mshr.lookup(pipeline_req);
+
+                        // allocate MSHR
+                        auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
+
+                        // send fill request, unless one is already in flight for the same line
+                        if (!mshr_pending) {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = false;
+                            mem_req.tag   = mshr_id;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                            ++pending_fill_reqs_;
+                        }
+                    }
+                }
+            } break;
+            }
+        }
+        // calculate memory latency: each outstanding fill contributes one cycle per tick
+        perf_stats_.mem_latency += pending_fill_reqs_;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config) // creates the ports, then the implementation
+    : SimObject<CacheSim>(ctx, name)    
+    , CoreReqPorts(config.num_inputs, this)
+    , CoreRspPorts(config.num_inputs, this)
+    , MemReqPort(this)
+    , MemRspPort(this)
+    , impl_(new Impl(this, config)) // owned raw pointer, released in the destructor
+{}
+
+CacheSim::~CacheSim() { // releases the implementation object allocated in the constructor
+    delete impl_;
+}
+
+void CacheSim::reset() { // forwards to Impl::reset()
+    impl_->reset();
+}
+
+void CacheSim::tick() { // forwards to Impl::tick()
+    impl_->tick();
+}
+
+const CacheSim::PerfStats& CacheSim::perf_stats() const { // forwards to Impl::perf_stats()
+    return impl_->perf_stats();
+}
--- a/sim/simx/cache_sim.h
+++ b/sim/simx/cache_sim.h
@@ -1,13 +1,27 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
-#include "memsim.h"
+#include "mem_sim.h"

 namespace vortex {

-class Cache : public SimObject<Cache> {
+class CacheSim : public SimObject<CacheSim> {
 public:
    struct Config {
+        bool    bypass;         // cache bypass
        uint8_t C;              // log2 cache size
        uint8_t B;              // log2 block size
        uint8_t W;              // log2 word size
@@ -45,6 +59,19 @@ public:
            , mshr_stalls(0)
            , mem_latency(0)
        {}
+
+        PerfStats& operator+=(const PerfStats& rhs) { // adds every counter of rhs to this object
+            this->reads += rhs.reads;
+            this->writes += rhs.writes;
+            this->read_misses += rhs.read_misses;
+            this->write_misses += rhs.write_misses;
+            this->evictions += rhs.evictions;
+            this->pipeline_stalls += rhs.pipeline_stalls;
+            this->bank_stalls += rhs.bank_stalls;
+            this->mshr_stalls += rhs.mshr_stalls;
+            this->mem_latency += rhs.mem_latency;
+            return *this; // allows chaining
+        }
    };

    std::vector<SimPort<MemReq>> CoreReqPorts;
@@ -52,8 +79,8 @@ public:
    SimPort<MemReq>              MemReqPort;
    SimPort<MemRsp>              MemRspPort;

-    Cache(const SimContext& ctx, const char* name, const Config& config);
-    ~Cache();
+    CacheSim(const SimContext& ctx, const char* name, const Config& config);
+    ~CacheSim();

    void reset();
    
--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@@ -0,0 +1,219 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cluster.h"
+
+using namespace vortex;
+
+// Builds one cluster: an L2 cache, one instruction-cache group and one
+// data-cache group, one shared-memory block per core, and the cores
+// themselves, then binds all request/response ports together.
+//
+//   ctx        - simulation context forwarded to SimObject
+//   cluster_id - index of this cluster; used in object names and to derive
+//                the global id of each core
+//   processor  - owning processor, kept as a raw pointer
+//   arch       - architecture description (core, warp and barrier counts)
+//   dcrs       - device configuration registers forwarded to every core
+Cluster::Cluster(const SimContext& ctx,
+                 uint32_t cluster_id,
+                 ProcessorImpl* processor,
+                 const Arch &arch,
+                 const DCRS &dcrs)
+  : SimObject(ctx, "cluster")
+  , mem_req_port(this)
+  , mem_rsp_port(this)
+  , cluster_id_(cluster_id)
+  , cores_(arch.num_cores())
+  , barriers_(arch.num_barriers(), 0)
+  , sharedmems_(arch.num_cores())
+  , processor_(processor)
+{
+  auto num_cores = arch.num_cores();
+
+  // scratch buffer for object names; ids are unsigned, hence %u
+  char sname[100];
+  snprintf(sname, sizeof(sname), "cluster%u-l2cache", cluster_id);
+  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
+    !L2_ENABLED,
+    log2ceil(L2_CACHE_SIZE), // C
+    log2ceil(MEM_BLOCK_SIZE), // B
+    // W is the log2 word size and A the log2 associativity, as in the L1
+    // configurations below. The L2 is accessed in units of L1 lines, and
+    // its way count must land in A rather than in W.
+    log2ceil(L1_LINE_SIZE), // W
+    log2ceil(L2_NUM_WAYS),  // A
+    XLEN,                   // address bits
+    L2_NUM_BANKS,           // number of banks
+    1,                      // number of ports
+    // NOTE(review): only inputs 0 (icaches) and 1 (dcaches) are bound
+    // below - confirm that 5 inputs is still intended.
+    5,                      // request size
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    L2_MSHR_SIZE,           // mshr
+    2,                      // pipeline latency
+  });
+
+  // the L2 memory side is the cluster's memory side
+  l2cache_->MemReqPort.bind(&this->mem_req_port);
+  this->mem_rsp_port.bind(&l2cache_->MemRspPort);
+
+  snprintf(sname, sizeof(sname), "cluster%u-icaches", cluster_id);
+  icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
+    !ICACHE_ENABLED,
+    log2ceil(ICACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // B
+    log2ceil(sizeof(uint32_t)), // W
+    log2ceil(ICACHE_NUM_WAYS),// A
+    XLEN,                   // address bits
+    1,                      // number of banks
+    1,                      // number of ports
+    1,                      // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    (uint8_t)arch.num_warps(), // mshr
+    2,                      // pipeline latency
+  });
+
+  // instruction caches use L2 input 0
+  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
+  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
+
+  snprintf(sname, sizeof(sname), "cluster%u-dcaches", cluster_id);
+  dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
+    !DCACHE_ENABLED,
+    log2ceil(DCACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // B
+    log2ceil(sizeof(Word)), // W
+    log2ceil(DCACHE_NUM_WAYS),// A
+    XLEN,                   // address bits
+    DCACHE_NUM_BANKS,       // number of banks
+    1,                      // number of ports
+    DCACHE_NUM_BANKS,       // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    DCACHE_MSHR_SIZE,       // mshr
+    4,                      // pipeline latency
+  });
+
+  // data caches use L2 input 1
+  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
+  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
+
+  ///////////////////////////////////////////////////////////////////////////
+
+  // create shared memory blocks (one per core)
+  for (uint32_t i = 0; i < num_cores; ++i) {
+    snprintf(sname, sizeof(sname), "cluster%u-shared_mem%u", cluster_id, i);
+    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
+      (1 << SMEM_LOG_SIZE),
+      sizeof(Word),
+      NUM_LSU_LANES,
+      NUM_LSU_LANES,
+      false
+    });
+  }
+
+  // create cores
+
+  for (uint32_t i = 0; i < num_cores; ++i) {
+    // core ids are global across clusters
+    uint32_t core_id = cluster_id * num_cores + i;
+    cores_.at(i) = Core::Create(core_id,
+                                this,
+                                arch,
+                                dcrs,
+                                sharedmems_.at(i));
+
+    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
+    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
+
+    // every LSU lane gets its own demux that sits between the core's data
+    // port and both the data cache and the core's shared memory
+    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
+      snprintf(sname, sizeof(sname), "cluster%u-smem_demux%u_%u", cluster_id, i, j);
+      auto smem_demux = SMemDemux::Create(sname);
+
+      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
+      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
+
+      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
+      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
+
+      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
+      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
+    }
+  }
+}
+
+// Nothing is released explicitly here; members are cleaned up by their own
+// destructors.
+Cluster::~Cluster() {
+  //--
+}
+
+// Clears every barrier mask so that no core is recorded as waiting.
+void Cluster::reset() {
+  for (size_t b = 0; b < barriers_.size(); ++b) {
+    barriers_[b].reset();
+  }
+}
+
+// Intentionally empty: the cluster does no work of its own in tick().
+void Cluster::tick() {
+  //--
+}
+
+// Attaches `ram` to every core of this cluster.
+void Cluster::attach_ram(RAM* ram) {
+  // iterate by reference so the Core::Ptr handle is not copied per iteration
+  for (const auto& core : cores_) {
+    core->attach_ram(ram);
+  }
+}
+
+// Reports whether at least one core of this cluster is still running.
+// Stops at the first running core.
+bool Cluster::running() const {
+  auto it = cores_.begin();
+  while (it != cores_.end()) {
+    if ((*it)->running()) {
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+// Queries every core for its exit status.
+//
+//   exitcode   - receives the bitwise OR of the exit codes of all cores that
+//                reported an exit; always written
+//   riscv_test - forwarded unchanged to Core::check_exit
+//
+// Returns true only when every core reported an exit. All cores are always
+// queried, even after one of them reports that it has not exited.
+bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
+  Word merged = 0;
+  bool all_exited = true;
+  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
+    Word core_code;
+    const bool exited = (*it)->check_exit(&core_code, riscv_test);
+    if (!exited) {
+      all_exited = false;
+      continue;
+    }
+    merged |= core_code;
+  }
+  *exitcode = merged;
+  return all_exited;
+}
+
+// Records that core `core_id` reached barrier `bar_id`. Once `count` cores
+// are recorded, every recorded core is resumed and the barrier is cleared.
+//
+//   bar_id  - index into the cluster's barrier table (bounds-checked by at())
+//   count   - number of cores that must arrive before the barrier opens
+//   core_id - global id of the arriving core
+void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
+  auto& barrier = barriers_.at(bar_id);
+
+  // core ids are global (cluster_id * num_cores + i); the barrier mask is
+  // indexed by the core's position inside this cluster
+  uint32_t local_core_id = core_id % cores_.size();
+  barrier.set(local_core_id);
+
+  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
+
+  if (barrier.count() != (size_t)count)
+    return;
+
+  // resume all suspended cores
+  for (uint32_t i = 0; i < cores_.size(); ++i) {
+    if (barrier.test(i)) {
+      // log the global core id so this line matches the "Suspend" trace
+      DP(3, "*** Resume core #" << cores_.at(i)->id() << " at barrier #" << bar_id);
+      cores_.at(i)->resume();
+    }
+  }
+  barrier.reset();
+}
+
+// Returns the processor pointer that was supplied to the constructor.
+ProcessorImpl* Cluster::processor() const {
+  return processor_;
+}
+
+// Collects the performance counters of the instruction caches, the data
+// caches and the L2 cache, plus the sum over all shared-memory blocks.
+Cluster::PerfStats Cluster::perf_stats() const {
+  Cluster::PerfStats perf;
+  perf.icache = icaches_->perf_stats();
+  perf.dcache = dcaches_->perf_stats();
+  perf.l2cache = l2cache_->perf_stats();
+
+  // iterate by reference so the SharedMem::Ptr handle is not copied per iteration
+  for (const auto& sharedmem : sharedmems_) {
+    perf.sharedmem += sharedmem->perf_stats();
+  }
+
+  return perf;
+}
--- a/sim/simx/cluster.h
+++ b/sim/simx/cluster.h
@@ -0,0 +1,86 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "dcrs.h"
+#include "arch.h"
+#include "cache_cluster.h"
+#include "shared_mem.h"
+#include "core.h"
+#include "constants.h"
+
+namespace vortex {
+
+class ProcessorImpl;
+
+// A cluster groups a set of cores with their instruction/data cache groups,
+// per-core shared memories and an L2 cache, and exposes one memory
+// request/response port pair to the rest of the system.
+class Cluster : public SimObject<Cluster> {
+public:
+  // Cache and shared-memory counters returned by perf_stats().
+  struct PerfStats {
+    CacheSim::PerfStats   icache;
+    CacheSim::PerfStats   dcache;
+    SharedMem::PerfStats  sharedmem;
+    CacheSim::PerfStats   l2cache;
+
+    // Adds each counter group of `rhs` to the matching group of this object.
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->icache      += rhs.icache;
+      this->dcache      += rhs.dcache;
+      this->sharedmem   += rhs.sharedmem;
+      this->l2cache     += rhs.l2cache;
+      return *this;
+    }
+  };
+
+  // Memory-side ports; the constructor binds them to the L2 cache's
+  // memory request/response ports.
+  SimPort<MemReq> mem_req_port;
+  SimPort<MemRsp> mem_rsp_port;
+
+  // Creates the caches, shared memories and cores of cluster `cluster_id`
+  // and binds their ports. `processor` is stored as a raw pointer.
+  Cluster(const SimContext& ctx,
+          uint32_t cluster_id,
+          ProcessorImpl* processor,
+          const Arch &arch,
+          const DCRS &dcrs);
+
+  ~Cluster();
+
+  // Clears all barrier masks.
+  void reset();
+
+  void tick();
+
+  // Attaches `ram` to every core.
+  void attach_ram(RAM* ram);
+
+  // True while at least one core is running.
+  bool running() const;
+
+  // True when every core reported an exit; `*exitcode` receives the OR of
+  // the exit codes of the cores that did.
+  bool check_exit(Word* exitcode, bool riscv_test) const;
+
+  // Registers core `core_id` at barrier `bar_id`; resumes all registered
+  // cores once `count` of them have arrived.
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
+
+  ProcessorImpl* processor() const;
+
+  Cluster::PerfStats perf_stats() const;
+
+private:
+  // Declaration order matters: it is the member initialization order used
+  // by the constructor's initializer list.
+  uint32_t                     cluster_id_;
+  std::vector<Core::Ptr>       cores_;
+  std::vector<CoreMask>        barriers_;   // one arrival mask per barrier
+  CacheSim::Ptr                l2cache_;
+  CacheCluster::Ptr            icaches_;
+  CacheCluster::Ptr            dcaches_;
+  std::vector<SharedMem::Ptr>  sharedmems_; // one block per core
+  ProcessorImpl*               processor_;
+};
+
+} // namespace vortex
--- a/sim/simx/constants.h
+++ b/sim/simx/constants.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #ifndef RAM_PAGE_SIZE
@@ -10,14 +23,4 @@

 #ifndef MEMORY_BANKS
 #define MEMORY_BANKS 2
-#endif
-
-namespace vortex {
-
-enum Constants {
-
-    SMEM_BANK_OFFSET = log2ceil(sizeof(uint32_t)) + log2ceil(STACK_SIZE / sizeof(uint32_t)),
-
-};
-
-}
+#endif
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <string>
@@ -11,101 +24,104 @@
 #include <simobject.h>
 #include "debug.h"
 #include "types.h"
-#include "archdef.h"
+#include "arch.h"
 #include "decode.h"
 #include "mem.h"
 #include "warp.h"
 #include "pipeline.h"
-#include "cache.h"
-#include "sharedmem.h"
+#include "cache_sim.h"
+#include "shared_mem.h"
 #include "ibuffer.h"
 #include "scoreboard.h"
-#include "exeunit.h"
-#include "tex_unit.h"
+#include "operand.h"
+#include "dispatcher.h"
+#include "exe_unit.h"
+#include "dcrs.h"

 namespace vortex {

+class Cluster;
+
 class Core : public SimObject<Core> {
 public:
  struct PerfStats {
+    // Per-core counters; the default constructor below sets every one to zero.
+    uint64_t cycles;
    uint64_t instrs;
    uint64_t ibuf_stalls;
    uint64_t scrb_stalls;
    uint64_t alu_stalls;
    uint64_t lsu_stalls;
-    uint64_t csr_stalls;
    uint64_t fpu_stalls;
-    uint64_t gpu_stalls;
+    uint64_t sfu_stalls;
+    uint64_t ifetches;
    uint64_t loads;
    uint64_t stores;
-    uint64_t branches;
-    uint64_t mem_reads;
-    uint64_t mem_writes;
-    uint64_t mem_latency;
-    uint64_t tex_reads;
-    uint64_t tex_latency;
+    uint64_t ifetch_latency;
+    uint64_t load_latency;

    PerfStats() 
-      : instrs(0)
+      : cycles(0)
+      , instrs(0)
      , ibuf_stalls(0)
      , scrb_stalls(0)
      , alu_stalls(0)
      , lsu_stalls(0)
-      , csr_stalls(0)
      , fpu_stalls(0)
-      , gpu_stalls(0)
+      , sfu_stalls(0)
+      , ifetches(0)
      , loads(0)
      , stores(0)
-      , branches(0)
-      , mem_reads(0)
-      , mem_writes(0)
-      , mem_latency(0)
-      , tex_reads(0)
-      , tex_latency(0)
+      , ifetch_latency(0)
+      , load_latency(0)
    {}
  };

-  SimPort<MemRsp> MemRspPort;
-  SimPort<MemReq> MemReqPort;
+  std::vector<SimPort<MemReq>> icache_req_ports;
+  std::vector<SimPort<MemRsp>> icache_rsp_ports;
+
+  std::vector<SimPort<MemReq>> dcache_req_ports;
+  std::vector<SimPort<MemRsp>> dcache_rsp_ports;
+
+  Core(const SimContext& ctx, 
+       uint32_t core_id, 
+       Cluster* cluster,
+       const Arch &arch, 
+       const DCRS &dcrs,
+       SharedMem::Ptr  sharedmem);

-  Core(const SimContext& ctx, const ArchDef &arch, uint32_t id);
  ~Core();

-  void attach_ram(RAM* ram);
-
-  bool running() const;
-
  void reset();

  void tick();

+  void attach_ram(RAM* ram);
+
+  bool running() const;
+
+  void resume();
+
  uint32_t id() const {
-    return id_;
+    return core_id_;
  }

-  const Decoder& decoder() {
-    return decoder_;
-  }
-
-  const ArchDef& arch() const {
+  const Arch& arch() const {
    return arch_;
  }

-  const PerfStats& perf_stats() const {
-    return perf_stats_;
-  } 
-
-  uint32_t getIRegValue(int reg) const {
-    return warps_.at(0)->getIRegValue(reg);
+  const DCRS& dcrs() const {
+    return dcrs_;
  }

  uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
  
  void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);

-  WarpMask wspawn(uint32_t num_warps, uint32_t nextPC);
+  void wspawn(uint32_t num_warps, Word nextPC);
  
-  WarpMask barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
+
+  AddrType get_addr_type(uint64_t addr);

  void icache_read(void* data, uint64_t addr, uint32_t size);

@@ -113,19 +129,22 @@ public:

  void dcache_write(const void* data, uint64_t addr, uint32_t size);

-  uint32_t tex_read(uint32_t unit, uint32_t lod, uint32_t u, uint32_t v, std::vector<mem_addr_size_t>* mem_addrs);
+  void dcache_amo_reserve(uint64_t addr);
+
+  bool dcache_amo_check(uint64_t addr);

  void trigger_ecall();

  void trigger_ebreak();

-  bool check_exit() const;
+  bool check_exit(Word* exitcode, bool riscv_test) const;

 private:

  void schedule();
  void fetch();
  void decode();
+  void issue();
  void execute();
  void commit();
  
@@ -133,49 +152,51 @@ private:

  void cout_flush();

-  uint32_t id_;
-  const ArchDef arch_;
+  uint32_t core_id_;
+  const Arch& arch_;
+  const DCRS &dcrs_;
+  
  const Decoder decoder_;
  MemoryUnit mmu_;
-  RAM smem_;
-  std::vector<TexUnit> tex_units_;

  std::vector<std::shared_ptr<Warp>> warps_;  
-  std::vector<WarpMask> barriers_;  
-  std::vector<uint32_t> csrs_;
+  std::vector<WarpMask> barriers_;
  std::vector<Byte> fcsrs_;
  std::vector<IBuffer> ibuffers_;
  Scoreboard scoreboard_;
+  std::vector<Operand::Ptr> operands_;
+  std::vector<Dispatcher::Ptr> dispatchers_;
  std::vector<ExeUnit::Ptr> exe_units_;
-  Cache::Ptr icache_;
-  Cache::Ptr dcache_;
-  SharedMem::Ptr shared_mem_;
-  Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
-  std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
+  SharedMem::Ptr sharedmem_;

  PipelineLatch fetch_latch_;
  PipelineLatch decode_latch_;
  
  HashTable<pipeline_trace_t*> pending_icache_;
+  std::vector<pipeline_trace_t*> committed_traces_;
  WarpMask active_warps_;
  WarpMask stalled_warps_;
-  uint32_t last_schedule_wid_;
  uint64_t issued_instrs_;
  uint64_t committed_instrs_;
-  uint32_t csr_tex_unit_;
-  bool ecall_;
-  bool ebreak_;
+  bool exited_;
+
+  uint64_t pending_ifetches_;

  std::unordered_map<int, std::stringstream> print_bufs_;
+
+  std::vector<std::vector<CSRs>> csrs_;
  
  PerfStats perf_stats_;
-  uint64_t perf_mem_pending_reads_;
+  
+  Cluster* cluster_;

+  uint32_t commit_exe_;
+
+  friend class Warp;
  friend class LsuUnit;
  friend class AluUnit;
-  friend class CsrUnit;
  friend class FpuUnit;
-  friend class GpuUnit;
+  friend class SfuUnit;
 };

-} // namespace vortex
+} // namespace vortex
--- a/sim/simx/dcrs.cpp
+++ b/sim/simx/dcrs.cpp
@@ -0,0 +1,28 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dcrs.h"
+#include <iostream>
+
+using namespace vortex;
+
+// Writes `value` to the device configuration register at `addr`.
+// Addresses in [VX_DCR_BASE_STATE_BEGIN, VX_DCR_BASE_STATE_END) go to the
+// base register group; any other address is reported on stdout and the
+// process is aborted.
+void DCRS::write(uint32_t addr, uint32_t value) {
+  const bool in_base_range = (addr >= VX_DCR_BASE_STATE_BEGIN)
+                          && (addr < VX_DCR_BASE_STATE_END);
+  if (!in_base_range) {
+    std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl;
+    std::abort();
+  }
+  base_dcrs.write(addr, value);
+}
--- a/sim/simx/dcrs.h
+++ b/sim/simx/dcrs.h
@@ -0,0 +1,45 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <util.h>
+#include <VX_types.h>
+#include <array>
+
+namespace vortex {
+
+// Storage for the base device configuration registers. Registers are
+// selected by mapping an address through VX_DCR_BASE_STATE(addr).
+class BaseDCRS {
+public:
+    // Returns the value of the register selected by `addr`.
+    // std::array::at throws std::out_of_range when the mapped index is not
+    // below VX_DCR_BASE_STATE_COUNT.
+    uint32_t read(uint32_t addr) const {
+        uint32_t state = VX_DCR_BASE_STATE(addr);
+        return states_.at(state);
+    }
+
+    // Stores `value` in the register selected by `addr`; same bounds
+    // checking as read().
+    void write(uint32_t addr, uint32_t value) {
+        uint32_t state = VX_DCR_BASE_STATE(addr);
+        states_.at(state) = value;
+    }
+
+private:
+    // Value-initialized so that a register read before its first write
+    // yields 0 instead of an indeterminate value.
+    std::array<uint32_t, VX_DCR_BASE_STATE_COUNT> states_{};
+};
+
+// Aggregates the device configuration register groups; the only group
+// declared here is the base group.
+class DCRS {
+public:
+    // Writes `value` to the register at `addr` (defined out of line).
+    void write(uint32_t addr, uint32_t value);
+    
+    BaseDCRS base_dcrs;
+};
+
+}
--- a/sim/simx/debug.h
+++ b/sim/simx/debug.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #ifndef DEBUG_LEVEL
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <string>
 #include <stdlib.h>
@@ -9,41 +22,36 @@
 #include "debug.h"
 #include "types.h"
 #include "decode.h"
-#include "archdef.h"
+#include "arch.h"
 #include "instr.h"

 using namespace vortex;

-struct InstTableEntry_t {
-  bool controlFlow;
-  InstType iType;
-};
-
-static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable = {
-  {Opcode::NOP,        {false, InstType::N_TYPE}},
-  {Opcode::R_INST,     {false, InstType::R_TYPE}},
-  {Opcode::L_INST,     {false, InstType::I_TYPE}},
-  {Opcode::I_INST,     {false, InstType::I_TYPE}},
-  {Opcode::S_INST,     {false, InstType::S_TYPE}},
-  {Opcode::B_INST,     {true , InstType::B_TYPE}},
-  {Opcode::LUI_INST,   {false, InstType::U_TYPE}},
-  {Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
-  {Opcode::JAL_INST,   {true , InstType::J_TYPE}},
-  {Opcode::JALR_INST,  {true , InstType::I_TYPE}},
-  {Opcode::SYS_INST,   {true , InstType::I_TYPE}},
-  {Opcode::FENCE,      {true , InstType::I_TYPE}},
-  {Opcode::FL,         {false, InstType::I_TYPE}},
-  {Opcode::FS,         {false, InstType::S_TYPE}},
-  {Opcode::FCI,        {false, InstType::R_TYPE}}, 
-  {Opcode::FMADD,      {false, InstType::R4_TYPE}},
-  {Opcode::FMSUB,      {false, InstType::R4_TYPE}},
-  {Opcode::FMNMADD,    {false, InstType::R4_TYPE}},
-  {Opcode::FMNMSUB,    {false, InstType::R4_TYPE}},  
-  {Opcode::VSET,       {false, InstType::V_TYPE}}, 
-  {Opcode::GPGPU,      {false, InstType::R_TYPE}},
-  {Opcode::GPU,        {false, InstType::R4_TYPE}},
-  {Opcode::R_INST_W,   {false, InstType::R_TYPE}},
-  {Opcode::I_INST_W,   {false, InstType::I_TYPE}},
+// Maps each supported opcode to its instruction format (InstType).
+// Opcodes that are absent from this table have no format entry.
+static const std::unordered_map<Opcode, InstType> sc_instTable = {
+  {Opcode::R_INST,     InstType::R_TYPE},
+  {Opcode::L_INST,     InstType::I_TYPE},
+  {Opcode::I_INST,     InstType::I_TYPE},
+  {Opcode::S_INST,     InstType::S_TYPE},
+  {Opcode::B_INST,     InstType::B_TYPE},
+  {Opcode::LUI_INST,   InstType::U_TYPE},
+  {Opcode::AUIPC_INST, InstType::U_TYPE},
+  {Opcode::JAL_INST,   InstType::J_TYPE},
+  {Opcode::JALR_INST,  InstType::I_TYPE},
+  {Opcode::SYS_INST,   InstType::I_TYPE},
+  {Opcode::FENCE,      InstType::I_TYPE},
+  {Opcode::AMO,        InstType::R_TYPE},
+  {Opcode::FL,         InstType::I_TYPE},
+  {Opcode::FS,         InstType::S_TYPE},
+  {Opcode::FCI,        InstType::R_TYPE}, 
+  {Opcode::FMADD,      InstType::R4_TYPE},
+  {Opcode::FMSUB,      InstType::R4_TYPE},
+  {Opcode::FMNMADD,    InstType::R4_TYPE},
+  {Opcode::FMNMSUB,    InstType::R4_TYPE},  
+  {Opcode::VSET,       InstType::V_TYPE},
+  {Opcode::EXT1,       InstType::R_TYPE},
+  {Opcode::EXT2,       InstType::R4_TYPE},
+  {Opcode::R_INST_W,   InstType::R_TYPE},
+  {Opcode::I_INST_W,   InstType::I_TYPE},
 };

 enum Constants {
@@ -58,6 +66,8 @@ enum Constants {
  width_i_imm = 12,
  width_j_imm = 20,
  width_v_imm = 11,
+  width_aq    = 1,
+  width_rl    = 1,

  shift_opcode= 0,
  shift_rd    = width_opcode,
@@ -72,15 +82,15 @@ enum Constants {
  shift_func6 = shift_func7 + width_vmask,
  shift_vset  = shift_func7 + width_func6,

-  mask_opcode = (1<<width_opcode)-1,  
-  mask_reg    = (1<<width_reg)-1,
-  mask_func2  = (1<<width_func2)-1,
-  mask_func3  = (1<<width_func3)-1,
-  mask_func6  = (1<<width_func6)-1,
-  mask_func7  = (1<<width_func7)-1,
-  mask_i_imm  = (1<<width_i_imm)-1,
-  mask_j_imm  = (1<<width_j_imm)-1,
-  mask_v_imm  = (1<<width_v_imm)-1,
+  mask_opcode = (1 << width_opcode) - 1,  
+  mask_reg    = (1 << width_reg)   - 1,
+  mask_func2  = (1 << width_func2) - 1,
+  mask_func3  = (1 << width_func3) - 1,
+  mask_func6  = (1 << width_func6) - 1,
+  mask_func7  = (1 << width_func7) - 1,
+  mask_i_imm  = (1 << width_i_imm) - 1,
+  mask_j_imm  = (1 << width_j_imm) - 1,
+  mask_v_imm  = (1 << width_v_imm) - 1,
 };

 static const char* op_string(const Instr &instr) {
@@ -92,7 +102,6 @@ static const char* op_string(const Instr &instr) {
  auto imm    = instr.getImm();

  switch (opcode) {
-  case Opcode::NOP:        return "NOP";
  case Opcode::LUI_INST:   return "LUI";
  case Opcode::AUIPC_INST: return "AUIPC";
  case Opcode::R_INST:
@@ -116,7 +125,7 @@ static const char* op_string(const Instr &instr) {
      case 2: return "SLT";
      case 3: return "SLTU";
      case 4: return "XOR";
-      case 5: return func7 ? "SRA" : "SRL";
+      case 5: return (func7 & 0x20) ? "SRA" : "SRL";
      case 6: return "OR";
      case 7: return "AND";
      default:
@@ -130,7 +139,7 @@ static const char* op_string(const Instr &instr) {
    case 2: return "SLTI";
    case 3: return "SLTIU";
    case 4: return "XORI";
-    case 5: return func7 ? "SRAI" : "SRLI";
+    case 5: return (func7 & 0x20) ? "SRAI" : "SRLI";
    case 6: return "ORI";
    case 7: return "ANDI";
    default:
@@ -151,8 +160,8 @@ static const char* op_string(const Instr &instr) {
  case Opcode::JALR_INST:  return "JALR";
  case Opcode::L_INST:
    switch (func3) {
-    case 0: return "LBI";
-    case 1: return "LHI";
+    case 0: return "LB";
+    case 1: return "LH";
    case 2: return "LW";
    case 3: return "LD";
    case 4: return "LBU";
@@ -192,11 +201,11 @@ static const char* op_string(const Instr &instr) {
    }
  case Opcode::I_INST_W:
    switch (func3) {
-      case 0: return "ADDIW";
-      case 1: return "SLLIW";
-      case 5: return func7 ? "SRAIW" : "SRLIW";
-      default:
-        std::abort();
+    case 0: return "ADDIW";
+    case 1: return "SLLIW";
+    case 5: return func7 ? "SRAIW" : "SRLIW";
+    default:
+      std::abort();
    }
  case Opcode::SYS_INST: 
    switch (func3) {
@@ -222,20 +231,59 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FENCE: return "FENCE";
  case Opcode::FL: 
    switch (func3) {
-      case 0x1: return "VL";
-      case 0x2: return "FLW";
-      case 0x3: return "FLD";
-      default: 
-        std::abort();
+    case 0x1: return "VL";
+    case 0x2: return "FLW";
+    case 0x3: return "FLD";
+    default: 
+      std::abort();
    }
  case Opcode::FS: 
    switch (func3) {
-      case 0x1: return "VS";
-      case 0x2: return "FSW";
-      case 0x3: return "FSD";
+    case 0x1: return "VS";
+    case 0x2: return "FSW";
+    case 0x3: return "FSD";
+    default: 
+      std::abort();
+    }
+  case Opcode::AMO: {
+    auto amo_type = func7 >> 2;
+    switch (func3) {
+      case 0x2:
+        switch (amo_type) {
+        case 0x00: return "AMOADD.W";
+        case 0x01: return "AMOSWAP.W";
+        case 0x02: return "LR.W";
+        case 0x03: return "SC.W";
+        case 0x04: return "AMOXOR.W";
+        case 0x08: return "AMOOR.W";
+        case 0x0c: return "AMOAND.W";
+        case 0x10: return "AMOMIN.W";
+        case 0x14: return "AMOMAX.W";
+        case 0x18: return "AMOMINU.W";
+        case 0x1c: return "AMOMAXU.W";
+        default:
+          std::abort();
+        }
+      case 0x3:
+        switch (amo_type) {
+        case 0x00: return "AMOADD.D";
+        case 0x01: return "AMOSWAP.D";
+        case 0x02: return "LR.D";
+        case 0x03: return "SC.D";
+        case 0x04: return "AMOXOR.D";
+        case 0x08: return "AMOOR.D";
+        case 0x0c: return "AMOAND.D";
+        case 0x10: return "AMOMIN.D";
+        case 0x14: return "AMOMAX.D";
+        case 0x18: return "AMOMINU.D";
+        case 0x1c: return "AMOMAXU.D";
+        default:
+          std::abort();
+        }
      default: 
        std::abort();
    }
+  }
  case Opcode::FCI: 
    switch (func7) {
    case 0x00: return "FADD.S";
@@ -332,9 +380,9 @@ static const char* op_string(const Instr &instr) {
      default:
        std::abort();
      }
-    case 0x70: return func3 ? "FCLASS.S" : "FMV.X.W";
+    case 0x70: return func3 ? "FCLASS.S" : "FMV.X.S";
    case 0x71: return func3 ? "FCLASS.D" : "FMV.X.D";
-    case 0x78: return "FMV.W.X";
+    case 0x78: return "FMV.S.X";
    case 0x79: return "FMV.D.X";
    default:
      std::abort();
@@ -344,23 +392,27 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
  case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
  case Opcode::VSET:    return "VSET";
-  case Opcode::GPGPU:
-    switch (func3) {            
-    case 0: return "TMC";
-    case 1: return "WSPAWN";
-    case 2: return "SPLIT";
-    case 3: return "JOIN";
-    case 4: return "BAR";
-    case 5: return "PREFETCH";
+  case Opcode::EXT1:
+    switch (func7) {
+    case 0:
+      switch (func3) {            
+      case 0: return "TMC";
+      case 1: return "WSPAWN";
+      case 2: return "SPLIT";
+      case 3: return "JOIN";
+      case 4: return "BAR";
+      case 5: return "PRED";
+      default:
+        std::abort();
+      }
    default:
      std::abort();
    }
-  case Opcode::GPU:
+  case Opcode::EXT2:
    switch (func3) {
-    case 0: return "TEX";
    case 1: {
      switch (func2) {
-      case 0: return "CMOV";
+      case 0: return "CMOV"; 
      default:
        std::abort();
      }
@@ -375,43 +427,36 @@ static const char* op_string(const Instr &instr) {

 namespace vortex {
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {  
+  // Prints the mnemonic followed by the operands: destination register,
+  // source registers whose type is not RegType::None, then the immediate
+  // in hex. The first operand is preceded by a single space and each
+  // later one by ", ". std::dec/std::hex are sticky, so the stream keeps
+  // whichever base was selected last.
-  auto opcode = instr.getOpcode();    
-  auto func2  = instr.getFunc2();
+  auto opcode = instr.getOpcode();
  auto func3  = instr.getFunc3();

-  os << op_string(instr) << ": ";
-
-  if (opcode == S_INST 
-   || opcode == FS) {     
-     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
-     os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
-  } else 
-  if (opcode == L_INST 
-   || opcode == FL) {     
-     os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
-     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
-  } else {
-    if (instr.getRDType() != RegType::None) {
-      os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
-    }
-    uint32_t i = 0;
-    for (; i < instr.getNRSrc(); ++i) {    
-      if (i) os << ", ";
-      os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
-    }    
-    if (instr.hasImm()) {
-      if (i) os << ", ";
-      os << "imm=0x" << std::hex << instr.getImm();
-    }
-    if (opcode == GPU && func3 == 0) {
-      os << ", unit=" << std::dec << func2;
-    }
+  os << op_string(instr);
+  
+  int sep = 0;
+  if (instr.getRDType() != RegType::None) {
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << instr.getRDType() << std::dec << instr.getRDest();
+  }
+  for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {    
+    if (instr.getRSType(i) == RegType::None)
+      continue;
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
+  }
+  if (instr.hasImm()) {
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << "0x" << std::hex << instr.getImm();
+  }
+  if (opcode == Opcode::SYS_INST && func3 >= 5) {
+    // CSRs with immediate values
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << "0x" << std::hex << instr.getRSrc(0);
  }
  return os;
 }
 }

-Decoder::Decoder(const ArchDef&) {}
+// The constructor ignores its Arch argument (the parameter is unnamed) and
+// has an empty body.
+Decoder::Decoder(const Arch&) {}

 std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {  
  auto instr = std::make_shared<Instr>();
@@ -434,7 +479,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    return nullptr;
  }

-  auto iType = op_it->second.iType;
+  auto iType = op_it->second;
  if (op == Opcode::FL || op == Opcode::FS) { 
    if (func3 != 0x2 && func3 != 0x3) {
      iType = InstType::V_TYPE;
@@ -442,57 +487,88 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  }

  switch (iType) {
-  case InstType::N_TYPE:
-    break;
-
  case InstType::R_TYPE:
-    if (op == Opcode::FCI) {
-      switch (func7) {      
+    switch (op) {
+    case Opcode::FCI:
+      switch (func7) {  
+      case 0x2c: // FSQRT.S
+      case 0x2d: // FSQRT.D
+        instr->setDestReg(rd, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
+        break;    
      case 0x50: // FLE.S, FLT.S, FEQ.S
      case 0x51: // FLE.D, FLT.D, FEQ.D
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::Float);
        break;
      case 0x60: // FCVT.W.D, FCVT.WU.D, FCVT.L.D, FCVT.LU.D
      case 0x61: // FCVT.WU.S, FCVT.W.S, FCVT.L.S, FCVT.LU.S
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::None);
        break;
      case 0x68: // FCVT.S.W, FCVT.S.WU, FCVT.S.L, FCVT.S.LU
      case 0x69: // FCVT.D.W, FCVT.D.WU, FCVT.D.L, FCVT.D.LU
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Integer);
-        instr->setSrcReg(rs2, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Integer);
+        instr->addSrcReg(rs2, RegType::None);
        break;
-      case 0x70: // FCLASS.S, FMV.X.W
+      case 0x70: // FCLASS.S, FMV.X.S
      case 0x71: // FCLASS.D, FMV.X.D        
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
        break;
-      case 0x78: // FMV.W.X
+      case 0x78: // FMV.S.X
      case 0x79: // FMV.D.X        
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Integer);
        break;
      default:
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Float);        
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::Float);        
        break;
      }
-    } else {
+      break;
+    case Opcode::EXT1:
+      switch (func7) {
+      case 0:
+        switch (func3) {         
+        case 0: // TMC
+        case 3: // JOIN
+          instr->addSrcReg(rs1, RegType::Integer);
+          break;
+        case 1: // WSPAWN        
+        case 4: // BAR
+        case 5: // PRED
+          instr->addSrcReg(rs1, RegType::Integer);
+          instr->addSrcReg(rs2, RegType::Integer);
+          break;
+        case 2: // SPLIT
+          instr->setDestReg(rd, RegType::Integer);
+          instr->addSrcReg(rs1, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      default:
+        std::abort();
+      }
+      break;
+    default:
      instr->setDestReg(rd, RegType::Integer);
-      instr->setSrcReg(rs1, RegType::Integer);
-      instr->setSrcReg(rs2, RegType::Integer);
+      instr->addSrcReg(rs1, RegType::Integer);
+      instr->addSrcReg(rs2, RegType::Integer);
+      break;
    }
    instr->setFunc3(func3);
    instr->setFunc7(func7);
    break;

  case InstType::I_TYPE: {
-    instr->setSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
    if (op == Opcode::FL) {
      instr->setDestReg(rd, RegType::Float);      
    } else {
@@ -503,15 +579,23 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    switch (op) {
    case Opcode::SYS_INST:
      if (func3 != 0) {
-        // RV32I: CSR*
-        instr->setDestReg(rd, RegType::Integer);
-      }
+        // RV32I: CSR
+        if (func3 >= 5) {
+          // rs1 holds zimm
+          instr->setSrcReg(0, rs1, RegType::None);
+        }        
+      } else {        
+        instr->setDestReg(rd, RegType::None);
+        instr->setSrcReg(0, rs1, RegType::None);
+      }      
      // uint12
      instr->setImm(code >> shift_rs2);
      break;
    case Opcode::FENCE:
      // uint12
      instr->setImm(code >> shift_rs2);
+      instr->setDestReg(rd, RegType::None);
+      instr->setSrcReg(0, rs1, RegType::None);
      break;
    case Opcode::I_INST:
    case Opcode::I_INST_W:
@@ -538,11 +622,11 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    }
  } break;
  case InstType::S_TYPE: {    
-    instr->setSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
    if (op == Opcode::FS) {
-      instr->setSrcReg(rs2, RegType::Float);
+      instr->addSrcReg(rs2, RegType::Float);
    } else {
-      instr->setSrcReg(rs2, RegType::Integer);
+      instr->addSrcReg(rs2, RegType::Integer);
    }
    instr->setFunc3(func3);
    auto imm = (func7 << width_reg) | rd;
@@ -550,8 +634,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  } break;

  case InstType::B_TYPE: {
-    instr->setSrcReg(rs1, RegType::Integer);
-    instr->setSrcReg(rs2, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs2, RegType::Integer);
    instr->setFunc3(func3);
    auto bit_11   = rd & 0x1;
    auto bits_4_1 = rd >> 1;
@@ -581,8 +665,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  case InstType::V_TYPE:
    switch (op) {
    case Opcode::VSET: {
-      instr->setDestVReg(rd);
-      instr->setSrcVReg(rs1);
+      instr->setDestReg(rd, RegType::Vector);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setFunc3(func3);
      if (func3 == 7) {
        instr->setImm(!(code >> shift_vset));
@@ -593,20 +677,20 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
          instr->setVediv((immed >> 4) & 0x3);
          instr->setVsew((immed >> 2) & 0x3);
        } else {
-          instr->setSrcVReg(rs2);
+          instr->addSrcReg(rs2, RegType::Vector);
        }
      } else {
-        instr->setSrcVReg(rs2);
+        instr->addSrcReg(rs2, RegType::Vector);
        instr->setVmask((code >> shift_func7) & 0x1);
        instr->setFunc6(func6);
      }
    } break;

    case Opcode::FL:
-      instr->setDestVReg(rd);
-      instr->setSrcVReg(rs1);
+      instr->setDestReg(rd, RegType::Vector);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setVlsWidth(func3);
-      instr->setSrcVReg(rs2);
+      instr->addSrcReg(rs2, RegType::Vector);
      instr->setVmask(code >> shift_func7);
      instr->setVmop((code >> shift_vmop) & mask_func3);
      instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -614,9 +698,9 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {

    case Opcode::FS:
      instr->setVs3(rd);
-      instr->setSrcVReg(rs1);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setVlsWidth(func3);
-      instr->setSrcVReg(rs2);
+      instr->addSrcReg(rs2, RegType::Vector);
      instr->setVmask(code >> shift_func7);
      instr->setVmop((code >> shift_vmop) & mask_func3);
      instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -627,16 +711,28 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    }
    break;
  case R4_TYPE:
-    if (op == Opcode::GPU) {
-      instr->setDestReg(rd, RegType::Integer);
-      instr->setSrcReg(rs1, RegType::Integer);
-      instr->setSrcReg(rs2, RegType::Integer);
-      instr->setSrcReg(rs3, RegType::Integer);
+    if (op == Opcode::EXT2) {
+      switch (func3) {
+      case 1:
+        switch (func2) {
+        case 0: // CMOV
+          instr->setDestReg(rd, RegType::Integer);
+          instr->addSrcReg(rs1, RegType::Integer);
+          instr->addSrcReg(rs2, RegType::Integer);
+          instr->addSrcReg(rs3, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      default:
+        std::abort();
+      }
    } else {
      instr->setDestReg(rd, RegType::Float);
-      instr->setSrcReg(rs1, RegType::Float);
-      instr->setSrcReg(rs2, RegType::Float);
-      instr->setSrcReg(rs3, RegType::Float);
+      instr->addSrcReg(rs1, RegType::Float);
+      instr->addSrcReg(rs2, RegType::Float);
+      instr->addSrcReg(rs3, RegType::Float);
    }
    instr->setFunc2(func2);
    instr->setFunc3(func3);
--- a/sim/simx/decode.h
+++ b/sim/simx/decode.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <vector>
@@ -5,12 +18,12 @@

 namespace vortex {

-class ArchDef;
+class Arch;
 class Instr;

 class Decoder {
 public:
-  Decoder(const ArchDef &);    
+  Decoder(const Arch &);    
  
  std::shared_ptr<Instr> decode(uint32_t code) const;
 };
--- a/sim/simx/dispatcher.h
+++ b/sim/simx/dispatcher.h
@@ -0,0 +1,141 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+class Dispatcher : public SimObject<Dispatcher> { // breaks issued traces into num_lanes-wide packets, servicing one batch of issue slots at a time
+public:
+    std::vector<SimPort<pipeline_trace_t*>> Outputs; // one port per issue slot; receives the dispatched packets
+
+    Dispatcher(const SimContext& ctx, const Arch& arch, uint32_t buf_size, uint32_t block_size, uint32_t num_lanes) 
+        : SimObject<Dispatcher>(ctx, "Dispatcher") 
+        , Outputs(ISSUE_WIDTH, this)
+        , Inputs_(ISSUE_WIDTH, this)
+        , arch_(arch)
+        , queues_(ISSUE_WIDTH, std::queue<pipeline_trace_t*>())
+        , buf_size_(buf_size)        // max entries push() accepts per issue queue
+        , block_size_(block_size)        // number of issue slots serviced together in one batch
+        , num_lanes_(num_lanes)        // threads carried by one packet
+        , batch_count_(ISSUE_WIDTH / block_size) // number of batches the issue slots are divided into
+        , pid_count_(arch.num_threads() / num_lanes) // packets needed to cover all threads of one trace
+        , batch_idx_(0)
+        , start_p_(block_size, 0) // per-slot packet index to resume scanning from; -1 marks the slot as done for this batch
+    {}
+    
+    virtual ~Dispatcher() {}
+
+    virtual void reset() { // restart at batch 0 with every slot scanning from packet 0; queues_ and Inputs_ are not touched here
+        batch_idx_ = 0;
+        for (uint32_t b = 0; b < block_size_; ++b) {
+            start_p_.at(b) = 0;
+        }
+    }
+
+    virtual void tick() { // per call: stage queued traces into Inputs_, then dispatch packets for the current batch
+        for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { // stage 1: move at most one trace per issue queue into its input port
+            auto& queue = queues_.at(i);
+            if (queue.empty())
+                continue;
+            auto trace = queue.front();
+            Inputs_.at(i).send(trace, 1); // NOTE(review): second argument is presumably a delay in cycles - confirm against SimPort
+            queue.pop();
+        }
+
+        uint32_t block_sent = 0; // slots of the current batch that need no further service
+        for (uint32_t b = 0; b < block_size_; ++b) { // stage 2: service each slot of the current batch
+            uint32_t i = batch_idx_ * block_size_ + b; // absolute issue-slot index
+            auto& input = Inputs_.at(i);            
+            if (input.empty()) { // nothing pending: the slot counts as done
+                ++block_sent;
+                continue;
+            }
+            auto& output = Outputs.at(i);
+            auto trace = input.front();
+            if (pid_count_ != 1) { // trace spans several packets: emit one packet per call
+                auto start_p = start_p_.at(b);
+                if (start_p == -1) { // last packet already sent; hold until the whole batch completes
+                    ++block_sent;
+                    continue;       
+                }             
+                int start(-1), end(-1); // first / last active thread index at or after the resume point
+                for (uint32_t j = start_p * num_lanes_, n = arch_.num_threads(); j < n; ++j) {
+                    if (!trace->tmask.test(j))
+                        continue;
+                    if (start == -1)
+                        start = j;
+                    end = j;
+                }                
+                start /= num_lanes_; // thread index -> packet index; NOTE(review): if no thread was active both stay -1 and this divides in unsigned arithmetic - presumably tmask is never empty here, confirm
+                end /= num_lanes_;
+                auto new_trace = new pipeline_trace_t(*trace); // packet is a heap copy of the trace, restricted below to one lane group
+                new_trace->tmask.reset();
+                for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
+                    new_trace->tmask[j] = trace->tmask[j];
+                }                
+                new_trace->pid = start;
+                new_trace->sop = (start_p == 0); // flagged as first packet only when the scan began at packet 0
+                if (start == end) { // no active thread beyond this packet: it is the last one
+                    new_trace->eop = 1;
+                    start_p_.at(b) = -1;
+                    input.pop();
+                    ++block_sent;
+                    delete trace; // the original has been fully replaced by its packet copies
+                } else {
+                    new_trace->eop = 0;
+                    start_p_.at(b) = start + 1; // resume after this packet on the next call
+                }                
+                output.send(new_trace, 1);
+                DT(3, "pipeline-dispatch: " << *new_trace);
+            } else { // single-packet configuration: forward the trace object itself
+                trace->pid = 0;
+                input.pop();
+                output.send(trace, 1);
+                DT(3, "pipeline-dispatch: " << *trace);
+                ++block_sent;
+            }            
+        }
+        if (block_sent == block_size_) { // every slot of the batch is done: rotate to the next batch and rearm the slots
+            batch_idx_ = (batch_idx_ + 1) % batch_count_;
+            for (uint32_t b = 0; b < block_size_; ++b) {
+                start_p_.at(b) = 0;
+            }
+        }
+    };
+
+    bool push(uint32_t issue_index, pipeline_trace_t* trace) { // enqueue a trace for one issue slot; returns false when that queue already holds buf_size_ entries
+        auto& queue = queues_.at(issue_index);
+        if (queue.size() >= buf_size_)
+            return false;
+        queue.push(trace);        
+        return true;
+    }
+
+private:
+    std::vector<SimPort<pipeline_trace_t*>> Inputs_; // per-slot staging ports fed from queues_ in tick()
+    const Arch& arch_;
+    std::vector<std::queue<pipeline_trace_t*>> queues_; // per-slot buffers filled by push()
+    uint32_t buf_size_;
+    uint32_t block_size_;
+    uint32_t num_lanes_;
+    uint32_t batch_count_;
+    uint32_t pid_count_;
+    uint32_t batch_idx_; // batch currently being serviced
+    std::vector<int> start_p_; // signed so that -1 can mark a finished slot
+};
+
+}
--- a/sim/simx/exe_unit.cpp
+++ b/sim/simx/exe_unit.cpp
@@ -0,0 +1,329 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "exe_unit.h"
+#include <iostream>
+#include <iomanip>
+#include <string.h>
+#include <assert.h>
+#include <util.h>
+#include "debug.h"
+#include "core.h"
+#include "constants.h"
+#include "cache_sim.h"
+
+using namespace vortex;
+
+AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {} // forwards to ExeUnit with the unit name "ALU"
+    
+void AluUnit::tick() {    // services every issue slot once: forwards the head trace to the matching output and records stall time
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        auto& input = Inputs.at(i);
+        if (input.empty()) 
+            continue;
+        auto& output = Outputs.at(i);
+        auto trace = input.front();
+        switch (trace->alu_type) { // the second send() argument depends on the operation class (presumably a delay in cycles - confirm)
+        case AluType::ARITH:        
+        case AluType::BRANCH:
+        case AluType::SYSCALL:
+        case AluType::IMUL: // NOTE(review): ARITH/BRANCH/SYSCALL fall through to the IMUL value - confirm this is intended
+            output.send(trace, LATENCY_IMUL+1);
+            break;
+        case AluType::IDIV:
+            output.send(trace, XLEN+1);
+            break;
+        default:
+            std::abort(); // unrecognized ALU type
+        }
+        DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
+        if (trace->eop && trace->fetch_stall) { // last packet of a trace flagged fetch_stall: clear the warp's stalled bit
+            assert(core_->stalled_warps_.test(trace->wid));
+            core_->stalled_warps_.reset(trace->wid);
+        }
+        auto time = input.pop(); // pop() returns a timestamp; NOTE(review): presumably the cycle the trace became available - confirm
+        core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {} // forwards to ExeUnit with the unit name "FPU"
+    
+void FpuUnit::tick() { // services every issue slot once: forwards the head trace to the matching output and records stall time
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        auto& input = Inputs.at(i);
+        if (input.empty()) 
+            continue;
+        auto& output = Outputs.at(i);
+        auto trace = input.front();
+        switch (trace->fpu_type) { // the second send() argument depends on the FPU operation class (presumably a delay in cycles - confirm)
+        case FpuType::FNCP:
+            output.send(trace, 2); // fixed value, not taken from a LATENCY_* constant
+            break;
+        case FpuType::FMA:
+            output.send(trace, LATENCY_FMA+1);
+            break;
+        case FpuType::FDIV:
+            output.send(trace, LATENCY_FDIV+1);
+            break;
+        case FpuType::FSQRT:
+            output.send(trace, LATENCY_FSQRT+1);
+            break;
+        case FpuType::FCVT:
+            output.send(trace, LATENCY_FCVT+1);
+            break;
+        default:
+            std::abort(); // unrecognized FPU type
+        }    
+        DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
+        auto time = input.pop(); // pop() returns a timestamp; the difference to the current cycle count is booked as stall time
+        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+LsuUnit::LsuUnit(const SimContext& ctx, Core* core) 
+    : ExeUnit(ctx, core, "LSU")
+    , pending_rd_reqs_(LSUQ_SIZE) // table of in-flight requests, constructed with LSUQ_SIZE
+    , num_lanes_(NUM_LSU_LANES)     
+    , pending_loads_(0)
+    , fence_lock_(false)
+    , input_idx_(0)
+{} // fence_state_ is not initialized here; tick() assigns it before it sets fence_lock_
+
+void LsuUnit::reset() { // drops all in-flight request entries and clears the load counter and fence flag; input_idx_ and fence_state_ are left as they are
+    pending_rd_reqs_.clear();
+    pending_loads_ = 0;
+    fence_lock_ = false;
+}
+
+void LsuUnit::tick() {    // per call: drain memory responses, resolve a pending fence, then issue at most one new trace
+    core_->perf_stats_.load_latency += pending_loads_; // accumulates the number of outstanding requests on every call
+
+    // handle dcache responses: at most one per lane per call
+    for (uint32_t t = 0; t < num_lanes_; ++t) {
+        auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
+        if (dcache_rsp_port.empty())
+            continue;
+        auto& mem_rsp = dcache_rsp_port.front();
+        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
+        auto trace = entry.trace;
+        DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type 
+            << ", tid=" << t << ", " << *trace);  
+        assert(entry.count);
+        --entry.count; // track remaining addresses 
+        if (0 == entry.count) { // last outstanding address: complete the trace on its issue slot and free the tag
+            int iw = trace->wid % ISSUE_WIDTH;
+            auto& output = Outputs.at(iw);
+            output.send(trace, 1);
+            pending_rd_reqs_.release(mem_rsp.tag);
+        } 
+        dcache_rsp_port.pop();
+        --pending_loads_;
+    }
+
+    // handle shared memory responses: same bookkeeping as the dcache path
+    for (uint32_t t = 0; t < num_lanes_; ++t) {
+        auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
+        if (smem_rsp_port.empty())
+            continue;
+        auto& mem_rsp = smem_rsp_port.front();
+        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
+        auto trace = entry.trace;
+        DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
+        assert(entry.count);
+        --entry.count; // track remaining addresses 
+        if (0 == entry.count) {
+            int iw = trace->wid % ISSUE_WIDTH;
+            auto& output = Outputs.at(iw);
+            output.send(trace, 1);
+            pending_rd_reqs_.release(mem_rsp.tag);
+        } 
+        smem_rsp_port.pop();  
+        --pending_loads_;
+    }
+
+    if (fence_lock_) {
+        // wait for all pending memory operations to complete
+        if (!pending_rd_reqs_.empty())
+            return;
+        int iw = fence_state_->wid % ISSUE_WIDTH;
+        auto& output = Outputs.at(iw);
+        output.send(fence_state_, 1);
+        fence_lock_ = false;
+        DT(3, "fence-unlock: " << *fence_state_); // stream the trace itself (as "fence-lock" does), not the pointer value
+    }    
+
+    // check input queues round-robin starting at input_idx_; at most one trace is consumed per call
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        int iw = (input_idx_ + i) % ISSUE_WIDTH;
+        auto& input = Inputs.at(iw);
+        if (input.empty())
+            continue;
+        auto& output = Outputs.at(iw);
+        auto trace = input.front();
+        auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
+
+        auto t0 = trace->pid * num_lanes_; // thread index of lane 0 for this packet
+
+        if (trace->lsu_type == LsuType::FENCE) {
+            // schedule fence lock
+            fence_state_ = trace;
+            fence_lock_ = true;        
+            DT(3, "fence-lock: " << *trace);
+            // remove input
+            auto time = input.pop(); 
+            core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+            break;
+        }
+
+        // check pending queue capacity    
+        if (pending_rd_reqs_.full()) {
+            if (!trace->log_once(true)) {
+                DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
+            }
+            break;
+        } else {
+            trace->log_once(false);
+        }
+        
+        bool is_write = (trace->lsu_type == LsuType::STORE);
+
+        // duplicates detection: every active lane targets the same 4-byte-aligned word as lane 0
+        bool is_dup = false;
+        if (trace->tmask.test(t0)) {
+            uint64_t addr_mask = sizeof(uint32_t)-1;
+            auto addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask; // keep full address width so 64-bit addresses compare correctly
+            uint32_t matches = 1;
+            for (uint32_t t = 1; t < num_lanes_; ++t) {
+                if (!trace->tmask.test(t0 + t))
+                    continue;
+                auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
+                matches += (addr0 == mem_addr);
+            }
+            is_dup = (matches == trace->tmask.count());
+        }
+
+        uint32_t addr_count; // number of responses to wait for before the trace completes
+        if (is_dup) {
+            addr_count = 1;
+        } else {
+            addr_count = trace->tmask.count();
+        }
+
+        auto tag = pending_rd_reqs_.allocate({trace, addr_count});
+
+        for (uint32_t t = 0; t < num_lanes_; ++t) { // one request per active lane, or a single one when all lanes are duplicates
+            if (!trace->tmask.test(t0 + t))
+                continue;
+            
+            auto& dcache_req_port = core_->dcache_req_ports.at(t);
+            auto mem_addr = trace_data->mem_addrs.at(t);
+            auto type = core_->get_addr_type(mem_addr.addr);
+
+            MemReq mem_req;
+            mem_req.addr  = mem_addr.addr;
+            mem_req.write = is_write;
+            mem_req.type  = type; 
+            mem_req.tag   = tag;
+            mem_req.cid   = trace->cid;
+            mem_req.uuid  = trace->uuid;        
+                
+            dcache_req_port.send(mem_req, 2);
+            DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag 
+                << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
+
+            ++pending_loads_; // NOTE(review): incremented for stores as well - confirm every request gets a response that decrements it
+            ++core_->perf_stats_.loads;        
+            if (is_dup)
+                break;
+        }
+
+        // do not wait on writes: free the tag and complete the trace right away
+        if (is_write) {
+            pending_rd_reqs_.release(tag);
+            output.send(trace, 1);
+            ++core_->perf_stats_.stores;
+        }
+
+        // remove input
+        auto time = input.pop();
+        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+
+        break; // single block
+    }
+    ++input_idx_; // rotate the round-robin starting slot for the next call
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+SfuUnit::SfuUnit(const SimContext& ctx, Core* core) 
+    : ExeUnit(ctx, core, "SFU")
+    , input_idx_(0) // round-robin starting slot used by tick()
+{}
+    
+void SfuUnit::tick() { // consumes at most one trace per call, scanning the issue slots round-robin from input_idx_
+    // check input queue
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        int iw = (input_idx_ + i) % ISSUE_WIDTH;        
+        auto& input = Inputs.at(iw);
+        if (input.empty())
+            continue;
+        auto& output = Outputs.at(iw);
+        auto trace = input.front();
+        auto sfu_type = trace->sfu_type;
+        bool release_warp = trace->fetch_stall; // whether to clear the warp's stalled bit below; BAR overrides this to false
+
+        switch  (sfu_type) {
+        case SfuType::TMC: 
+        case SfuType::WSPAWN:
+        case SfuType::SPLIT:
+        case SfuType::JOIN:
+        case SfuType::PRED:
+        case SfuType::CSRRW:
+        case SfuType::CSRRS:
+        case SfuType::CSRRC:
+            output.send(trace, 1);
+            break;
+        case SfuType::BAR: {
+            output.send(trace, 1);
+            auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
+            if (trace->eop) { // notify the core's barrier only on the last packet of the trace
+                core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
+            }
+            release_warp = false; // the stalled bit is not cleared here for BAR
+        }   break;
+        case SfuType::CMOV:
+            output.send(trace, 3); // larger second send() argument than the other SFU operations
+            break;
+        default:
+            std::abort(); // unrecognized SFU type
+        }
+
+        DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
+        if (trace->eop && release_warp)  {
+            assert(core_->stalled_warps_.test(trace->wid));
+            core_->stalled_warps_.reset(trace->wid);
+        }
+
+        auto time = input.pop(); // pop() returns a timestamp; the difference to the current cycle count is booked as stall time
+        auto stalls = (SimPlatform::instance().cycles() - time);
+
+        core_->perf_stats_.sfu_stalls += stalls;
+
+        break; // single block
+    }
+    ++input_idx_; // rotate the round-robin starting slot for the next call
+}
--- a/sim/simx/exe_unit.h
+++ b/sim/simx/exe_unit.h
@@ -1,8 +1,21 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
 #include "pipeline.h"
-#include "cache.h"
+#include "cache_sim.h"

 namespace vortex {

@@ -10,13 +23,13 @@ class Core;

 class ExeUnit : public SimObject<ExeUnit> {
 public:
-    SimPort<pipeline_trace_t*> Input;
-    SimPort<pipeline_trace_t*> Output;
+    std::vector<SimPort<pipeline_trace_t*>> Inputs;
+    std::vector<SimPort<pipeline_trace_t*>> Outputs;

    ExeUnit(const SimContext& ctx, Core* core, const char* name) 
        : SimObject<ExeUnit>(ctx, name) 
-        , Input(this)
-        , Output(this)
+        , Inputs(ISSUE_WIDTH, this)
+        , Outputs(ISSUE_WIDTH, this)
        , core_(core)
    {}
    
@@ -32,32 +45,6 @@ protected:

 ///////////////////////////////////////////////////////////////////////////////

-class NopUnit : public ExeUnit {
-public:
-    NopUnit(const SimContext& ctx, Core*);
-    
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-class LsuUnit : public ExeUnit {
-private:    
-    uint32_t num_threads_;
-    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
-    pipeline_trace_t* fence_state_;
-    bool fence_lock_;
-
-public:
-    LsuUnit(const SimContext& ctx, Core*);
-
-    void reset();
-
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
 class AluUnit : public ExeUnit {
 public:
    AluUnit(const SimContext& ctx, Core*);
@@ -67,15 +54,6 @@ public:

 ///////////////////////////////////////////////////////////////////////////////

-class CsrUnit : public ExeUnit {
-public:
-    CsrUnit(const SimContext& ctx, Core*);
-    
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
 class FpuUnit : public ExeUnit {
 public:
    FpuUnit(const SimContext& ctx, Core*);
@@ -85,19 +63,37 @@ public:

 ///////////////////////////////////////////////////////////////////////////////

-class GpuUnit : public ExeUnit {
-private:
-    uint32_t num_threads_;
-    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
-
-    bool processTexRequest(pipeline_trace_t* trace);
-    
+class LsuUnit : public ExeUnit {
 public:
-    GpuUnit(const SimContext& ctx, Core*);
+    LsuUnit(const SimContext& ctx, Core*);

    void reset();
+
+    void tick();
+
+private:    
+    struct pending_req_t {
+      pipeline_trace_t* trace;
+      uint32_t count;
+    };
+    HashTable<pending_req_t> pending_rd_reqs_;    
+    uint32_t num_lanes_;
+    pipeline_trace_t* fence_state_;
+    uint64_t pending_loads_;
+    bool fence_lock_;
+    uint32_t input_idx_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class SfuUnit : public ExeUnit {
+public:
+    SfuUnit(const SimContext& ctx, Core*);
    
    void tick();
+
+private:
+  uint32_t input_idx_;
 };

 }
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
--- a/sim/simx/exeunit.cpp
+++ b/sim/simx/exeunit.cpp
@@ -1,383 +0,0 @@
-#include "exeunit.h"
-#include <iostream>
-#include <iomanip>
-#include <string.h>
-#include <assert.h>
-#include <util.h>
-#include "debug.h"
-#include "core.h"
-#include "constants.h"
-
-using namespace vortex;
-
-NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
-    
-void NopUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    Output.send(trace, 1);
-    Input.pop();
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-LsuUnit::LsuUnit(const SimContext& ctx, Core* core) 
-    : ExeUnit(ctx, core, "LSU")
-    , num_threads_(core->arch().num_threads()) 
-    , pending_rd_reqs_(LSUQ_SIZE)
-    , fence_lock_(false)
-{}
-
-void LsuUnit::reset() {
-    pending_rd_reqs_.clear();
-    fence_lock_ = false;
-}
-
-void LsuUnit::tick() {
-    // handle dcache response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
-        if (dcache_rsp_port.empty())
-            continue;
-        auto& mem_rsp = dcache_rsp_port.front();
-        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
-        auto trace = entry.first;
-        DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
-            << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_rd_reqs_.release(mem_rsp.tag);
-        } 
-        dcache_rsp_port.pop();  
-    }
-
-    // handle shared memory response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
-        if (smem_rsp_port.empty())
-            continue;
-        auto& mem_rsp = smem_rsp_port.front();
-        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
-        auto trace = entry.first;
-        DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
-            << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_rd_reqs_.release(mem_rsp.tag);
-        } 
-        smem_rsp_port.pop();  
-    }
-
-    if (fence_lock_) {
-        // wait for all pending memory operations to complete
-        if (!pending_rd_reqs_.empty())
-            return;
-        Output.send(fence_state_, 1);
-        fence_lock_ = false;
-        DT(3, "fence-unlock: " << fence_state_);
-    }
-
-    // check input queue
-    if (Input.empty())
-        return;
-
-    auto trace = Input.front();
-
-    if (trace->lsu.type == LsuType::FENCE) {
-        // schedule fence lock
-        fence_state_ = trace;
-        fence_lock_ = true;        
-        DT(3, "fence-lock: " << *trace);
-        // remove input
-        auto time = Input.pop(); 
-        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
-        return;
-    }
-
-    // check pending queue capacity    
-    if (pending_rd_reqs_.full()) {
-        if (!trace->suspend()) {
-            DT(3, "*** lsu-queue-stall: " << *trace);
-        }
-        return;
-    } else {
-        trace->resume();
-    }
-    
-    bool is_write = (trace->lsu.type == LsuType::STORE);
-
-    // duplicates detection
-    bool is_dup = false;
-    if (trace->tmask.test(0)) {
-        uint64_t addr_mask = sizeof(uint32_t)-1;
-        uint32_t addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
-        uint32_t matches = 1;
-        for (uint32_t t = 1; t < num_threads_; ++t) {
-            if (!trace->tmask.test(t))
-                continue;
-            auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
-            matches += (addr0 == mem_addr);
-        }
-        is_dup = (matches == trace->tmask.count());
-    }
-
-    uint32_t valid_addrs = 0;
-    if (is_dup) {
-        valid_addrs = 1;
-    } else {
-        for (auto& mem_addr : trace->mem_addrs) {
-            valid_addrs += mem_addr.size();
-        }
-    }
-
-    auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
-
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        if (!trace->tmask.test(t))
-            continue;
-        
-        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);        
-        auto mem_addr = trace->mem_addrs.at(t).at(0);
-        auto type = get_addr_type(mem_addr.addr, mem_addr.size);
-
-        MemReq mem_req;
-        mem_req.addr  = mem_addr.addr;
-        mem_req.write = is_write;
-        mem_req.non_cacheable = (type == AddrType::IO); 
-        mem_req.tag   = tag;
-        mem_req.core_id = trace->cid;
-        mem_req.uuid = trace->uuid;
-        
-        if (type == AddrType::Shared) {
-            core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
-            DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
-        } else {            
-            dcache_req_port.send(mem_req, 2);
-            DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
-        }        
-        
-        if (is_dup)
-            break;
-    }
-
-    // do not wait on writes
-    if (is_write) {        
-        pending_rd_reqs_.release(tag);
-        Output.send(trace, 1);
-    }
-
-    // remove input
-    auto time = Input.pop();
-    core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
-    
-void AluUnit::tick() {    
-    if (Input.empty())
-        return;
-    auto trace = Input.front();    
-    switch (trace->alu.type) {
-    case AluType::ARITH:        
-    case AluType::BRANCH:
-    case AluType::SYSCALL:
-    case AluType::CMOV:
-        Output.send(trace, 1);
-        break;
-    case AluType::IMUL:
-        Output.send(trace, LATENCY_IMUL+1);
-        break;
-    case AluType::IDIV:
-        Output.send(trace, XLEN+1);
-        break;
-    default:
-        std::abort();
-    }
-    DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
-    if (trace->fetch_stall) {
-        core_->stalled_warps_.reset(trace->wid);
-    }
-    auto time = Input.pop();
-    core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
-    
-void CsrUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    Output.send(trace, 1);
-    auto time = Input.pop();
-    core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
-    DT(3, "pipeline-execute: op=CSR, " << *trace);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
-    
-void FpuUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    switch (trace->fpu.type) {
-    case FpuType::FNCP:
-        Output.send(trace, 2);
-        break;
-    case FpuType::FMA:
-        Output.send(trace, LATENCY_FMA+1);
-        break;
-    case FpuType::FDIV:
-        Output.send(trace, LATENCY_FDIV+1);
-        break;
-    case FpuType::FSQRT:
-        Output.send(trace, LATENCY_FSQRT+1);
-        break;
-    case FpuType::FCVT:
-        Output.send(trace, LATENCY_FCVT+1);
-        break;
-    default:
-        std::abort();
-    }    
-    DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
-    auto time = Input.pop();
-    core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-GpuUnit::GpuUnit(const SimContext& ctx, Core* core) 
-    : ExeUnit(ctx, core, "GPU")
-    , num_threads_(core->arch().num_threads()) 
-    , pending_tex_reqs_(TEXQ_SIZE)
-{}
-
-void GpuUnit::reset() {
-    pending_tex_reqs_.clear();
-}
-    
-void GpuUnit::tick() {
-#ifdef EXT_TEX_ENABLE
-    // handle memory response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
-        if (dcache_rsp_port.empty())
-            continue;
-        auto& mem_rsp = dcache_rsp_port.front();
-        auto& entry = pending_tex_reqs_.at(mem_rsp.tag);  
-        auto trace = entry.first;
-        DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_tex_reqs_.release(mem_rsp.tag);
-        }   
-        dcache_rsp_port.pop();
-    }
-#endif
-
-    // check input queue
-    if (Input.empty())
-        return;
-
-    auto trace = Input.front();
-
-    bool issued = false;
-
-    switch  (trace->gpu.type) {
-    case GpuType::TMC:
-        Output.send(trace, 1);
-        core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
-        issued = true;
-        break;
-    case GpuType::WSPAWN:
-        Output.send(trace, 1);
-        core_->active_warps_ = trace->gpu.active_warps;        
-        issued = true;
-        break;
-    case GpuType::SPLIT:
-    case GpuType::JOIN:
-        Output.send(trace, 1);
-        issued = true;
-        break;
-    case GpuType::BAR:
-        Output.send(trace, 1);
-        if (trace->gpu.active_warps != 0) 
-            core_->active_warps_ |= trace->gpu.active_warps;
-        else
-            core_->active_warps_.reset(trace->wid);
-        issued = true;
-        break;
-    case GpuType::TEX:
-        if (this->processTexRequest(trace))
-           issued = true;
-        break;
-    default:
-        std::abort();
-    }
-
-    if (issued) {    
-        DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
-        if (trace->fetch_stall)  {
-            core_->stalled_warps_.reset(trace->wid);
-        }
-        auto time = Input.pop();
-        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
-    }
-}
-
-bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {    
-    // check pending queue capacity    
-    if (pending_tex_reqs_.full()) {
-        if (!trace->suspend()) {
-            DT(3, "*** tex-queue-stall: " << *trace);
-        }
-        return false;
-    } else {
-        trace->resume();
-    }
-
-    // send memory request
-
-    uint32_t valid_addrs = 0;
-    for (auto& mem_addr : trace->mem_addrs) {
-        valid_addrs += mem_addr.size();
-    }
-
-    auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
-
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        if (!trace->tmask.test(t))
-            continue;
-
-        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
-        for (auto& mem_addr : trace->mem_addrs.at(t)) {
-            MemReq mem_req;
-            mem_req.addr  = mem_addr.addr;
-            mem_req.write = (trace->lsu.type == LsuType::STORE);
-            mem_req.tag   = tag;
-            mem_req.core_id = core_->id();
-            mem_req.uuid = trace->uuid;
-            dcache_req_port.send(mem_req, 3);
-            DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", tid=" << t << ", "<< trace);
-            ++ core_->perf_stats_.tex_reads;
-            ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
-        }
-    }
-
-    return true;
-}
--- a/sim/simx/ibuffer.h
+++ b/sim/simx/ibuffer.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "pipeline.h"
@@ -6,10 +19,6 @@
 namespace vortex {

 class IBuffer {
-private:
-    std::queue<pipeline_trace_t*> entries_;
-    uint32_t capacity_;
-
 public:    
    IBuffer(uint32_t size) 
        : capacity_(size)
@@ -39,6 +48,10 @@ public:
        std::queue<pipeline_trace_t*> empty;
        std::swap(entries_, empty );
    }
+
+private:
+    std::queue<pipeline_trace_t*> entries_;
+    uint32_t capacity_;
 };

 }
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "types.h"
@@ -7,7 +20,7 @@ namespace vortex {
 class Warp;

 enum Opcode {   
-  NOP       = 0,    
+  NONE      = 0,    
  R_INST    = 0x33,
  L_INST    = 0x3,
  I_INST    = 0x13,
@@ -19,6 +32,7 @@ enum Opcode {
  JALR_INST = 0x67,
  SYS_INST  = 0x73,
  FENCE     = 0x0f,
+  AMO       = 0x2f,
  // F Extension
  FL        = 0x7,
  FS        = 0x27,
@@ -26,19 +40,20 @@ enum Opcode {
  FMADD     = 0x43,
  FMSUB     = 0x47,
  FMNMSUB   = 0x4b,
-  FMNMADD   = 0x4f,
-  // Vector Extension  
-  VSET      = 0x57,
-  // GPGPU Extension
-  GPGPU     = 0x6b,
-  GPU       = 0x5b,
-  // RV64 Standard Extensions
+  FMNMADD   = 0x4f,  
+  // RV64 Standard Extension
  R_INST_W  = 0x3b,
  I_INST_W  = 0x1b,
+  // Vector Extension  
+  VSET      = 0x57,
+  // Custom Extensions
+  EXT1      = 0x0b,
+  EXT2      = 0x2b,
+  EXT3      = 0x5b,
+  EXT4      = 0x7b
 };

-enum InstType { 
-  N_TYPE, 
+enum InstType {
  R_TYPE, 
  I_TYPE, 
  S_TYPE, 
@@ -52,25 +67,45 @@ enum InstType {
 class Instr {
 public:
  Instr() 
-    : opcode_(Opcode::NOP)
+    : opcode_(Opcode::NONE)
    , num_rsrcs_(0)
    , has_imm_(false)
    , rdest_type_(RegType::None)
+    , imm_(0)
    , rdest_(0)
    , func2_(0)
    , func3_(0)
    , func6_(0)
-    , func7_(0) {
+    , func7_(0)
+    , vmask_(0)
+    , vlsWidth_(0)
+    , vMop_(0)
+    , vNf_(0)
+    , vs3_(0)
+    , vlmul_(0)
+    , vsew_(0)
+    , vediv_(0)   {
    for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
       rsrc_type_[i] = RegType::None;
+       rsrc_[i] = 0;
    }
  }

  void setOpcode(Opcode opcode)  { opcode_ = opcode; }
-  void setDestReg(uint32_t destReg, RegType type) { rdest_type_ = type; rdest_ = destReg; }
-  void setSrcReg(uint32_t srcReg, RegType type) { rsrc_type_[num_rsrcs_] = type; rsrc_[num_rsrcs_++] = srcReg; }
-  void setDestVReg(uint32_t destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
-  void setSrcVReg(uint32_t srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg;  }
+  void setDestReg(uint32_t destReg, RegType type) { // records the destination register index and its register type
+    rdest_type_ = type; 
+    rdest_ = destReg; 
+  }
+  void addSrcReg(uint32_t srcReg, RegType type) { // appends a source operand in the slot numbered num_rsrcs_
+    rsrc_type_[num_rsrcs_] = type; // NOTE(review): num_rsrcs_ is not checked here; presumably must stay below MAX_REG_SOURCES - confirm
+    rsrc_[num_rsrcs_] = srcReg; 
+    ++num_rsrcs_;
+  }
+  void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) { // writes the source operand stored at slot `index`
+    rsrc_type_[index] = type; // NOTE(review): index is not range-checked here - confirm callers stay in bounds
+    rsrc_[index] = srcReg; 
+    num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1); // the operand count can grow but never shrinks here
+  }
  void setFunc2(uint32_t func2) { func2_ = func2; }
  void setFunc3(uint32_t func3) { func3_ = func3; }
  void setFunc7(uint32_t func7) { func7_ = func7; }
@@ -85,17 +120,17 @@ public:
  void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; }
  void setFunc6(uint32_t func6) { func6_ = func6; }

-  Opcode getOpcode() const { return opcode_; }
+  Opcode   getOpcode() const { return opcode_; }
  uint32_t getFunc2() const { return func2_; }
  uint32_t getFunc3() const { return func3_; }
  uint32_t getFunc6() const { return func6_; }
  uint32_t getFunc7() const { return func7_; }
  uint32_t getNRSrc() const { return num_rsrcs_; }
  uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
-  RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
+  RegType  getRSType(uint32_t i) const { return rsrc_type_[i]; }
  uint32_t getRDest() const { return rdest_; }  
-  RegType getRDType() const { return rdest_type_; }  
-  bool hasImm() const { return has_imm_; }
+  RegType  getRDType() const { return rdest_type_; }  
+  bool     hasImm() const { return has_imm_; }
  uint32_t getImm() const { return imm_; }
  uint32_t getVlsWidth() const { return vlsWidth_; }
  uint32_t getVmop() const { return vMop_; }
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -1,98 +1,132 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <iomanip>
 #include <string>
 #include <sstream>
 #include <fstream>
 #include <stdlib.h>
+#include <unistd.h>
 #include <sys/stat.h>
 #include "processor.h"
-#include "archdef.h"
 #include "mem.h"
 #include "constants.h"
 #include <util.h>
-#include "args.h"
 #include "core.h"

 using namespace vortex;

+static void show_usage() { // writes the one-line option summary to stdout
+   std::cout << "Usage: [-c <cores>] [-w <warps>] [-t <threads>] [-g <clusters>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
+}
+
+uint32_t num_threads = NUM_THREADS;   // overridden by -t
+uint32_t num_warps = NUM_WARPS;       // overridden by -w
+uint32_t num_cores = NUM_CORES;       // overridden by -c
+uint32_t num_clusters = NUM_CLUSTERS; // overridden by -g
+bool showStats = false;               // set by -s
+bool riscv_test = false;              // set by -r
+const char* program = nullptr;        // first non-option argument
+
+// Parses the command line into the file-scope option globals and `program`;
+// prints the usage text and exits when the arguments cannot be used.
+static void parse_args(int argc, char **argv) {
+  // Numeric option values are validated: malformed, negative or too-large
+  // text is rejected instead of being silently converted to a count.
+  auto to_count = [](const char* text) -> uint32_t {
+    char* end = nullptr;
+    const long value = strtol(text, &end, 10);
+    if (end == text || *end != '\0' || value < 0 || long(uint32_t(value)) != value) {
+      show_usage();
+      exit(-1);
+    }
+    return uint32_t(value);
+  };
+
+  int c;
+  while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
+    switch (c) {
+    case 't': num_threads  = to_count(optarg); break;
+    case 'w': num_warps    = to_count(optarg); break;
+    case 'c': num_cores    = to_count(optarg); break;
+    case 'g': num_clusters = to_count(optarg); break;
+    case 'r': riscv_test   = true; break;
+    case 's': showStats    = true; break;
+    case 'h':
+    case '?': // NOTE: getopt() also returns '?' for options it rejects
+      show_usage();
+      exit(0);
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+
+  if (optind >= argc) { // the program image must follow the options
+    show_usage();
+    exit(-1);
+  }
+  program = argv[optind];
+  std::cout << "Running " << program << "..." << std::endl;
+}
+
 int main(int argc, char **argv) {
  int exitcode = 0;

-  std::string imgFileName;
-  int num_cores(NUM_CORES * NUM_CLUSTERS);
-  int num_warps(NUM_WARPS);
-  int num_threads(NUM_THREADS);  
-  bool showHelp(false);
-  bool showStats(false);
-  bool riscv_test(false);
+  parse_args(argc, argv);

-  // parse the command line arguments
-  CommandLineArgFlag fh("-h", "--help", "show command line options", showHelp);
-  CommandLineArgSetter<std::string> fi("-i", "--image", "program binary", imgFileName);
-  CommandLineArgSetter<int> fc("-c", "--cores", "number of cores", num_cores);
-  CommandLineArgSetter<int> fw("-w", "--warps", "number  of warps", num_warps);
-  CommandLineArgSetter<int> ft("-t", "--threads", "number of threads", num_threads);
-  CommandLineArgFlag fr("-r", "--riscv", "enable riscv tests", riscv_test);
-  CommandLineArgFlag fs("-s", "--stats", "show stats", showStats);
-
-  CommandLineArg::readArgs(argc - 1, argv + 1);
-
-  if (showHelp || imgFileName.empty()) {
-    std::cout << "Vortex emulator command line arguments:\n"
-                 "  -i, --image <filename> Program RAM image\n"
-                 "  -c, --cores <num> Number of cores\n"
-                 "  -w, --warps <num> Number of warps\n"
-                 "  -t, --threads <num> Number of threads\n"
-                 "  -r, --riscv riscv test\n"
-                 "  -s, --stats Print stats on exit.\n";
-    return 0;
-  }
-
-  std::cout << "Running " << imgFileName << "..." << std::endl;
-  
  {
    // create processor configuation
-    ArchDef arch(num_cores, num_warps, num_threads);
+    Arch arch(num_threads, num_warps, num_cores, num_clusters);

    // create memory module
    RAM ram(RAM_PAGE_SIZE);

+    // create processor
+    Processor processor(arch);
+  
+    // attach memory module
+    processor.attach_ram(&ram); 
+
+	  // setup base DCRs
+    const uint64_t startup_addr(STARTUP_ADDR);
+    processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
+  #if (XLEN == 64)
+    processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
+  #endif
+	  processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
+
    // load program
-    {
-      std::string program_ext(fileExtension(imgFileName.c_str()));
+    {      
+      std::string program_ext(fileExtension(program));
      if (program_ext == "bin") {
-        ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
+        ram.loadBinImage(program, startup_addr);
      } else if (program_ext == "hex") {
-        ram.loadHexImage(imgFileName.c_str());
+        ram.loadHexImage(program);
      } else {
        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
        return -1;
      }
    }

-    // create processor
-    Processor processor(arch);
-  
-    // attach memory module
-    processor.attach_ram(&ram);   
-
    // run simulation
-    exitcode = processor.run();
+    exitcode = processor.run(riscv_test);
+  }   

+  if (exitcode != 0) {
+    std::cout << "*** error: exitcode=" << exitcode << std::endl;
  } 

-  if (riscv_test) {
-    if (1 == exitcode) {
-      std::cout << "Passed." << std::endl;
-      exitcode = 0;
-    } else {
-      std::cout << "Failed." << std::endl;
-    }
-  } else {
-    if (exitcode != 0) {
-      std::cout << "*** error: exitcode=" << exitcode << std::endl;
-    }
-  }  
-
  return exitcode;
 }
--- a/sim/simx/mem_sim.cpp
+++ b/sim/simx/mem_sim.cpp
@@ -1,4 +1,17 @@
-#include "memsim.h"
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mem_sim.h"
 #include <vector>
 #include <queue>
 #include <stdlib.h>
@@ -83,7 +96,7 @@ public:
            mem_req.addr,
            mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
            std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
-            mem_req.core_id
+            mem_req.cid
        );

        if (!dram_->send(dram_req))
--- a/sim/simx/mem_sim.h
+++ b/sim/simx/mem_sim.h
@@ -1,8 +1,20 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
 #include "types.h"
-#include <vector>

 namespace vortex {

--- a/sim/simx/operand.h
+++ b/sim/simx/operand.h
@@ -0,0 +1,61 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+// Pipeline stage that moves each trace from Input to Output after a delay
+// derived from the register indices the trace marks as used.
+class Operand : public SimObject<Operand> {
+public:
+    SimPort<pipeline_trace_t*> Input;
+    SimPort<pipeline_trace_t*> Output;
+
+    Operand(const SimContext& ctx) 
+        : SimObject<Operand>(ctx, "Operand") 
+        , Input(this)
+        , Output(this)
+    {}
+
+    virtual ~Operand() {}
+
+    // no-op
+    virtual void reset() {}
+
+    // Handles at most one pending trace per call.
+    virtual void tick() {
+        if (Input.empty())
+            return;
+        auto trace = Input.front();
+
+        // the delay starts at 1 and grows by one for every register index in
+        // use; index 0 is skipped whenever the integer mask has it set
+        int delay = 1;
+        for (int i = 0; i < MAX_NUM_REGS; ++i) {
+            bool int_used = trace->used_iregs.test(i);
+            bool any_used = int_used || trace->used_fregs.test(i) || trace->used_vregs.test(i);
+            if (any_used && !(int_used && i == 0))
+                ++delay;
+        }
+
+        Output.send(trace, delay);
+        DT(3, "pipeline-operands: " << *trace);
+        Input.pop();
+    }
+};
+
+}
--- a/sim/simx/pipeline.h
+++ b/sim/simx/pipeline.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+

 #pragma once

@@ -5,14 +18,38 @@
 #include <iostream>
 #include <util.h>
 #include "types.h"
-#include "archdef.h"
+#include "arch.h"
 #include "debug.h"

 namespace vortex {

+class ITraceData { // polymorphic base of the payload types below; pipeline_trace_t holds one through ITraceData::Ptr
+public:
+    using Ptr = std::shared_ptr<ITraceData>;
+    ITraceData() {}
+    virtual ~ITraceData() {} // virtual so a derived payload can be destroyed through a base pointer
+};
+
+struct LsuTraceData : public ITraceData { // trace payload carrying memory address records
+  using Ptr = std::shared_ptr<LsuTraceData>;
+  std::vector<mem_addr_size_t> mem_addrs; // the constructor sizes this to num_threads entries
+  LsuTraceData(uint32_t num_threads) : mem_addrs(num_threads) {}
+};
+
+struct SFUTraceData : public ITraceData { // trace payload carrying an id/count pair (presumably a barrier - confirm)
+  using Ptr = std::shared_ptr<SFUTraceData>;
+  struct {
+    uint32_t id;
+    uint32_t count;
+  } bar;
+  SFUTraceData(uint32_t bar_id, uint32_t bar_count) : bar{bar_id, bar_count} {} // fills bar.id, then bar.count
+};
+
 struct pipeline_trace_t {
+public:
  //--
-  uint64_t    uuid;
+  const uint64_t uuid;
+  const Arch&    arch;
  
  //--
  uint32_t    cid;
@@ -21,12 +58,9 @@ struct pipeline_trace_t {
  Word        PC;

  //--
-  bool        fetch_stall;
-
-  //--
-  bool        wb;  
-  RegType     rdest_type;
  uint32_t    rdest;
+  RegType     rdest_type;
+  bool        wb;

  //--
  RegMask     used_iregs;
@@ -36,73 +70,104 @@ struct pipeline_trace_t {
  //- 
  ExeType     exe_type; 

-  //--
-  std::vector<std::vector<mem_addr_size_t>> mem_addrs;
-  
  //--
  union {
-    struct {        
-      LsuType type;
-    } lsu;
-    struct {
-      AluType type;
-    } alu;
-    struct {
-      FpuType type;
-    } fpu;
-    struct {
-      GpuType type;
-      WarpMask active_warps;
-    } gpu;
+    uint32_t unit_type;
+    LsuType  lsu_type;
+    AluType  alu_type;
+    FpuType  fpu_type;
+    SfuType  sfu_type;
  };

-  bool stalled;
+  ITraceData::Ptr data;

-  pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
-    uuid = uuid_;
-    cid = 0;
-    wid = 0;
-    tmask.reset();
-    PC = 0;
-    fetch_stall = false;
-    wb  = false;
-    rdest = 0;
-    rdest_type = RegType::None;
-    used_iregs.reset();
-    used_fregs.reset();
-    used_vregs.reset();
-    exe_type = ExeType::NOP;
-    mem_addrs.resize(arch.num_threads());
-    stalled = false;
-  }
+  int pid;
+  bool sop;
+  bool eop;

-  bool suspend() {
-    bool old = stalled;
-    stalled = true;
+  bool fetch_stall;
+
+  pipeline_trace_t(uint64_t uuid, const Arch& arch) 
+    : uuid(uuid)
+    , arch(arch)
+    , cid(0)
+    , wid(0)
+    , tmask(0)
+    , PC(0)    
+    , rdest(0)
+    , rdest_type(RegType::None)
+    , wb(false)
+    , used_iregs(0)
+    , used_fregs(0)
+    , used_vregs(0)
+    , exe_type(ExeType::ALU)
+    , unit_type(0)
+    , data(nullptr)
+    , pid(-1)
+    , sop(true)
+    , eop(true)
+    , fetch_stall(false)
+    , log_once_(false) 
+  {}
+
+  pipeline_trace_t(const pipeline_trace_t& rhs) 
+    : uuid(rhs.uuid)
+    , arch(rhs.arch)
+    , cid(rhs.cid)
+    , wid(rhs.wid)
+    , tmask(rhs.tmask)
+    , PC(rhs.PC)    
+    , rdest(rhs.rdest)
+    , rdest_type(rhs.rdest_type)
+    , wb(rhs.wb)    
+    , used_iregs(rhs.used_iregs)
+    , used_fregs(rhs.used_fregs)
+    , used_vregs(rhs.used_vregs)
+    , exe_type(rhs.exe_type)
+    , unit_type(rhs.unit_type)
+    , data(rhs.data)
+    , pid(rhs.pid)
+    , sop(rhs.sop)
+    , eop(rhs.eop)
+    , fetch_stall(rhs.fetch_stall)
+    , log_once_(false) 
+  {}
+  
+  ~pipeline_trace_t() {}
+
+  bool log_once(bool enable) {
+    bool old = log_once_;
+    log_once_ = enable;
    return old;
  }

-  void resume() {
-    stalled = false;
-  }
+private:
+  bool log_once_;
 };

 inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
-  os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
+  os << "cid=" << state.cid;
+  os << ", wid=" << state.wid;
+  os << ", tmask=";
+  for (uint32_t i = 0, n = state.arch.num_threads(); i < n; ++i) {
+      os << state.tmask.test(i);
+  }  
+  os << ", PC=0x" << std::hex << state.PC;
  os << ", wb=" << state.wb;
  if (state.wb) {
     os << ", rd=" << state.rdest_type << std::dec << state.rdest;
  }
  os << ", ex=" << state.exe_type;
+  if (state.pid != -1) {
+    os << ", pid=" << state.pid;
+    os << ", sop=" << state.sop;
+    os << ", eop=" << state.eop;
+  }
  os << " (#" << std::dec << state.uuid << ")";
  return os;
 }

 class PipelineLatch {
-protected:
-  const char* name_;
-  std::queue<pipeline_trace_t*> queue_;
-
 public:
  PipelineLatch(const char* name = nullptr) 
    : name_(name) 
@@ -132,6 +197,10 @@ public:
    std::queue<pipeline_trace_t*> empty;
    std::swap(queue_, empty );
  }
+
+protected:
+  const char* name_;
+  std::queue<pipeline_trace_t*> queue_;
 };

 }
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -1,168 +1,141 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "processor.h"
-#include "core.h"
-#include "constants.h"
+#include "processor_impl.h"

 using namespace vortex;

-class Processor::Impl {
-private:
-  std::vector<Core::Ptr> cores_;
-  std::vector<Cache::Ptr> l2caches_;
-  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
-  Cache::Ptr l3cache_;
-  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
+ProcessorImpl::ProcessorImpl(const Arch& arch) 
+  : arch_(arch)
+  , clusters_(arch.num_clusters())
+{
+  SimPlatform::instance().initialize();

-public:
-  Impl(const ArchDef& arch) 
-    : cores_(arch.num_cores())
-    , l2caches_(NUM_CLUSTERS)
-    , l2_mem_switches_(NUM_CLUSTERS)
-  {
-    SimPlatform::instance().initialize();
+  // create memory simulator
+  memsim_ = MemSim::Create("dram", MemSim::Config{
+    MEMORY_BANKS,
+    uint32_t(arch.num_cores()) * arch.num_clusters()
+  });

-    uint32_t num_cores = arch.num_cores();
-    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
-
-    // create cores
-    for (uint32_t i = 0; i < num_cores; ++i) {
-        cores_.at(i) = Core::Create(arch, i);
+  // create L3 cache
+  l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
+    !L3_ENABLED,
+    log2ceil(L3_CACHE_SIZE),  // C
+    log2ceil(MEM_BLOCK_SIZE), // B
+    log2ceil(L3_NUM_WAYS),  // W
+    0,                      // A
+    XLEN,                   // address bits  
+    L3_NUM_BANKS,           // number of banks
+    1,                      // number of ports
+    uint8_t(arch.num_clusters()), // request size 
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    L3_MSHR_SIZE,           // mshr
+    2,                      // pipeline latency
    }
+  );        
+  
+  // connect L3 memory ports
+  l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
+  memsim_->MemRspPort.bind(&l3cache_->MemRspPort);

-     // setup memory simulator
-    auto memsim = MemSim::Create("dram", MemSim::Config{
-      MEMORY_BANKS,
-      arch.num_cores()
-    });
-    
-    std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
-    std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
-
-    if (L3_ENABLE) {
-      l3cache_ = Cache::Create("l3cache", Cache::Config{
-        log2ceil(L3_CACHE_SIZE),  // C
-        log2ceil(MEM_BLOCK_SIZE), // B
-        2,                      // W
-        0,                      // A
-        32,                     // address bits  
-        L3_NUM_BANKS,           // number of banks
-        L3_NUM_PORTS,           // number of ports
-        NUM_CLUSTERS,           // request size 
-        true,                   // write-through
-        false,                  // write response
-        0,                      // victim size
-        L3_MSHR_SIZE,           // mshr
-        2,                      // pipeline latency
-        }
-      );        
-      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
-      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
-        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
-      }
-    } else if (NUM_CLUSTERS > 1) {
-      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
-      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
-      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
-        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
-      }
-    }
-
-    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-      std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster); 
-      std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
-
-      if (L2_ENABLE) {
-        auto& l2cache = l2caches_.at(i);
-        l2cache = Cache::Create("l2cache", Cache::Config{
-          log2ceil(L2_CACHE_SIZE),  // C
-          log2ceil(MEM_BLOCK_SIZE), // B
-          2,                      // W
-          0,                      // A
-          32,                     // address bits  
-          L2_NUM_BANKS,           // number of banks
-          L2_NUM_PORTS,           // number of ports
-          (uint8_t)cores_per_cluster, // request size 
-          true,                   // write-through
-          false,                  // write response
-          0,                      // victim size
-          L2_MSHR_SIZE,           // mshr
-          2,                      // pipeline latency
-        });
-        l2cache->MemReqPort.bind(mem_req_ports.at(i));
-        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
-
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
-          cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
-        }
-      } else {
-        auto& l2_mem_switch = l2_mem_switches_.at(i);
-        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
-        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
-        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
-
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
-          cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
-        }
-      }
-
-      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-        auto& core = cores_.at((i * cores_per_cluster) + j);
-        core->MemReqPort.bind(cluster_mem_req_ports.at(j));
-        cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
-      }
-    }
+  // create clusters
+  for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
+    clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
+    // connect L3 core ports
+    clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i));
+    l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
  }

-  ~Impl() {
-    SimPlatform::instance().finalize();
-  }
+  // set up memory perf recording
+  memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
+    __unused (cycle);
+    perf_mem_reads_   += !req.write;
+    perf_mem_writes_  += req.write;
+    perf_mem_pending_reads_ += !req.write;
+  });
+  memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
+    __unused (cycle);
+    --perf_mem_pending_reads_;
+  });

-  void attach_ram(RAM* ram) {
-    for (auto core : cores_) {
-      core->attach_ram(ram);
-    }
-  }
+  this->reset();
+}

-  int run() {
-    SimPlatform::instance().reset();
-    bool running;
-    int exitcode = 0;
-    do {
-      SimPlatform::instance().tick();
-      running = false;
-      for (auto& core : cores_) {
-        if (core->running()) {
-          running = true;
-        }
-        if (core->check_exit()) {
-          exitcode = core->getIRegValue(3);
-          running = false;
-          break;
+ProcessorImpl::~ProcessorImpl() { // counterpart of the SimPlatform::instance().initialize() call in the constructor
+  SimPlatform::instance().finalize();
+}
+
+void ProcessorImpl::attach_ram(RAM* ram) { // hands the same RAM pointer to every cluster
+  for (const auto& cluster : clusters_) { // bind by reference: no shared_ptr copy (refcount inc/dec) per cluster
+    cluster->attach_ram(ram);
+  }
+}
+
+int ProcessorImpl::run(bool riscv_test) { // ticks the platform until no running cluster remains; returns the OR of reported exit codes
+  SimPlatform::instance().reset();
+  this->reset(); // clear the memory perf counters for this run
+  
+  bool done;
+  Word exitcode = 0;
+  do {
+    SimPlatform::instance().tick();
+    done = true; // assume finished; any running cluster that has not exited clears this below
+    for (const auto& cluster : clusters_) { // by reference: this loop runs every tick, avoid a shared_ptr copy each time
+      if (cluster->running()) {
+        Word ec = 0; // initialized so exitcode never ORs an indeterminate value (assumes check_exit fills it on success)
+        if (cluster->check_exit(&ec, riscv_test)) {
+          exitcode |= ec;
+        } else {
+          done = false;
         }
       }
-    } while (running);
+    }
+    perf_mem_latency_ += perf_mem_pending_reads_; // adds the current outstanding-read count once per tick
+  } while (!done);

-    return exitcode;
-  }
-};
+  return exitcode; // Word is converted to the int return type here
+}
+ 
+void ProcessorImpl::reset() { // zeroes the memory perf counters; called from the constructor and at the start of run()
+  perf_mem_reads_ = 0;
+  perf_mem_writes_ = 0;
+  perf_mem_latency_ = 0;
+  perf_mem_pending_reads_ = 0; // in-flight read count used by run() to accumulate latency
+}
+
+void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) { // stores value at addr in the processor-owned DCRS object
+  dcrs_.write(addr, value); // dcrs_ is the same object handed to every Cluster::Create call in the constructor
+}
+
+ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { // snapshot of memory, L3 and summed per-cluster counters
+  ProcessorImpl::PerfStats perf;
+  perf.mem_reads   = perf_mem_reads_;
+  perf.mem_writes  = perf_mem_writes_;
+  perf.mem_latency = perf_mem_latency_;
+  perf.l3cache     = l3cache_->perf_stats();
+  for (const auto& cluster : clusters_) { // by reference: avoids copying each shared_ptr just to read its stats
+    perf.clusters += cluster->perf_stats();
+  }   
+  return perf;
+}

 ///////////////////////////////////////////////////////////////////////////////

-Processor::Processor(const ArchDef& arch) 
-  : impl_(new Impl(arch))
+Processor::Processor(const Arch& arch) // builds the implementation object for the given architecture
+  : impl_(new ProcessorImpl(arch)) // raw new; NOTE(review): release is expected in ~Processor, whose body is not shown in this hunk
 {}

 Processor::~Processor() {
@@ -173,6 +146,10 @@ void Processor::attach_ram(RAM* mem) {
  impl_->attach_ram(mem);
 }

-int Processor::run() {
-  return impl_->run();
+int Processor::run(bool riscv_test) { // forwards to ProcessorImpl::run and returns its exit code
+  return impl_->run(riscv_test);
+}
+
+void Processor::write_dcr(uint32_t addr, uint32_t value) { // forwards to ProcessorImpl::write_dcr
+  return impl_->write_dcr(addr, value); // returning a void expression is legal; nothing is actually returned
 }
--- a/sim/simx/processor.h
+++ b/sim/simx/processor.h
@@ -1,22 +1,39 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

+#include <stdint.h>
+
 namespace vortex {

-class ArchDef;
+class Arch;
 class RAM;
+class ProcessorImpl;

 class Processor {
 public:
-  Processor(const ArchDef& arch);
+  Processor(const Arch& arch); // allocates the ProcessorImpl for this architecture
   ~Processor();

   void attach_ram(RAM* mem);

-  int run();
+  int run(bool riscv_test); // runs the simulation; riscv_test is passed through to the implementation
+
+  void write_dcr(uint32_t addr, uint32_t value); // writes one DCR value through the implementation

 private:
-  class Impl;
-  Impl* impl_;
+  ProcessorImpl* impl_; // raw pointer to the implementation; NOTE(review): copying a Processor would alias it - confirm no copies are made
 };

-}
+}
--- a/sim/simx/processor_impl.h
+++ b/sim/simx/processor_impl.h
@@ -0,0 +1,66 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "mem_sim.h"
+#include "cache_sim.h"
+#include "constants.h"
+#include "dcrs.h"
+#include "cluster.h"
+
+namespace vortex {
+
+class ProcessorImpl { // implementation behind Processor: clusters, an L3 cache and the memory simulator
+public:
+  struct PerfStats { // value snapshot returned by perf_stats()
+    uint64_t mem_reads;   // non-write requests counted in the memsim request-port tx callback
+    uint64_t mem_writes;  // write requests counted in the same callback
+    uint64_t mem_latency; // sum over ticks of the outstanding-read count (accumulated in run())
+    CacheSim::PerfStats l3cache;
+    Cluster::PerfStats clusters; // sum over all clusters (built with += in perf_stats())
+
+    PerfStats()
+      : mem_reads(0)
+      , mem_writes(0)
+      , mem_latency(0)
+    {}
+  };
+
+  ProcessorImpl(const Arch& arch); // arch is kept by reference, so it must outlive this object
+  ~ProcessorImpl();
+
+  void attach_ram(RAM* mem); // passes the RAM pointer to every cluster
+
+  int run(bool riscv_test); // ticks until all running clusters report exit; returns OR-ed exit codes
+
+  void write_dcr(uint32_t addr, uint32_t value); // writes into dcrs_
+
+  ProcessorImpl::PerfStats perf_stats() const;
+
+private:
+ 
+  void reset(); // zeroes the perf_mem_* counters below
+
+  const Arch& arch_;
+  std::vector<std::shared_ptr<Cluster>> clusters_; // one entry per arch.num_clusters()
+  DCRS dcrs_; // passed to each Cluster::Create call
+  MemSim::Ptr   memsim_;
+  CacheSim::Ptr l3cache_; // sits between the clusters' memory ports and memsim_
+  uint64_t perf_mem_reads_;
+  uint64_t perf_mem_writes_;
+  uint64_t perf_mem_latency_;
+  uint64_t perf_mem_pending_reads_; // ++ per read request, -- per memsim response
+};
+
+}
--- a/sim/simx/scoreboard.h
+++ b/sim/simx/scoreboard.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "pipeline.h"
@@ -6,20 +19,15 @@
 namespace vortex {

 class Scoreboard {
-private:
+public:
+
    struct reg_use_t {
        RegType  type;
        uint32_t reg;        
        uint64_t owner;
    };
-
-    std::vector<RegMask> in_use_iregs_;
-    std::vector<RegMask> in_use_fregs_;
-    std::vector<RegMask> in_use_vregs_;
-    std::unordered_map<uint32_t, uint64_t> owners_; 
-
-public:    
-    Scoreboard(const ArchDef &arch) 
+        
+    Scoreboard(const Arch &arch) 
        : in_use_iregs_(arch.num_warps())
        , in_use_fregs_(arch.num_warps())
        , in_use_vregs_(arch.num_warps())
@@ -84,8 +92,7 @@ public:
    }
    
    void reserve(pipeline_trace_t* state) {
-        if (!state->wb)
-            return;  
+        assert(state->wb);  
        switch (state->rdest_type) {
        case RegType::Integer:            
            in_use_iregs_.at(state->wid).set(state->rdest);
@@ -105,8 +112,7 @@ public:
    }

    void release(pipeline_trace_t* state) {
-        if (!state->wb)
-            return;       
+        assert(state->wb);      
        switch (state->rdest_type) {
        case RegType::Integer:
            in_use_iregs_.at(state->wid).reset(state->rdest);
@@ -123,6 +129,13 @@ public:
        uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
        owners_.erase(tag);
    }
+
+private:
+
+    std::vector<RegMask> in_use_iregs_;
+    std::vector<RegMask> in_use_fregs_;
+    std::vector<RegMask> in_use_vregs_;
+    std::unordered_map<uint32_t, uint64_t> owners_;
 };

 }
--- a/sim/simx/shared_mem.cpp
+++ b/sim/simx/shared_mem.cpp
@@ -0,0 +1,138 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "shared_mem.h"
+#include "core.h"
+#include <bitmanip.h>
+#include <vector>
+#include "types.h"
+
+using namespace vortex;
+
+class SharedMem::Impl { // private implementation of SharedMem: backing RAM plus a per-tick bank-conflict model
+protected:
+    SharedMem* simobject_; // owning SimObject; its Inputs/Outputs ports are used in tick()
+    Config    config_;
+    RAM       ram_;
+    uint32_t  bank_sel_addr_start_; // lowest address bit used to select a bank
+    uint32_t  bank_sel_addr_end_;   // highest address bit used to select a bank (only meaningful when num_banks > 1)
+    PerfStats perf_stats_;
+
+    uint64_t to_local_addr(uint64_t addr) { // keeps only the low log2ceil(capacity / line_size) bits of addr
+        uint32_t total_lines = config_.capacity / config_.line_size;        
+        uint32_t line_bits = log2ceil(total_lines); // NOTE(review): mask width is derived from the line count, not from capacity - confirm intended
+        uint32_t offset = bit_getw(addr, 0, line_bits-1);
+        return offset;
+    }
+
+public:
+    Impl(SharedMem* simobject, const Config& config) 
+        : simobject_(simobject)
+        , config_(config)
+        , ram_(config.capacity, config.capacity)
+        , bank_sel_addr_start_(0)
+        , bank_sel_addr_end_((config.num_banks > 1) ? (log2ceil(config.num_banks) - 1) : 0) // no unsigned wrap to UINT32_MAX when num_banks == 1
+    {}    
+    
+    virtual ~Impl() {}
+
+    void reset() { // clears the perf counters only; RAM contents are left as they are
+        perf_stats_ = PerfStats();
+    }
+
+    void read(void* data, uint64_t addr, uint32_t size) { // functional read from the backing RAM at the masked address
+        auto s_addr = to_local_addr(addr);        
+        DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
+        ram_.read(data, s_addr, size);
+    }
+
+    void write(const void* data, uint64_t addr, uint32_t size) { // functional write to the backing RAM at the masked address
+        auto s_addr = to_local_addr(addr);        
+        DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
+        ram_.write(data, s_addr, size);
+    }
+
+    void tick() { // services at most one request per bank; conflicting requests stay queued for a later tick
+        std::vector<bool> in_used_banks(config_.num_banks);
+        for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
+            auto& core_req_port = simobject_->Inputs.at(req_id);            
+            if (core_req_port.empty())
+                continue;
+
+            auto& core_req = core_req_port.front();
+
+            uint32_t bank_id = 0;
+            if (config_.num_banks > 1) { // a single bank has no select bits; the old unsigned "start <= end" test was always true with start == 0
+                bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
+            }
+
+            // bank conflict check
+            if (in_used_banks.at(bank_id)) {
+                ++perf_stats_.bank_stalls;
+                continue;
+            }
+
+            in_used_banks.at(bank_id) = true;
+
+            if (!core_req.write || config_.write_reponse) {
+                // send response
+                MemRsp core_rsp{core_req.tag, core_req.cid};
+                simobject_->Outputs.at(req_id).send(core_rsp, 1);
+            }
+
+            // update perf counters
+            perf_stats_.reads += !core_req.write;            
+            perf_stats_.writes += core_req.write;
+
+            // remove input
+            core_req_port.pop(); // last step: core_req refers to the queue front and must not be used after this
+        }
+    }
+
+    const PerfStats& perf_stats() const { 
+        return perf_stats_; 
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+SharedMem::SharedMem(const SimContext& ctx, const char* name, const Config& config) // creates num_reqs request/response port pairs and the Impl
+    : SimObject<SharedMem>(ctx, name)   
+    , Inputs(config.num_reqs, this)
+    , Outputs(config.num_reqs, this)
+    , impl_(new Impl(this, config)) // released in ~SharedMem
+{}
+
+SharedMem::~SharedMem() { // frees the Impl allocated in the constructor
+    delete impl_;
+}
+
+void SharedMem::reset() { // forwards to Impl::reset
+    impl_->reset();
+}
+
+void SharedMem::read(void* data, uint64_t addr, uint32_t size) { // forwards to Impl::read (size bytes into data)
+    impl_->read(data, addr, size);
+}
+
+void SharedMem::write(const void* data, uint64_t addr, uint32_t size) { // forwards to Impl::write (size bytes from data)
+    impl_->write(data, addr, size);
+}
+
+void SharedMem::tick() { // forwards to Impl::tick, which services the queued port requests
+    impl_->tick();
+}
+
+const SharedMem::PerfStats& SharedMem::perf_stats() const { // reference into the Impl; valid only while this SharedMem is alive
+    return impl_->perf_stats();
+}
--- a/sim/simx/shared_mem.h
+++ b/sim/simx/shared_mem.h
@@ -0,0 +1,72 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "types.h"
+
+namespace vortex {
+
+class SharedMem : public SimObject<SharedMem> { // banked shared-memory simulation object; implementation lives in shared_mem.cpp
+public:
+  struct Config {
+    uint32_t capacity;   // passed to the backing RAM constructor; NOTE(review): presumably bytes - confirm against callers
+    uint32_t line_size;  // capacity / line_size gives the line count used to mask addresses
+    uint32_t num_reqs;   // number of Inputs/Outputs port pairs
+    uint32_t num_banks;  // number of banks tracked for conflicts in each tick
+    bool write_reponse;  // when true, writes also get a MemRsp (field name is part of the public Config, spelling kept)
+  };
+
+  struct PerfStats {
+    uint64_t reads;       // serviced non-write requests
+    uint64_t writes;      // serviced write requests
+    uint64_t bank_stalls; // requests skipped in a tick because their bank was already used
+
+    PerfStats() 
+      : reads(0)
+      , writes(0)
+      , bank_stalls(0)
+    {}
+
+    PerfStats& operator+=(const PerfStats& rhs) { // field-wise accumulation
+      this->reads += rhs.reads;
+      this->writes += rhs.writes;
+      this->bank_stalls += rhs.bank_stalls;
+      return *this;
+    }
+  };
+
+  std::vector<SimPort<MemReq>> Inputs;  // request ports, one per requester
+  std::vector<SimPort<MemRsp>> Outputs; // response ports, same index as the matching request port
+
+  SharedMem(const SimContext& ctx, const char* name, const Config& config);    // allocates the Impl
+  virtual ~SharedMem(); // deletes the Impl
+
+  void reset(); // clears the perf counters
+
+  void read(void* data, uint64_t addr, uint32_t size); // functional read from the backing RAM
+
+  void write(const void* data, uint64_t addr, uint32_t size); // functional write to the backing RAM
+
+  void tick(); // services queued requests, at most one per bank
+
+  const PerfStats& perf_stats() const;
+
+protected:
+
+  class Impl;
+  Impl* impl_; // owned: created in the constructor, deleted in the destructor
+};
+
+}
--- a/sim/simx/sharedmem.h
+++ b/sim/simx/sharedmem.h
@@ -1,96 +0,0 @@
-#pragma once
-
-#include <simobject.h>
-#include <bitmanip.h>
-#include <vector>
-#include "types.h"
-
-namespace vortex {
-
-class Core;
-
-class SharedMem : public SimObject<SharedMem> {
-public:
-    struct Config {
-        uint32_t num_reqs;
-        uint32_t num_banks; 
-        uint32_t bank_offset;
-        uint32_t latency;
-        bool     write_reponse;
-    };
-
-    struct PerfStats {
-        uint64_t reads;
-        uint64_t writes;
-        uint64_t bank_stalls;
-
-        PerfStats() 
-            : reads(0)
-            , writes(0)
-            , bank_stalls(0)
-        {}
-    };
-
-    std::vector<SimPort<MemReq>> Inputs;
-    std::vector<SimPort<MemRsp>> Outputs;
-
-    SharedMem(const SimContext& ctx, const char* name, const Config& config) 
-        : SimObject<SharedMem>(ctx, name)
-        , Inputs(config.num_reqs, this)
-        , Outputs(config.num_reqs, this)
-        , config_(config)
-        , bank_sel_addr_start_(config.bank_offset)
-        , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
-    {}    
-    
-    virtual ~SharedMem() {}
-
-    void reset() {
-        perf_stats_ = PerfStats();
-    }
-
-    void tick() {
-        std::vector<bool> in_used_banks(config_.num_banks);
-        for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
-            auto& core_req_port = this->Inputs.at(req_id);            
-            if (core_req_port.empty())
-                continue;
-
-            auto& core_req = core_req_port.front();
-
-            uint32_t bank_id = (uint32_t)bit_getw(
-                core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
-
-            // bank conflict check
-            if (in_used_banks.at(bank_id))
-                continue;
-
-            in_used_banks.at(bank_id) = true;
-
-            if (!core_req.write || config_.write_reponse) {
-                // send response
-                MemRsp core_rsp{core_req.tag, core_req.core_id};
-                this->Outputs.at(req_id).send(core_rsp, 1);
-            }
-
-            // update perf counters
-            perf_stats_.reads += !core_req.write;            
-            perf_stats_.writes += core_req.write;
-
-            // remove input
-            core_req_port.pop();
-        }
-    }
-
-    const PerfStats& perf_stats() const { 
-        return perf_stats_; 
-    }
-
-protected:
-    Config    config_;
-    uint32_t  bank_sel_addr_start_;
-    uint32_t  bank_sel_addr_end_;
-    PerfStats perf_stats_;
-};
-
-}
--- a/sim/simx/tex_unit.cpp
+++ b/sim/simx/tex_unit.cpp
@@ -1,100 +0,0 @@
-#include "tex_unit.h"
-#include "core.h"
-#include <texturing.h>
-#include <VX_config.h>
-
-using namespace vortex;
-using namespace cocogfx;
-
-enum class FilterMode {
-  Point,
-  Bilinear,
-  Trilinear,
-};
-
-TexUnit::TexUnit(Core* core) : core_(core) {}
-
-TexUnit::~TexUnit() {}
-
-void TexUnit::clear() {
-  for (auto& state : states_) {
-    state = 0;
-  }
-}
-
-uint32_t TexUnit::get_state(uint32_t state) {
-  return states_.at(state);
-}
-  
-void TexUnit::set_state(uint32_t state, uint32_t value) {
-  states_.at(state) = value;
-}
-
-uint32_t TexUnit::read(int32_t u, 
-                       int32_t v, 
-                       int32_t lod, 
-                       std::vector<mem_addr_size_t>* mem_addrs) {
-  //--
-  auto xu = Fixed<TEX_FXD_FRAC>::make(u);
-  auto xv = Fixed<TEX_FXD_FRAC>::make(v);
-  uint32_t base_addr  = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
-  uint32_t log_width  = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
-  uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
-  auto format         = (TexFormat)states_.at(TEX_STATE_FORMAT);    
-  auto filter         = (FilterMode)states_.at(TEX_STATE_FILTER);    
-  auto wrapu          = (WrapMode)states_.at(TEX_STATE_WRAPU);
-  auto wrapv          = (WrapMode)states_.at(TEX_STATE_WRAPV);
-
-  auto stride = Stride(format);
-  
-  switch (filter) {
-  case FilterMode::Bilinear: {
-    // addressing
-    uint32_t offset00, offset01, offset10, offset11;
-    uint32_t alpha, beta;
-    TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, 
-      &offset00, &offset01, &offset10, &offset11, &alpha, &beta);
-
-    uint32_t addr00 = base_addr + offset00 * stride;
-    uint32_t addr01 = base_addr + offset01 * stride;
-    uint32_t addr10 = base_addr + offset10 * stride;
-    uint32_t addr11 = base_addr + offset11 * stride;
-
-    // memory lookup
-    uint32_t texel00(0), texel01(0), texel10(0), texel11(0);
-    core_->dcache_read(&texel00, addr00, stride);
-    core_->dcache_read(&texel01, addr01, stride);
-    core_->dcache_read(&texel10, addr10, stride);
-    core_->dcache_read(&texel11, addr11, stride);
-
-    mem_addrs->push_back({addr00, stride});
-    mem_addrs->push_back({addr01, stride});
-    mem_addrs->push_back({addr10, stride});
-    mem_addrs->push_back({addr11, stride});
-
-    // filtering
-    auto color = TexFilterLinear(
-      format, texel00, texel01, texel10, texel11, alpha, beta);
-    return color;
-  }
-  case FilterMode::Point: {
-    // addressing
-    uint32_t offset;
-    TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
-    
-    uint32_t addr = base_addr + offset * stride;
-
-    // memory lookup
-    uint32_t texel(0);
-    core_->dcache_read(&texel, addr, stride);
-    mem_addrs->push_back({addr, stride});
-
-    // filtering
-    auto color = TexFilterPoint(format, texel);
-    return color;
-  }
-  default:
-    std::abort();
-    return 0;
-  }
-}
--- a/sim/simx/tex_unit.h
+++ b/sim/simx/tex_unit.h
@@ -1,28 +0,0 @@
-#pragma once
-
-#include "types.h"
-
-namespace vortex {
-
-class Core;
-
-class TexUnit {
-public:
-    TexUnit(Core* core);
-    ~TexUnit();
-
-    void clear();
-
-    uint32_t get_state(uint32_t state);
-  
-    void set_state(uint32_t state, uint32_t value);
-
-    uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
-
-private:
-
-    std::array<uint32_t, NUM_TEX_STATES> states_;
-    Core* core_;
-};
-
-}
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <stdint.h>
@@ -5,31 +18,42 @@
 #include <queue>
 #include <unordered_map>
 #include <util.h>
+#include <stringutil.h>
 #include <VX_config.h>
 #include <simobject.h>
+#include "uuid_gen.h"
+#include "debug.h"

 namespace vortex {

 typedef uint8_t Byte;
-#if XLEN == 32
+#if (XLEN == 32)
 typedef uint32_t Word;
 typedef int32_t  WordI;
 typedef uint64_t DWord;
 typedef int64_t  DWordI;
-#elif XLEN == 64
+typedef uint32_t WordF;
+#elif (XLEN == 64)
 typedef uint64_t Word;
 typedef int64_t  WordI;
 typedef __uint128_t DWord;
 typedef __int128_t DWordI;
+typedef uint64_t WordF;
 #else
 #error unsupported XLEN
 #endif

-typedef uint64_t FWord;
+#define MAX_NUM_CORES   1024
+#define MAX_NUM_THREADS 32
+#define MAX_NUM_WARPS   32
+#define MAX_NUM_REGS    32

-typedef std::bitset<32> RegMask;
-typedef std::bitset<32> ThreadMask;
-typedef std::bitset<32> WarpMask;
+typedef std::bitset<MAX_NUM_CORES>   CoreMask;
+typedef std::bitset<MAX_NUM_REGS>    RegMask;
+typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
+typedef std::bitset<MAX_NUM_WARPS>   WarpMask;
+
+typedef std::unordered_map<uint32_t, uint32_t> CSRs;

 ///////////////////////////////////////////////////////////////////////////////

@@ -40,8 +64,8 @@ enum class RegType {
  Vector
 };

-inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
-  switch (clss) {
+inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
+  switch (type) {
  case RegType::None: break;
  case RegType::Integer: os << "x"; break;  
  case RegType::Float:   os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
 ///////////////////////////////////////////////////////////////////////////////

 enum class ExeType {
-  NOP,
   ALU,
   LSU,
-  CSR,
   FPU,
-  GPU,
+  SFU, // takes the place of GPU; the NOP and CSR entries are dropped in this change
   MAX,
 };

 inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
  switch (type) {
-  case ExeType::NOP: os << "NOP"; break;
  case ExeType::ALU: os << "ALU"; break;
  case ExeType::LSU: os << "LSU"; break;
-  case ExeType::CSR: os << "CSR"; break;
  case ExeType::FPU: os << "FPU"; break;
-  case ExeType::GPU: os << "GPU"; break;
+  case ExeType::SFU: os << "SFU"; break;
  case ExeType::MAX: break;
  }
  return os;
@@ -82,8 +102,7 @@ enum class AluType {
  BRANCH,
  SYSCALL,
  IMUL,
-  IDIV,    
-  CMOV,
+  IDIV
 };

 inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
  case AluType::SYSCALL: os << "SYSCALL"; break;
  case AluType::IMUL:    os << "IMUL"; break;
  case AluType::IDIV:    os << "IDIV"; break;
-  case AluType::CMOV:    os << "CMOV"; break;
  }
  return os;
 }
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
 enum class LsuType {
  LOAD,
  STORE,
-  FENCE,
-  PREFETCH,    
+  FENCE
 };

 inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
  switch (type) {
-  case LsuType::LOAD:     os << "LOAD"; break;
-  case LsuType::STORE:    os << "STORE"; break;
-  case LsuType::FENCE:    os << "FENCE"; break;
-  case LsuType::PREFETCH: os << "PREFETCH"; break;
+  case LsuType::LOAD:  os << "LOAD"; break;
+  case LsuType::STORE: os << "STORE"; break;
+  case LsuType::FENCE: os << "FENCE"; break;
  }
  return os;
 }
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
  uint32_t size;
 };

-inline AddrType get_addr_type(Word addr, uint32_t size) {
-  __unused (size);
-  if (SM_ENABLE) {
-    if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
-    &&  addr < SMEM_BASE_ADDR) {      
-      assert((addr + size) <= SMEM_BASE_ADDR);
-      return AddrType::Shared;
-    }
-  }
-  if (addr >= IO_BASE_ADDR) {
-     return AddrType::IO;
-  }
-  return AddrType::Global;
-}
-
 ///////////////////////////////////////////////////////////////////////////////

 enum class FpuType {
@@ -179,23 +180,31 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {

 ///////////////////////////////////////////////////////////////////////////////

-enum class GpuType {
+enum class SfuType {
  TMC,
  WSPAWN,
  SPLIT,
  JOIN,
  BAR,
-  TEX,
+  PRED,
+  CSRRW,
+  CSRRS,
+  CSRRC,
+  CMOV  
 };

-inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
+inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
  switch (type) {
-  case GpuType::TMC:    os << "TMC"; break;
-  case GpuType::WSPAWN: os << "WSPAWN"; break;
-  case GpuType::SPLIT:  os << "SPLIT"; break;
-  case GpuType::JOIN:   os << "JOIN"; break;
-  case GpuType::BAR:    os << "BAR"; break;
-  case GpuType::TEX:    os << "TEX"; break;
+  case SfuType::TMC:    os << "TMC"; break;
+  case SfuType::WSPAWN: os << "WSPAWN"; break;
+  case SfuType::SPLIT:  os << "SPLIT"; break;
+  case SfuType::JOIN:   os << "JOIN"; break;
+  case SfuType::BAR:    os << "BAR"; break;
+  case SfuType::PRED:   os << "PRED"; break;
+  case SfuType::CSRRW:  os << "CSRRW"; break;
+  case SfuType::CSRRS:  os << "CSRRS"; break;
+  case SfuType::CSRRC:  os << "CSRRC"; break;
+  case SfuType::CMOV:   os << "CMOV"; break;
  }
  return os;
 }
@@ -218,31 +227,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
 ///////////////////////////////////////////////////////////////////////////////

 struct MemReq {
-    uint64_t addr;
-    bool write;
-    bool non_cacheable;
-    uint32_t tag;
-    uint32_t core_id;    
-    uint64_t uuid;
+  uint64_t addr;
+  bool write;
+  AddrType type;
+  uint32_t tag;
+  uint32_t cid;    
+  uint64_t uuid;

-    MemReq(uint64_t _addr = 0, 
-           bool _write = false,
-           bool _non_cacheable = false,
-           uint64_t _tag = 0, 
-           uint32_t _core_id = 0,
-           uint64_t _uuid = 0
-    )   : addr(_addr)
-        , write(_write)
-        , non_cacheable(_non_cacheable)
-        , tag(_tag)
-        , core_id(_core_id)
-        , uuid(_uuid)
-    {}
+  MemReq(uint64_t _addr = 0, 
+          bool _write = false,
+          AddrType _type = AddrType::Global,
+          uint64_t _tag = 0, 
+          uint32_t _cid = 0,
+          uint64_t _uuid = 0
+  ) : addr(_addr)
+    , write(_write)
+    , type(_type)
+    , tag(_tag)
+    , cid(_cid)
+    , uuid(_uuid)
+  {}
 };

 inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
  os << "mem-" << (req.write ? "wr" : "rd") << ": ";
-  os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
+  os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
+  os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
  os << " (#" << std::dec << req.uuid << ")";
  return os;
 }
@@ -250,18 +260,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
 ///////////////////////////////////////////////////////////////////////////////

 struct MemRsp {
-    uint64_t tag;    
-    uint32_t core_id;
-    uint64_t uuid;
-    MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
-      : tag (_tag) 
-      , core_id(_core_id)
-      , uuid(_uuid)
-    {}
+  uint64_t tag;    
+  uint32_t cid;
+  uint64_t uuid;
+  
+  MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
+    : tag (_tag) 
+    , cid(_cid)
+    , uuid(_uuid)
+  {}
 };

 inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
-  os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
+  os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid;
  os << " (#" << std::dec << rsp.uuid << ")";
  return os;
 }
@@ -270,10 +281,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {

 template <typename T>
 class HashTable {
-private:
-  std::vector<std::pair<bool, T>> entries_;
-  uint32_t size_;
-
 public:    
  HashTable(uint32_t capacity)
    : entries_(capacity)
@@ -336,92 +343,180 @@ public:
    }
    size_ = 0;
  }
+
+private:
+  std::vector<std::pair<bool, T>> entries_;
+  uint32_t size_;
 };

 ///////////////////////////////////////////////////////////////////////////////

-template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
+template <typename Req, typename Rsp>
 class Switch : public SimObject<Switch<Req, Rsp>> {
-private:
-  ArbiterType type_;
-  uint32_t delay_;  
-  uint32_t cursor_;
-  uint32_t tag_shift_;
-
 public:
+  std::vector<SimPort<Req>>  ReqIn;
+  std::vector<SimPort<Rsp>>  RspIn;
+
+  std::vector<SimPort<Req>>  ReqOut;  
+  std::vector<SimPort<Rsp>>  RspOut;
+
  Switch(
    const SimContext& ctx, 
    const char* name, 
    ArbiterType type, 
-    uint32_t num_inputs, 
+    uint32_t num_inputs = 1, 
+    uint32_t num_outputs = 1,
    uint32_t delay = 1
  ) 
-    : SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)    
+    : SimObject<Switch<Req, Rsp>>(ctx, name)    
+    , ReqIn(num_inputs,   this)
+    , RspIn(num_inputs,   this)
+    , ReqOut(num_outputs, this)    
+    , RspOut(num_outputs, this)
    , type_(type)
    , delay_(delay)
-    , cursor_(0)
-    , tag_shift_(log2ceil(num_inputs))
-    , ReqIn(num_inputs, this)
-    , ReqOut(this)
-    , RspIn(this)    
-    , RspOut(num_inputs, this)
+    , cursors_(num_outputs, 0)
+    , lg_num_reqs_(log2ceil(num_inputs / num_outputs))
  {
-    assert(delay_ != 0);
-    assert(num_inputs <= MaxInputs);
-    if (num_inputs == 1) {
-      // bypass
-      ReqIn.at(0).bind(&ReqOut);
-      RspIn.bind(&RspOut.at(0));
+    assert(delay != 0);    
+    assert(num_inputs <= 32);
+    assert(num_outputs <= 32);
+    assert(num_inputs >= num_outputs);
+
+    if (num_inputs == num_outputs) {
+      // bypass mode
+      for (uint32_t i = 0; i < num_inputs; ++i) {
+        ReqIn.at(i).bind(&ReqOut.at(i));
+        RspOut.at(i).bind(&RspIn.at(i));
+      }
    }
  }

  void reset() {
-    cursor_ = 0;
+    for (auto& cursor : cursors_) {
+      cursor = 0;
+    }
  }

-  void tick() {  
-    if (ReqIn.size() == 1)
+  void tick() {
+    uint32_t I = ReqIn.size();
+    uint32_t O = ReqOut.size();
+    uint32_t R = 1 << lg_num_reqs_;
+
+    // skip bypass mode
+    if (I == O)
      return;
        
-    // process incomming requests    
-    for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {      
-      uint32_t j = (cursor_ + i) % n;
-      auto& req_in = ReqIn.at(j);      
-      if (!req_in.empty()) {
-        auto& req = req_in.front();
-        if (tag_shift_) {
-          req.tag = (req.tag << tag_shift_) | j;
+    // process incoming requests
+    for (uint32_t o = 0; o < O; ++o) {
+      for (uint32_t r = 0; r < R; ++r) {
+        uint32_t i = (cursors_.at(o) + r) & (R-1);
+        uint32_t j = o * R + i;
+        if (j >= I)
+          continue;
+        
+        auto& req_in = ReqIn.at(j);
+        if (!req_in.empty()) {
+          auto& req = req_in.front();
+          if (lg_num_reqs_ != 0) {
+            req.tag = (req.tag << lg_num_reqs_) | i;
+          }
+          DT(4, this->name() << "-" << req);
+          ReqOut.at(o).send(req, delay_);                
+          req_in.pop();
+          this->update_cursor(o, i);
+          break;
        }
-        ReqOut.send(req, delay_);                
-        req_in.pop();
-        this->update_cursor(j);
-        break;
      }
-    } 
-
-    // process incoming reponses
-    if (!RspIn.empty()) {
-      auto& rsp = RspIn.front();    
-      uint32_t port_id = 0;
-      if (tag_shift_) {
-        port_id = rsp.tag & ((1 << tag_shift_)-1);
-        rsp.tag >>= tag_shift_;
-      }      
-      RspOut.at(port_id).send(rsp, 1);
-      RspIn.pop();
+      
+      // process incoming responses
+      if (!RspOut.at(o).empty()) {
+        auto& rsp = RspOut.at(o).front();
+        uint32_t i = 0;
+        if (lg_num_reqs_ != 0) {
+          i = rsp.tag & (R-1);
+          rsp.tag >>= lg_num_reqs_;
+        }      
+        DT(4, this->name() << "-" << rsp);
+        uint32_t j = o * R + i;
+        RspIn.at(j).send(rsp, 1);      
+        RspOut.at(o).pop();
+      }
    }
  }

-  void update_cursor(uint32_t grant) {
+  void update_cursor(uint32_t index, uint32_t grant) {
    if (type_ == ArbiterType::RoundRobin) {
-      cursor_ = grant + 1;
+      cursors_.at(index) = grant + 1;
    }
  }

-  std::vector<SimPort<Req>>  ReqIn;
-  SimPort<Req>              ReqOut;
-  SimPort<Rsp>               RspIn;    
-  std::vector<SimPort<Rsp>> RspOut;
+private:
+  ArbiterType type_;
+  uint32_t delay_;  
+  std::vector<uint32_t> cursors_;
+  uint32_t lg_num_reqs_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class SMemDemux : public SimObject<SMemDemux> {
+public:
+  SimPort<MemReq>  ReqIn;
+  SimPort<MemRsp>  RspIn;
+
+  SimPort<MemReq>  ReqSm;
+  SimPort<MemRsp>  RspSm;
+
+  SimPort<MemReq>  ReqDc;
+  SimPort<MemRsp>  RspDc;
+
+  SMemDemux(
+    const SimContext& ctx, 
+    const char* name, 
+    uint32_t delay = 1
+  ) : SimObject<SMemDemux>(ctx, name)    
+    , ReqIn(this)
+    , RspIn(this)
+    , ReqSm(this)
+    , RspSm(this)
+    , ReqDc(this)
+    , RspDc(this)
+    , delay_(delay)
+  {}
+
+  void reset() {}
+
+  void tick() {
+    // process incoming requests
+    if (!ReqIn.empty()) {
+      auto& req = ReqIn.front();
+      DT(4, this->name() << "-" << req);
+      if (req.type == AddrType::Shared) {
+        ReqSm.send(req, delay_);
+      } else {
+        ReqDc.send(req, delay_);
+      }
+      ReqIn.pop();
+    }   
+      
+    // process incoming responses
+    if (!RspSm.empty()) {
+      auto& rsp = RspSm.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspSm.pop();
+    }
+    if (!RspDc.empty()) {
+      auto& rsp = RspDc.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspDc.pop();
+    }
+  }
+
+private:
+  uint32_t delay_;
 };

 }
--- a/sim/simx/warp.cpp
+++ b/sim/simx/warp.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,21 +23,25 @@

 using namespace vortex;

-Warp::Warp(Core *core, uint32_t id)
-    : id_(id)
+Warp::Warp(Core *core, uint32_t warp_id)
+    : warp_id_(warp_id)
+    , arch_(core->arch())
    , core_(core)
    , ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
-    , freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
+    , freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs()))
    , vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
 {
-  this->clear();
+  this->reset();
 }

-void Warp::clear() {
-  active_ = false;
-  PC_ = STARTUP_ADDR;
+void Warp::reset() {
+  PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
+#if (XLEN == 64)
+  PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
+#endif
  tmask_.reset();  
-  for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i) {
+  issued_instrs_ = 0;
+  for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
    for (auto& reg : ireg_file_.at(i)) {
      reg = 0;
    }
@@ -35,31 +52,44 @@ void Warp::clear() {
      reg = 0;
    }
  }
+  uui_gen_.reset();
 }

-void Warp::eval(pipeline_trace_t *trace) {
+pipeline_trace_t* Warp::eval() {
  assert(tmask_.any());

-  DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
-  for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i)
-    DPN(2, tmask_.test(n-i-1));
-  DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
-
-  /* Fetch and decode. */    
+#ifndef NDEBUG
+  uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
+  uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
+  uint32_t instr_id  = instr_uuid & 0xffff;
+  uint32_t instr_ref = instr_uuid >> 16;
+  uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
+#else
+  uint64_t uuid = 0;
+#endif
+  
+  DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
+  for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
+    DPN(1, tmask_.test(i));
+  DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);

+  // Fetch
  uint32_t instr_code = 0;
  core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
-  auto instr = core_->decoder().decode(instr_code);
+
+  // Decode
+  auto instr = core_->decoder_.decode(instr_code);
  if (!instr) {
-    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
+    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
    std::abort();
  }  

-  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
+  DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);

-  // Update trace
+  // Create trace
+  auto trace = new pipeline_trace_t(uuid, arch_);
  trace->cid   = core_->id();
-  trace->wid   = id_;
+  trace->wid   = warp_id_;
  trace->PC    = PC_;
  trace->tmask = tmask_;
  trace->rdest = instr->getRDest();
@@ -68,18 +98,20 @@ void Warp::eval(pipeline_trace_t *trace) {
  // Execute
  this->execute(*instr, trace);

-  DP(4, "Register state:");
-  for (uint32_t i = 0; i < core_->arch().num_regs(); ++i) {
-    DPN(4, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
+  DP(5, "Register state:");
+  for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
+    DPN(5, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
    // Integer register file
-    for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
-      DPN(4, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
+    for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
+      DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
    }
-    DPN(4, '|');
+    DPN(5, '|');
    // Floating point register file
-    for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
-      DPN(4, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
+    for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
+      DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
    }
-    DPN(4, std::endl);
+    DPN(5, std::endl);
  }  
+
+  return trace;
 }
--- a/sim/simx/warp.h
+++ b/sim/simx/warp.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #ifndef __WARP_H
 #define __WARP_H

@@ -7,28 +20,26 @@

 namespace vortex {

+class Arch;
 class Core;
 class Instr;
 class pipeline_trace_t;
+
 struct DomStackEntry {
  DomStackEntry(const ThreadMask &tmask, Word PC) 
    : tmask(tmask)
    , PC(PC)
-    , fallThrough(false)
-    , unanimous(false) 
+    , fallthrough(false)
  {}

-  DomStackEntry(const ThreadMask &tmask)
-      : tmask(tmask)
-      , PC(0)
-      , fallThrough(true)
-      , unanimous(false) 
+  DomStackEntry(const ThreadMask &tmask) 
+    : tmask(tmask)
+    , fallthrough(true)
  {}

  ThreadMask tmask;
  Word PC;
-  bool fallThrough;
-  bool unanimous;
+  bool fallthrough;
 };

 struct vtype {
@@ -40,72 +51,58 @@ struct vtype {

 class Warp {
 public:
-  Warp(Core *core, uint32_t id);
+  Warp(Core *core, uint32_t warp_id);

-  void clear();
-  
-  bool active() const {
-    return active_;
-  }
-
-  void suspend() {
-    active_ = false;
-  }
-
-  void activate() {
-    active_ = true;
-  }
-
-  std::size_t getActiveThreads() const {
-    if (active_)
-      return tmask_.count();
-    return 0;
-  }
+  void reset();

  uint32_t id() const {
-    return id_;
+    return warp_id_;
  }

-  uint32_t getPC() const {
+  Word getPC() const {
    return PC_;
  }

-  void setPC(uint32_t PC) {
+  void setPC(Word PC) {
    PC_ = PC;
  }

  void setTmask(size_t index, bool value) {
    tmask_.set(index, value);
-    active_ = tmask_.any();
  }

-  uint32_t getTmask() const {
-    if (active_)
-      return tmask_.to_ulong();
-    return 0;
+  uint64_t getTmask() const {
+    return tmask_.to_ulong();
  }

-  uint32_t getIRegValue(uint32_t reg) const {
+  Word getIRegValue(uint32_t reg) const {
    return ireg_file_.at(0).at(reg);
  }

-  void eval(pipeline_trace_t *);
+  uint64_t incr_instrs() {
+    return issued_instrs_++;
+  }
+
+  pipeline_trace_t* eval();

 private:

  void execute(const Instr &instr, pipeline_trace_t *trace);
+
+  UUIDGenerator uui_gen_;
  
-  uint32_t id_;
+  uint32_t warp_id_;
+  const Arch& arch_;
  Core *core_;
-  bool active_;
+  uint64_t issued_instrs_;
  
  Word PC_;
-  ThreadMask tmask_;  
-  
-  std::vector<std::vector<Word>> ireg_file_;
-  std::vector<std::vector<FWord>> freg_file_;
-  std::vector<std::vector<Byte>> vreg_file_;
-  std::stack<DomStackEntry> dom_stack_;
+  ThreadMask tmask_;
+
+  std::vector<std::vector<Word>>     ireg_file_;
+  std::vector<std::vector<uint64_t>> freg_file_;
+  std::vector<std::vector<Byte>>     vreg_file_;
+  std::stack<DomStackEntry>          ipdom_stack_;

  struct vtype vtype_;
  uint32_t vl_;