Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -1,45 +1,36 @@
+XLEN ?= 32
 DESTDIR ?= .
 RTL_DIR = ../hw/rtl
 THIRD_PARTY_DIR = ../../third_party

-CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
 CXXFLAGS += -I. -I../common -I../../hw
 CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
 CXXFLAGS += -I$(THIRD_PARTY_DIR)
+CXXFLAGS += -DXLEN_$(XLEN)
 CXXFLAGS += $(CONFIGS)

-LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a 
-LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx 
+LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
-SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp
-
-OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
-VPATH := $(sort $(dir $(SRCS)))
-
-#$(info OBJS is $(OBJS))
-#$(info VPATH is $(VPATH))
+SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp

 # Debugigng
 ifdef DEBUG
 	CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
+	#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
 else    
 	CXXFLAGS += -O2 -DNDEBUG
 endif

-# XLEN parameterization
-ifdef XLEN
-	CXXFLAGS += -DXLEN=$(XLEN)
-endif
-
 PROJECT = simx

 all: $(DESTDIR)/$(PROJECT)
 	
 $(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
-	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+	$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@

 $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@
@@ -48,4 +39,4 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
 	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

 clean:
-	rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
+	rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -0,0 +1,87 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+
+#include <cstdlib>
+#include <stdio.h>
+#include "types.h"
+
+namespace vortex {
+
+class Arch {  // Holder for the simulator's architecture parameters: filled once by the constructor, read through const getters.
+private:
+  uint16_t num_threads_;   // copied from the constructor argument num_threads
+  uint16_t num_warps_;     // copied from the constructor argument num_warps
+  uint16_t num_cores_;     // copied from the constructor argument num_cores
+  uint16_t num_clusters_;  // copied from the constructor argument num_clusters
+  uint16_t vsize_;         // fixed at 16 by the constructor (presumably a vector size — confirm)
+  uint16_t num_regs_;      // fixed at 32 by the constructor
+  uint16_t num_csrs_;      // fixed at 4096 by the constructor
+  uint16_t num_barriers_;  // taken from the NUM_BARRIERS macro, which is defined outside this header
+  uint16_t ipdom_size_;    // derived in the constructor as (num_threads - 1) * 2
+  
+public:
+  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)   // only these four counts are caller-supplied; everything else is fixed or derived below
+    : num_threads_(num_threads)
+    , num_warps_(num_warps)
+    , num_cores_(num_cores)
+    , num_clusters_(num_clusters)
+    , vsize_(16)
+    , num_regs_(32)
+    , num_csrs_(4096)
+    , num_barriers_(NUM_BARRIERS)
+    , ipdom_size_((num_threads-1) * 2)  // NOTE(review): num_threads == 0 yields -2 in int arithmetic, i.e. 65534 once stored as uint16_t — confirm callers never pass 0
+  {}
+
+  uint16_t vsize() const {  // returns the constant 16 stored by the constructor
+    return vsize_; 
+  }
+
+  uint16_t num_regs() const {  // returns the constant 32 stored by the constructor
+    return num_regs_;
+  }
+
+  uint16_t num_csrs() const {  // returns the constant 4096 stored by the constructor
+    return num_csrs_;
+  }
+
+  uint16_t num_barriers() const {  // returns the NUM_BARRIERS value captured at construction
+    return num_barriers_;
+  }
+
+  uint16_t ipdom_size() const {  // returns (num_threads - 1) * 2 as computed at construction (presumably the IPDOM stack depth — confirm)
+    return ipdom_size_;
+  }
+
+  uint16_t num_threads() const {  // returns the num_threads value passed to the constructor
+    return num_threads_;
+  }
+
+  uint16_t num_warps() const {  // returns the num_warps value passed to the constructor
+    return num_warps_;
+  }
+
+  uint16_t num_cores() const {  // returns the num_cores value passed to the constructor
+    return num_cores_;
+  }
+  
+  uint16_t num_clusters() const {  // returns the num_clusters value passed to the constructor
+    return num_clusters_;
+  }
+};
+
+}
--- a/sim/simx/archdef.h
+++ b/sim/simx/archdef.h
@@ -1,70 +0,0 @@
-#pragma once
-
-#include <string>
-#include <sstream>
-
-#include <cstdlib>
-#include <stdio.h>
-#include "types.h"
-
-namespace vortex {
-
-class ArchDef {  
-private:
-  uint16_t num_cores_;
-  uint16_t num_warps_;
-  uint16_t num_threads_;
-  uint16_t wsize_;
-  uint16_t vsize_;
-  uint16_t num_regs_;
-  uint16_t num_csrs_;
-  uint16_t num_barriers_;
-  
-public:
-  ArchDef(uint16_t num_cores, 
-          uint16_t num_warps, 
-          uint16_t num_threads)   
-    : num_cores_(num_cores)
-    , num_warps_(num_warps)
-    , num_threads_(num_threads)
-    , wsize_(4)
-    , vsize_(16)
-    , num_regs_(32)
-    , num_csrs_(4096)
-    , num_barriers_(NUM_BARRIERS)
-  {}
-
-  uint16_t wsize() const { 
-    return wsize_; 
-  }
-
-  uint16_t vsize() const { 
-    return vsize_; 
-  }
-
-  uint16_t num_regs() const {
-    return num_regs_;
-  }
-
-  uint16_t num_csrs() const {
-    return num_csrs_;
-  }
-
-  uint16_t num_barriers() const {
-    return num_barriers_;
-  }
-
-  uint16_t num_threads() const {
-    return num_threads_;
-  }
-
-  uint16_t num_warps() const {
-    return num_warps_;
-  }
-
-  uint16_t num_cores() const {
-    return num_cores_;
-  }
-};
-
-}
--- a/sim/simx/args.cpp
+++ b/sim/simx/args.cpp
@@ -1,47 +0,0 @@
-#include <iostream>
-#include <string>
-#include "args.h"
-
-using namespace vortex;
-using std::string;
-
-std::string CommandLineArg::helpString_;
-std::unordered_map<string, CommandLineArg *> CommandLineArg::longArgs_;
-std::unordered_map<string, CommandLineArg *> CommandLineArg::shortArgs_;
-
-CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
-  helpString_ += helpText;
-  longArgs_[l] = this;
-  shortArgs_[s] = this;
-}
-
-CommandLineArg::CommandLineArg(string l, const char *helpText) {
-  helpString_ += helpText;
-  longArgs_[l] = this;
-}
-
-void CommandLineArg::readArgs(int argc, char **argv) {
-  for (int i = 0; i < argc; i++) {
-    std::unordered_map<string, CommandLineArg *>::iterator 
-      s = shortArgs_.find(std::string(argv[i])), 
-      l = longArgs_.find(std::string(argv[i]));
-
-    if (s != shortArgs_.end()) {
-      i += s->second->read(argc - i, &argv[i]);
-    } else if (l != longArgs_.end()) {
-      i += l->second->read(argc - i, &argv[i]);
-    } else {
-      throw BadArg(string(argv[i]));
-    }
-  }
-}
-
-void CommandLineArg::clearArgs() {
-  shortArgs_.clear();
-  longArgs_.clear();
-  helpString_ = "";
-}
-
-void CommandLineArg::showHelp(std::ostream &os) {
-  os << helpString_;
-}
--- a/sim/simx/args.h
+++ b/sim/simx/args.h
@@ -1,64 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <unordered_map>
-#include <util.h>
-
-namespace vortex {
-
-struct BadArg { BadArg(std::string s) : arg(s) {} std::string arg; };
-
-class CommandLineArg {
-public:
-  CommandLineArg(std::string s, std::string l, const char *helpText);
-  CommandLineArg(std::string l, const char *helpText);
-  virtual int read(int argc, char** argv) = 0;
-
-  static void readArgs(int argc, char **argv);
-  static void clearArgs();
-  static void showHelp(std::ostream &os);
-
-private:
-  static std::string helpString_;
-  static std::unordered_map<std::string, CommandLineArg *> longArgs_;
-  static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
-};
-
-template <typename T> class CommandLineArgSetter : public CommandLineArg {
-public:
-  CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
-    CommandLineArg(s, l, ht), arg_(x) {}
-
-  CommandLineArgSetter(std::string l, const char *ht, T &x) :
-    CommandLineArg(l, ht), arg_(x) {}
-
-  int read(int argc, char **argv) {
-    __unused (argc);
-    std::istringstream iss(argv[1]);
-    iss >> arg_;
-    return 1;
-  }
-private:
-  T &arg_;
-};
-
-class CommandLineArgFlag : public CommandLineArg {
-public:
-  CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x) :
-    CommandLineArg(s, l, ht), arg_(x) { arg_ = false; }
-
-  CommandLineArgFlag(std::string l, const char *ht, bool &x) :
-    CommandLineArg(l, ht), arg_(x) { arg_ = false; }
-
-  int read(int argc, char **argv) { 
-    __unused (argc, argv);
-    arg_ = true; 
-    return 0; 
-  }
-private:
-  bool &arg_;
-};
-  
-}
--- a/sim/simx/cache.cpp
+++ b/sim/simx/cache.cpp
@@ -1,637 +0,0 @@
-#include "cache.h"
-#include "debug.h"
-#include "types.h"
-#include <util.h>
-#include <unordered_map>
-#include <vector>
-#include <list>
-#include <queue>
-
-using namespace vortex;
-
-struct params_t {
-    uint32_t sets_per_bank;
-    uint32_t blocks_per_set;    
-    uint32_t words_per_block;
-    uint32_t log2_num_inputs;
-
-    uint32_t word_select_addr_start;
-    uint32_t word_select_addr_end;
-
-    uint32_t bank_select_addr_start;
-    uint32_t bank_select_addr_end;
-
-    uint32_t set_select_addr_start;
-    uint32_t set_select_addr_end;
-
-    uint32_t tag_select_addr_start;
-    uint32_t tag_select_addr_end;
-
-    params_t(const Cache::Config& config) {
-        uint32_t bank_bits   = log2ceil(config.num_banks);
-        uint32_t offset_bits = config.B - config.W;
-        uint32_t log2_bank_size  = config.C - bank_bits;
-        uint32_t index_bits  = log2_bank_size - (config.B << config.A);        
-        assert(log2_bank_size >= config.B);   
-
-        this->log2_num_inputs = log2ceil(config.num_inputs);
-
-        this->words_per_block = 1 << offset_bits;
-        this->blocks_per_set  = 1 << config.A;
-        this->sets_per_bank   = 1 << index_bits;
-
-        assert(config.ports_per_bank <= this->words_per_block);
-                
-        // Word select
-        this->word_select_addr_start = config.W;
-        this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
-
-        // Bank select
-        this->bank_select_addr_start = (1+this->word_select_addr_end);
-        this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
-
-        // Set select
-        this->set_select_addr_start = (1+this->bank_select_addr_end);
-        this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
-
-        // Tag select
-        this->tag_select_addr_start = (1+this->set_select_addr_end);
-        this->tag_select_addr_end = (config.addr_width-1);
-    }
-
-    uint32_t addr_bank_id(uint64_t word_addr) const {
-        if (bank_select_addr_end >= bank_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
-        else    
-            return 0;
-    }
-
-    uint32_t addr_set_id(uint64_t word_addr) const {
-        if (set_select_addr_end >= set_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
-        else
-            return 0;
-    }
-
-    uint64_t addr_tag(uint64_t word_addr) const {
-        if (tag_select_addr_end >= tag_select_addr_start)
-            return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
-        else    
-            return 0;
-    }
-    
-    uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
-        uint64_t addr(0);
-        if (bank_select_addr_end >= bank_select_addr_start)            
-            addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
-        if (set_select_addr_end >= set_select_addr_start)
-            addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
-        if (tag_select_addr_end >= tag_select_addr_start)
-            addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
-        return addr;
-    }
-};
-
-struct block_t {
-    bool     valid;
-    bool     dirty;        
-    uint64_t tag;
-    uint32_t lru_ctr;
-};
-
-struct set_t {
-    std::vector<block_t> blocks;    
-    set_t(uint32_t size) : blocks(size) {}
-
-    void clear() {
-        for (auto& block : blocks) {
-            block.valid = false;
-        }
-    }
-};
-
-struct bank_req_info_t {
-    bool     valid;    
-    uint32_t req_id;
-    uint64_t req_tag;
-};
-
-struct bank_req_t {
-    bool valid;
-    bool write;
-    bool mshr_replay;
-    uint64_t tag;
-    uint32_t set_id;
-    uint32_t core_id;
-    uint64_t uuid;
-    std::vector<bank_req_info_t> infos;
-
-    bank_req_t(uint32_t size) 
-        : valid(false)
-        , write(false)
-        , mshr_replay(false)
-        , tag(0)
-        , set_id(0)
-        , core_id(0)
-        , uuid(0)
-        , infos(size)
-    {}
-};
-
-struct mshr_entry_t : public bank_req_t {
-    uint32_t block_id;
-
-    mshr_entry_t(uint32_t size = 0) 
-        : bank_req_t(size) 
-        , block_id(0)
-    {}
-};
-
-class MSHR {
-private:
-    std::vector<mshr_entry_t> entries_;
-    uint32_t size_;
-
-public:    
-    MSHR(uint32_t size)
-        : entries_(size)
-        , size_(0) 
-    {}
-
-    bool empty() const {
-        return (0 == size_);
-    }
-    
-    bool full() const {
-        return (size_ == entries_.size());
-    }
-
-    int lookup(const bank_req_t& bank_req) {
-         for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
-            auto& entry = entries_.at(i);
-            if (entry.valid 
-             && entry.set_id == bank_req.set_id 
-             && entry.tag == bank_req.tag) {
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    int allocate(const bank_req_t& bank_req, uint32_t block_id) {
-        for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
-            auto& entry = entries_.at(i);
-            if (!entry.valid) {
-                *(bank_req_t*)&entry = bank_req;
-                entry.valid = true;
-                entry.mshr_replay = false;
-                entry.block_id = block_id;  
-                ++size_;              
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    mshr_entry_t& replay(uint32_t id) {
-        auto& root_entry = entries_.at(id);
-        assert(root_entry.valid);
-        // make all related mshr entries for replay
-        for (auto& entry : entries_) {
-            if (entry.valid 
-             && entry.set_id == root_entry.set_id 
-             && entry.tag == root_entry.tag) {
-                entry.mshr_replay = true;
-            }
-        }
-        return root_entry;
-    }
-
-    bool pop(bank_req_t* out) {
-        for (auto& entry : entries_) {
-            if (entry.valid && entry.mshr_replay) {
-                *out = entry;
-                entry.valid = false;
-                --size_;
-                return true;
-            }
-        }
-        return false;
-    }
-
-    void clear() {
-        for (auto& entry : entries_) {
-            if (entry.valid && entry.mshr_replay) {
-                entry.valid = false;
-            }
-        }
-        size_ = 0;
-    }
-};
-
-struct bank_t {
-    std::vector<set_t>  sets;    
-    MSHR                mshr;
-
-    bank_t(const Cache::Config& config, 
-           const params_t& params) 
-        : sets(params.sets_per_bank, params.blocks_per_set)
-        , mshr(config.mshr_size)
-    {}
-
-    void clear() {
-        mshr.clear();
-        for (auto& set : sets) {
-            set.clear();
-        }
-    }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-class Cache::Impl {
-private:
-    Cache* const simobject_;
-    Config config_;
-    params_t params_;
-    std::vector<bank_t> banks_;
-    Switch<MemReq, MemRsp>::Ptr mem_switch_;    
-    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
-    std::vector<SimPort<MemReq>> mem_req_ports_;
-    std::vector<SimPort<MemRsp>>  mem_rsp_ports_;
-    uint32_t flush_cycles_;
-    PerfStats perf_stats_;
-    uint64_t pending_read_reqs_;
-    uint64_t pending_write_reqs_;
-    uint64_t pending_fill_reqs_;    
-
-public:
-    Impl(Cache* simobject, const Config& config) 
-        : simobject_(simobject)
-        , config_(config)
-        , params_(config)
-        , banks_(config.num_banks, {config, params_})
-        , mem_req_ports_(config.num_banks, simobject)
-        , mem_rsp_ports_(config.num_banks, simobject)
-    {
-        bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
-        bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
-        simobject->MemRspPort.bind(&bypass_switch_->RspIn);
-
-        if (config.num_banks > 1) {
-            mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
-            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
-                mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
-                mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
-            }    
-            mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
-            bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
-        } else {
-            mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
-            bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
-        }
-
-        // calculate tag flush cycles
-        flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
-    }
-
-    void reset() {
-        for (auto& bank : banks_) {
-            bank.clear();
-        }
-        perf_stats_ = PerfStats();
-        pending_read_reqs_ = 0;
-        pending_write_reqs_ = 0;
-        pending_fill_reqs_ = 0;
-    }
-
-    void tick() {
-        // wait on flush cycles
-        if (flush_cycles_ != 0) {
-            --flush_cycles_;
-            return;
-        }
-
-        // per-bank pipeline request
-        std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
-
-        // calculate memory latency
-        perf_stats_.mem_latency += pending_fill_reqs_;
-
-        // handle bypasss responses
-        auto& bypass_port = bypass_switch_->RspOut.at(1);            
-        if (!bypass_port.empty()) {
-            auto& mem_rsp = bypass_port.front();
-            uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);                
-            uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
-            MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
-            simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
-            DT(3, simobject_->name() << "-" << core_rsp);
-            bypass_port.pop();
-        }        
-
-        // handle MSHR replay
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& bank = banks_.at(bank_id);
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-            bank.mshr.pop(&pipeline_req);
-        }       
-
-        // handle memory fills
-        std::vector<bool> pending_fill_req(config_.num_banks, false);
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
-            if (!mem_rsp_port.empty()) {
-                auto& mem_rsp = mem_rsp_port.front();
-                this->processMemoryFill(bank_id, mem_rsp.tag);                
-                pending_fill_req.at(bank_id) = true;
-                mem_rsp_port.pop();
-            }
-        }
-        
-        // handle incoming core requests
-        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
-            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);            
-            if (core_req_port.empty())
-                continue;
-
-            auto& core_req = core_req_port.front();
-
-            // check cache bypassing
-            if (core_req.non_cacheable) {
-                // send IO request
-                this->processIORequest(core_req, req_id);
-
-                // remove request
-                core_req_port.pop();
-                continue;
-            }
-
-            auto bank_id = params_.addr_bank_id(core_req.addr);
-            auto set_id  = params_.addr_set_id(core_req.addr);
-            auto tag     = params_.addr_tag(core_req.addr);
-            auto port_id = req_id % config_.ports_per_bank;
-            
-            // create bank request
-            bank_req_t bank_req(config_.ports_per_bank);
-            bank_req.valid = true;
-            bank_req.write = core_req.write;
-            bank_req.mshr_replay = false;
-            bank_req.tag = tag;            
-            bank_req.set_id = set_id;       
-            bank_req.core_id = core_req.core_id;
-            bank_req.uuid = core_req.uuid;
-            bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
-
-            auto& bank = banks_.at(bank_id);            
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-
-            // check pending MSHR replay
-            if (pipeline_req.valid 
-             && pipeline_req.mshr_replay) {
-                 // stall
-                continue;
-            }    
-
-            // check pending fill request
-            if (pending_fill_req.at(bank_id)) {
-                // stall
-                continue;
-            }
-            
-            // check MSHR capacity if read or writeback
-            if ((!core_req.write || !config_.write_through)
-             && bank.mshr.full()) {
-                ++perf_stats_.mshr_stalls;
-                continue;
-            }    
-
-            // check bank conflicts
-            if (pipeline_req.valid) {
-                // check port conflict
-                if (pipeline_req.write != core_req.write
-                 || pipeline_req.set_id != set_id
-                 || pipeline_req.tag != tag
-                 || pipeline_req.infos[port_id].valid) {
-                    ++perf_stats_.bank_stalls;
-                    continue;
-                }
-                // update pending request infos
-                pipeline_req.infos[port_id] = bank_req.infos[port_id];
-            } else {
-                // schedule new request
-                pipeline_req = bank_req;
-            }
-
-            if (core_req.write)
-                ++perf_stats_.writes;
-            else
-                ++perf_stats_.reads;
-
-            // remove request
-            auto time = core_req_port.pop();
-            perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
-        }
-    
-        // process active request        
-        this->processBankRequest(pipeline_reqs);
-    } 
-
-    const PerfStats& perf_stats() const {
-        return perf_stats_;
-    }
-
-private:
-    
-    void processIORequest(const MemReq& core_req, uint32_t req_id) {
-        {
-            MemReq mem_req(core_req);
-            mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
-            bypass_switch_->ReqIn.at(1).send(mem_req, 1);
-            DT(3, simobject_->name() << "-" << mem_req);
-        }
-
-        if (core_req.write && config_.write_reponse) {
-            MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
-            simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);            
-            DT(3, simobject_->name() << "-" << core_rsp);
-        }
-    }
-
-    void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
-        // update block
-        auto& bank  = banks_.at(bank_id);
-        auto& entry = bank.mshr.replay(mshr_id);
-        auto& set   = bank.sets.at(entry.set_id);
-        auto& block = set.blocks.at(entry.block_id);
-        block.valid = true;
-        block.tag   = entry.tag;
-        --pending_fill_reqs_;
-    }
-
-    void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-            if (!pipeline_req.valid)
-                continue;
-
-            auto& bank = banks_.at(bank_id);
-            auto& set = bank.sets.at(pipeline_req.set_id);
-
-            if (pipeline_req.mshr_replay) {
-                // send core response
-                for (auto& info : pipeline_req.infos) {
-                    MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                    simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);  
-                    DT(3, simobject_->name() << "-" << core_rsp);         
-                }
-            } else {        
-                bool hit = false;
-                bool found_free_block = false;            
-                uint32_t hit_block_id = 0;
-                uint32_t repl_block_id = 0;            
-                uint32_t max_cnt = 0;
-                
-                for (uint32_t i = 0, n = set.blocks.size(); i < n; ++i) {
-                    auto& block = set.blocks.at(i);
-                    if (block.valid) {
-                        if (block.tag == pipeline_req.tag) {
-                            block.lru_ctr = 0;                        
-                            hit_block_id = i;
-                            hit = true;
-                        } else {
-                            ++block.lru_ctr;
-                        }
-                        if (max_cnt < block.lru_ctr) {
-                            max_cnt = block.lru_ctr;
-                            repl_block_id = i;
-                        }
-                    } else {                    
-                        found_free_block = true;
-                        repl_block_id = i;
-                    }
-                }
-
-                if (hit) {     
-                    //
-                    // Hit handling   
-                    //                
-                    if (pipeline_req.write) {
-                        // handle write hit
-                        auto& hit_block = set.blocks.at(hit_block_id);
-                        if (config_.write_through) {
-                            // forward write request to memory
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                        } else {
-                            // mark block as dirty
-                            hit_block.dirty = true;
-                        }
-                    }
-                    // send core response
-                    if (!pipeline_req.write || config_.write_reponse) {
-                        for (auto& info : pipeline_req.infos) {     
-                            MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                            simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
-                            DT(3, simobject_->name() << "-" << core_rsp);
-                        }
-                    }
-                } else {     
-                    //
-                    // Miss handling   
-                    //
-                    if (pipeline_req.write)
-                        ++perf_stats_.write_misses;
-                    else
-                        ++perf_stats_.read_misses;
-
-                    if (!found_free_block && !config_.write_through) {
-                        // write back dirty block
-                        auto& repl_block = set.blocks.at(repl_block_id);
-                        if (repl_block.dirty) {                       
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                            ++perf_stats_.evictions;
-                        }
-                    }
-
-                    if (pipeline_req.write && config_.write_through) {
-                        // forward write request to memory
-                        {
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                        }
-                        // send core response
-                        if (config_.write_reponse) {
-                            for (auto& info : pipeline_req.infos) {         
-                                MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                                simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
-                                DT(3, simobject_->name() << "-" << core_rsp);
-                            }
-                        }
-                    } else {
-                        // MSHR lookup
-                        int pending = bank.mshr.lookup(pipeline_req);
-
-                        // allocate MSHR
-                        int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
-                        
-                        // send fill request
-                        if (pending == -1) {
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
-                            mem_req.write = false;
-                            mem_req.tag   = mshr_id;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                            ++pending_fill_reqs_;
-                        }
-                    }
-                }
-            }
-        }
-    }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-Cache::Cache(const SimContext& ctx, const char* name, const Config& config) 
-    : SimObject<Cache>(ctx, name)    
-    , CoreReqPorts(config.num_inputs, this)
-    , CoreRspPorts(config.num_inputs, this)
-    , MemReqPort(this)
-    , MemRspPort(this)
-    , impl_(new Impl(this, config))
-{}
-
-Cache::~Cache() {
-    delete impl_;
-}
-
-void Cache::reset() {
-    impl_->reset();
-}
-
-void Cache::tick() {
-    impl_->tick();
-}
-
-const Cache::PerfStats& Cache::perf_stats() const {
-    return impl_->perf_stats();
-}
--- a/sim/simx/cache_cluster.h
+++ b/sim/simx/cache_cluster.h
@@ -0,0 +1,106 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cache_sim.h"
+
+namespace vortex {
+
+// Wires `num_units` groups of `num_requests` core-side ports to a set of
+// CacheSim instances through three layers of round-robin switches:
+//   1. one "unit" switch per unit (num_requests inputs, config.num_inputs outputs),
+//   2. one "mem" switch per cache input (num_units inputs, num_caches outputs),
+//   3. one "cache" switch merging every cache's memory port into MemReqPort/MemRspPort.
+// When `num_caches` is 0 a single cache is still created, with `bypass` set in
+// its configuration.
+class CacheCluster : public SimObject<CacheCluster> {
+public:
+    std::vector<std::vector<SimPort<MemReq>>> CoreReqPorts;
+    std::vector<std::vector<SimPort<MemRsp>>> CoreRspPorts;
+    SimPort<MemReq> MemReqPort;
+    SimPort<MemRsp> MemRspPort;
+
+    // ctx          - simulation context forwarded to SimObject
+    // name         - base name used to derive the names of all child objects
+    // num_units    - number of core-side port groups
+    // num_caches   - number of cache instances (0 selects one bypassed cache)
+    // num_requests - number of ports in each core-side group
+    // config       - configuration copied into every cache instance
+    CacheCluster(const SimContext& ctx, 
+                 const char* name, 
+                 uint32_t num_units, 
+                 uint32_t num_caches, 
+                 uint32_t num_requests,
+                 const CacheSim::Config& config) 
+        : SimObject(ctx, name)        
+        , CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
+        , CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
+        , MemReqPort(this)
+        , MemRspPort(this)
+        , caches_(MAX(num_caches, 0x1)) {
+
+        // local copy so that bypass can be forced without touching the caller's config
+        CacheSim::Config config2(config);
+        if (0 == num_caches) {
+            num_caches = 1;
+            config2.bypass = true;
+        }
+
+        char sname[100];
+        
+        // per-unit switches: fan the unit's request ports into the cache inputs
+        std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
+        for (uint32_t u = 0; u < num_units; ++u) {
+            // indices are unsigned, so format them with %u
+            snprintf(sname, sizeof(sname), "%s-unit-arb-%u", name, u);
+            unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
+            for (uint32_t i = 0; i < num_requests; ++i) {
+                this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
+                unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
+            }
+        }
+
+        // per-input switches: merge the same input index of every unit and
+        // spread it over the caches
+        std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
+        for (uint32_t i = 0; i < config.num_inputs; ++i) {
+            snprintf(sname, sizeof(sname), "%s-mem-arb-%u", name, i);
+            mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
+            for (uint32_t u = 0; u < num_units; ++u) {              
+                unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
+                mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
+            }            
+        }
+
+        // single switch merging the memory side of all caches
+        snprintf(sname, sizeof(sname), "%s-cache-arb", name);
+        auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
+
+        for (uint32_t i = 0; i < num_caches; ++i) {
+            snprintf(sname, sizeof(sname), "%s-cache%u", name, i);
+            caches_.at(i) = CacheSim::Create(sname, config2);
+
+            for (uint32_t j = 0; j < config.num_inputs; ++j) {
+                mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
+                caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
+            }
+
+            caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
+            cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
+        }
+
+        cache_arb->ReqOut.at(0).bind(&this->MemReqPort);
+        this->MemRspPort.bind(&cache_arb->RspOut.at(0));
+    }
+
+    ~CacheCluster() {}
+
+    // The cluster holds no state of its own to reset.
+    void reset() {}
+    
+    // The cluster does no per-cycle work of its own.
+    void tick() {}
+
+    // Returns the sum of the performance counters of every cache in the cluster.
+    CacheSim::PerfStats perf_stats() const {
+        CacheSim::PerfStats perf;
+        // iterate by reference: the handles only need to be read, not copied
+        for (const auto& cache : caches_) {
+            perf += cache->perf_stats();
+        }   
+        return perf;
+    }
+    
+private:
+    std::vector<CacheSim::Ptr> caches_;
+};
+
+}
--- a/sim/simx/cache_sim.cpp
+++ b/sim/simx/cache_sim.cpp
@@ -0,0 +1,707 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cache_sim.h"
+#include "debug.h"
+#include "types.h"
+#include <util.h>
+#include <unordered_map>
+#include <vector>
+#include <list>
+#include <queue>
+
+using namespace vortex;
+
+// Geometry and address bit-field layout derived from a CacheSim::Config.
+// From least to most significant bit an address is split into:
+//   [word offset][bank select][set select][tag]
+// A field whose end is below its start is treated as empty.
+struct params_t {
+    uint32_t sets_per_bank;
+    uint32_t lines_per_set;    
+    uint32_t words_per_line;
+    uint32_t log2_num_inputs;
+
+    uint32_t word_select_addr_start;
+    uint32_t word_select_addr_end;
+
+    uint32_t bank_select_addr_start;
+    uint32_t bank_select_addr_end;
+
+    uint32_t set_select_addr_start;
+    uint32_t set_select_addr_end;
+
+    uint32_t tag_select_addr_start;
+    uint32_t tag_select_addr_end;
+
+    // Computes the sizes and the bit positions of each address field.
+    params_t(const CacheSim::Config& config) {
+        int32_t num_bank_bits   = log2ceil(config.num_banks);
+        int32_t num_offset_bits = config.B - config.W;
+        int32_t bank_size_log2  = config.C - num_bank_bits;
+        int32_t num_index_bits  = bank_size_log2 - (config.B + config.A);
+        assert(bank_size_log2 > 0);
+        assert(num_offset_bits >= 0);
+        assert(num_index_bits >= 0);
+
+        log2_num_inputs = log2ceil(config.num_inputs);
+
+        words_per_line = 1 << num_offset_bits;
+        lines_per_set  = 1 << config.A;
+        sets_per_bank  = 1 << num_index_bits;
+
+        assert(config.ports_per_bank <= words_per_line);
+
+        // word offset field starts right above the in-word bits
+        word_select_addr_start = config.W;
+        word_select_addr_end   = (word_select_addr_start+num_offset_bits-1);
+
+        // bank field follows the word offset
+        bank_select_addr_start = (1+word_select_addr_end);
+        bank_select_addr_end   = (bank_select_addr_start+num_bank_bits-1);
+
+        // set index field follows the bank field
+        set_select_addr_start = (1+bank_select_addr_end);
+        set_select_addr_end   = (set_select_addr_start+num_index_bits-1);
+
+        // the tag takes whatever is left up to the address width
+        tag_select_addr_start = (1+set_select_addr_end);
+        tag_select_addr_end   = (config.addr_width-1);
+    }
+
+    // Extracts the bank index from an address (0 when the bank field is empty).
+    uint32_t addr_bank_id(uint64_t word_addr) const {
+        if (bank_select_addr_end < bank_select_addr_start)
+            return 0;
+        return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
+    }
+
+    // Extracts the set index from an address (0 when the set field is empty).
+    uint32_t addr_set_id(uint64_t word_addr) const {
+        if (set_select_addr_end < set_select_addr_start)
+            return 0;
+        return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
+    }
+
+    // Extracts the tag from an address (0 when the tag field is empty).
+    uint64_t addr_tag(uint64_t word_addr) const {
+        if (tag_select_addr_end < tag_select_addr_start)
+            return 0;
+        return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
+    }
+    
+    // Rebuilds an address from its bank, set and tag fields; the word offset
+    // bits are left at zero.
+    uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
+        uint64_t result(0);
+        if (bank_select_addr_end >= bank_select_addr_start) {
+            result = bit_setw(result, bank_select_addr_start, bank_select_addr_end, bank_id);
+        }
+        if (set_select_addr_end >= set_select_addr_start) {
+            result = bit_setw(result, set_select_addr_start, set_select_addr_end, set_id);
+        }
+        if (tag_select_addr_end >= tag_select_addr_start) {
+            result = bit_setw(result, tag_select_addr_start, tag_select_addr_end, tag);
+        }
+        return result;
+    }
+};
+
+// Bookkeeping kept for a single cache line (one way of a set).
+struct line_t {
+    uint64_t tag;
+    uint32_t lru_ctr;
+    bool     valid;
+    bool     dirty;
+
+    // Drops the valid and dirty marks; tag and lru_ctr are left untouched.
+    void clear() {
+        dirty = false;
+        valid = false;
+    }
+};
+
+// One cache set: a fixed number of lines (ways).
+struct set_t {
+    std::vector<line_t> lines;
+
+    // Creates a set holding `num_ways` lines.
+    set_t(uint32_t num_ways) 
+        : lines(num_ways) 
+    {}
+
+    // Clears every line of the set.
+    void clear() {
+        for (size_t way = 0; way < lines.size(); ++way) {
+            lines[way].clear();
+        }
+    }
+};
+
+// One port slot of a bank request: which cache input it came from, the tag
+// of that request, and whether the slot is in use.
+struct bank_req_port_t {
+    uint32_t req_id;
+    uint64_t req_tag;
+    bool     valid;
+
+    // Marks the slot as unused; req_id and req_tag keep their values.
+    void clear() {
+        this->valid = false;
+    }
+};
+
+// A request travelling through a bank's pipeline slot. The same structure
+// is stored in MSHR entries while a miss is outstanding.
+struct bank_req_t {
+
+    // Kind of work the request represents; None marks an unused slot/entry.
+    enum ReqType {
+        None   = 0,
+        Fill   = 1,
+        Replay = 2,        
+        Core   = 3
+    };
+
+    std::vector<bank_req_port_t> ports;
+    uint64_t tag;
+    uint32_t set_id;
+    uint32_t cid;
+    uint64_t uuid;
+    ReqType  type;
+    bool     write;
+
+    // Creates an empty request with `num_ports` port slots.
+    // All scalar members are given defined values: this constructor is
+    // user-provided, so they would otherwise be indeterminate, and `type`
+    // is inspected (e.g. by the MSHR) to tell free entries from used ones.
+    bank_req_t(uint32_t num_ports)
+        : ports(num_ports) 
+        , tag(0)
+        , set_id(0)
+        , cid(0)
+        , uuid(0)
+        , type(ReqType::None)
+        , write(false)
+    {}
+
+    // Invalidates every port slot and marks the request as unused.
+    void clear() {
+        for (auto& port : ports) {
+            port.clear();
+        }
+        type = ReqType::None;
+    }
+};
+
+// One MSHR entry: the pending bank request plus the id of the cache line
+// that was selected to receive the fill.
+struct mshr_entry_t {
+    bank_req_t bank_req;
+    uint32_t   line_id;
+
+    // Creates an entry whose request has `num_ports` port slots.
+    // line_id is zeroed so the entry never carries an indeterminate value.
+    mshr_entry_t(uint32_t num_ports) 
+        : bank_req(num_ports) 
+        , line_id(0)
+    {}
+
+    // Clears the stored request (line_id is left as-is).
+    void clear() {
+        bank_req.clear();
+    }
+};
+
+// Miss status holding registers: a fixed pool of entries tracking requests
+// that are waiting for a fill. An entry is free when its request type is
+// None; `size_` counts the entries currently in use.
+class MSHR {
+private:
+    std::vector<mshr_entry_t> entries_;
+    uint32_t size_;
+
+public:    
+    // Creates `size` free entries, each with `num_ports` port slots.
+    MSHR(uint32_t size, uint32_t num_ports)
+        : entries_(size, num_ports)
+        , size_(0) 
+    {}
+
+    // True when no entry is in use.
+    bool empty() const {
+        return (0 == size_);
+    }
+    
+    // True when every entry is in use.
+    bool full() const {
+        return (size_ == entries_.size());
+    }
+
+    // Returns true when an in-use entry already targets the same set and tag
+    // as `bank_req`. Read-only query, hence const.
+    bool lookup(const bank_req_t& bank_req) const {
+        for (const auto& entry : entries_) {
+            if (entry.bank_req.type != bank_req_t::None
+             && entry.bank_req.set_id == bank_req.set_id 
+             && entry.bank_req.tag == bank_req.tag) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Stores `bank_req` and `line_id` in the first free entry and returns
+    // that entry's index, or -1 when no entry is free.
+    int allocate(const bank_req_t& bank_req, uint32_t line_id) {
+        for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
+            auto& entry = entries_.at(i);
+            if (entry.bank_req.type == bank_req_t::None) {
+                entry.bank_req = bank_req;
+                entry.line_id = line_id;  
+                ++size_;              
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    // Marks entry `id` and every other Core entry with the same set and tag
+    // for replay, and returns entry `id`. The entry must be of type Core.
+    mshr_entry_t& replay(uint32_t id) {
+        auto& root_entry = entries_.at(id);
+        assert(root_entry.bank_req.type == bank_req_t::Core);
+        // mark all related mshr entries for replay
+        for (auto& entry : entries_) {
+            if (entry.bank_req.type == bank_req_t::Core 
+             && entry.bank_req.set_id == root_entry.bank_req.set_id 
+             && entry.bank_req.tag == root_entry.bank_req.tag) {
+                entry.bank_req.type = bank_req_t::Replay;
+            }
+        }
+        return root_entry;
+    }
+
+    // Copies the first entry marked Replay into `*out`, frees that entry and
+    // returns true; returns false (leaving `*out` untouched) when there is none.
+    bool pop(bank_req_t* out) {
+        for (auto& entry : entries_) {
+            if (entry.bank_req.type == bank_req_t::Replay) {
+                *out = entry.bank_req;
+                entry.bank_req.type = bank_req_t::None;
+                --size_;
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Frees every entry.
+    void clear() {
+        for (auto& entry : entries_) {
+            entry.clear();
+        }
+        size_ = 0;
+    }
+};
+
+// State of one cache bank: its sets and its MSHR.
+struct bank_t {
+    std::vector<set_t> sets;    
+    MSHR               mshr;
+
+    // Sizes the sets from `params` and the MSHR from `config`.
+    bank_t(const CacheSim::Config& config, 
+           const params_t& params) 
+        : sets(params.sets_per_bank, params.lines_per_set)
+        , mshr(config.mshr_size, config.ports_per_bank)
+    {}
+
+    // Clears every set, then the MSHR.
+    void clear() {        
+        for (size_t s = 0; s < sets.size(); ++s) {
+            sets[s].clear();
+        }
+        mshr.clear();
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class CacheSim::Impl {
+private:
+    CacheSim* const simobject_;
+    Config config_;
+    params_t params_;
+    std::vector<bank_t> banks_;
+    Switch<MemReq, MemRsp>::Ptr bank_switch_;    
+    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
+    std::vector<SimPort<MemReq>> mem_req_ports_;
+    std::vector<SimPort<MemRsp>> mem_rsp_ports_;
+    std::vector<bank_req_t> pipeline_reqs_;
+    uint32_t init_cycles_;
+    PerfStats perf_stats_;
+    uint64_t pending_read_reqs_;
+    uint64_t pending_write_reqs_;
+    uint64_t pending_fill_reqs_;
+
+public:
+    // Builds the bank state and wires the internal switches.
+    // In bypass mode every core port is routed straight to the memory port
+    // through a single round-robin switch and nothing else is set up.
+    // Otherwise a 2-input priority switch feeds the memory port: input 0
+    // carries bank traffic (through a bank switch when there are several
+    // banks), input 1 is used by processBypassRequest().
+    Impl(CacheSim* simobject, const Config& config) 
+        : simobject_(simobject)
+        , config_(config)
+        , params_(config)
+        , banks_(config.num_banks, {config, params_})
+        , mem_req_ports_(config.num_banks, simobject)
+        , mem_rsp_ports_(config.num_banks, simobject)
+        , pipeline_reqs_(config.num_banks, config.ports_per_bank)
+        // counters get defined values here: the bypass path below returns
+        // before init_cycles_ is assigned, and the pending counters are
+        // otherwise only set by reset()
+        , init_cycles_(0)
+        , pending_read_reqs_(0)
+        , pending_write_reqs_(0)
+        , pending_fill_reqs_(0)
+    {
+        char sname[100];
+        snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
+
+        if (config_.bypass) {            
+            bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);            
+            for (uint32_t i = 0; i < config_.num_inputs; ++i) {
+               simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
+               bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
+            }
+            bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
+            simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
+            return;
+        }
+        
+        bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
+        bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
+        simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
+
+        if (config.num_banks > 1) {
+            // several banks: merge their memory ports with a bank switch
+            snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
+            bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
+            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
+                mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
+                bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
+            }    
+            bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0));
+            bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0));
+        } else {
+            // single bank: connect it directly to the bypass switch
+            mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
+            bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0));
+        }
+
+        // calculate cache initialization cycles
+        init_cycles_ = params_.sets_per_bank * params_.lines_per_set;
+    }
+
+    // Restores the cache to its empty state: clears every bank, zeroes the
+    // performance counters and the pending-request counters. Does nothing
+    // in bypass mode.
+    void reset() {
+        if (!config_.bypass) {
+            for (size_t b = 0; b < banks_.size(); ++b) {
+                banks_[b].clear();
+            }
+            perf_stats_ = PerfStats();
+            pending_read_reqs_  = 0;
+            pending_write_reqs_ = 0;
+            pending_fill_reqs_  = 0;
+        }
+    }
+
+    // Advances the cache by one cycle. Per cycle, each bank's pipeline slot is
+    // filled with at most one request, chosen in this order: an MSHR replay,
+    // then a memory fill response, then core requests. The selected requests
+    // are executed by processBankRequests(). Does nothing in bypass mode.
+    void tick() {
+        if (config_.bypass)
+            return;
+
+        // wait on cache initialization cycles
+        if (init_cycles_ != 0) {
+            --init_cycles_;
+            return;
+        }
+
+        // handle cache bypass responses
+        {
+            auto& bypass_port = bypass_switch_->RspIn.at(1);            
+            if (!bypass_port.empty()) {
+                auto& mem_rsp = bypass_port.front();
+                this->processBypassResponse(mem_rsp);
+                bypass_port.pop();
+            }
+        }
+
+        // initialize pipeline request
+        for (auto& pipeline_req : pipeline_reqs_) {
+            pipeline_req.clear();
+        }
+
+        // schedule MSHR replay
+        // (pop() only overwrites the slot when an entry marked Replay exists)
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& bank = banks_.at(bank_id);
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            bank.mshr.pop(&pipeline_req);
+        }
+
+        // schedule memory fill
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
+            if (mem_rsp_port.empty())
+                continue;
+
+            // slot already taken by a replay: the fill stays queued for a later cycle
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            if (pipeline_req.type != bank_req_t::None)
+                continue;
+
+            auto& mem_rsp = mem_rsp_port.front();            
+            DT(3, simobject_->name() << "-dram-" << mem_rsp);
+            pipeline_req.type = bank_req_t::Fill;
+            // for a Fill the tag field carries the memory response tag, which
+            // processBankRequests() passes to MSHR::replay() as the entry id
+            pipeline_req.tag = mem_rsp.tag;
+            mem_rsp_port.pop();
+        }
+
+        // schedule core requests        
+        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
+            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
+            if (core_req_port.empty())
+                continue;
+
+            auto& core_req = core_req_port.front();
+
+            // check cache bypassing
+            if (core_req.type == AddrType::IO) {
+                // send bypass request
+                this->processBypassRequest(core_req, req_id);
+                // remove request
+                core_req_port.pop();
+                continue;
+            }
+
+            auto bank_id = params_.addr_bank_id(core_req.addr);
+            auto set_id  = params_.addr_set_id(core_req.addr);
+            auto tag     = params_.addr_tag(core_req.addr);
+            auto port_id = req_id % config_.ports_per_bank;
+
+            auto& bank = banks_.at(bank_id);
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+
+            // check MSHR capacity
+            // (write-through writes are exempt; every other request stalls
+            // while the bank's MSHR is full)
+            if ((!core_req.write || !config_.write_through)
+             && bank.mshr.full()) {
+                ++perf_stats_.mshr_stalls;
+                ++perf_stats_.bank_stalls;
+                continue;
+            }            
+
+            // check bank conflicts
+            if (pipeline_req.type == bank_req_t::Core) {
+                // check port conflict
+                // (a request can only join the scheduled one when it has the
+                // same direction, set and tag, and its port slot is still free)
+                if (pipeline_req.write != core_req.write
+                 || pipeline_req.set_id != set_id
+                 || pipeline_req.tag != tag
+                 || pipeline_req.ports.at(port_id).valid) {
+                    ++perf_stats_.bank_stalls;
+                    continue;
+                }
+                // extend request ports
+                pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
+            } else if (pipeline_req.type == bank_req_t::None) {
+                // schedule new request
+                bank_req_t bank_req(config_.ports_per_bank);
+                bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
+                bank_req.tag   = tag;            
+                bank_req.set_id = set_id;       
+                bank_req.cid   = core_req.cid;
+                bank_req.uuid  = core_req.uuid;
+                bank_req.type  = bank_req_t::Core;
+                bank_req.write = core_req.write;
+                pipeline_req   = bank_req;
+            } else {
+                // bank in use
+                ++perf_stats_.bank_stalls;
+                continue;
+            }
+
+            if (core_req.write)
+                ++perf_stats_.writes;
+            else
+                ++perf_stats_.reads;
+
+            // remove request
+            DT(3, simobject_->name() << "-core-" << core_req);
+            // NOTE(review): pop() returns a cycle stamp, presumably the cycle the
+            // request was queued; confirm against SimPort
+            auto time = core_req_port.pop();
+            perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
+        }
+    
+        // process active request        
+        this->processBankRequests();
+    } 
+
+    // Returns a reference to the performance counters accumulated so far.
+    const PerfStats& perf_stats() const {
+        return perf_stats_;
+    }
+
+private:
+    
+    // Forwards a bypassed memory response to the core. The low
+    // log2_num_inputs bits of the response tag select the core port; the
+    // remaining bits are the tag sent back to the core.
+    void processBypassResponse(const MemRsp& mem_rsp) {
+        uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
+        uint32_t input_mask = ((1 << params_.log2_num_inputs)-1);
+        uint32_t req_id = mem_rsp.tag & input_mask;
+        MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid};
+        auto& rsp_port = simobject_->CoreRspPorts.at(req_id);
+        rsp_port.send(core_rsp, config_.latency);
+        DT(3, simobject_->name() << "-core-" << core_rsp);
+    }
+
+    // Sends a core request around the cache through input 1 of the bypass
+    // switch. The originating port id is packed into the low bits of the
+    // memory tag. A write is acknowledged to the core right away when write
+    // responses are enabled.
+    void processBypassRequest(const MemReq& core_req, uint32_t req_id) {
+        DT(3, simobject_->name() << "-core-" << core_req);
+
+        MemReq mem_req(core_req);
+        mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
+        bypass_switch_->ReqIn.at(1).send(mem_req, 1);
+        DT(3, simobject_->name() << "-dram-" << mem_req);
+
+        // only writes with write responses enabled are acknowledged here
+        if (!core_req.write || !config_.write_reponse)
+            return;
+
+        MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
+        simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);            
+        DT(3, simobject_->name() << "-core-" << core_rsp);
+    }
+
+    // Executes the request scheduled in each bank's pipeline slot:
+    //   None   - nothing to do
+    //   Fill   - install the line named by the MSHR entry and mark the
+    //            matching MSHR entries for replay
+    //   Replay - send the core response(s) of a replayed miss
+    //   Core   - tag lookup, then hit or miss handling
+    // Finally adds the number of outstanding fills to the memory latency counter.
+    void processBankRequests() {
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& bank = banks_.at(bank_id);
+            // read-only view of the slot: avoids copying the request (and its
+            // port vector) for every bank on every cycle
+            const auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            
+            switch (pipeline_req.type) {
+            case bank_req_t::None:
+                break;
+            case bank_req_t::Fill: {
+                // update cache line
+                // (for a Fill, pipeline_req.tag holds the MSHR entry id)
+                auto& entry = bank.mshr.replay(pipeline_req.tag);
+                auto& set   = bank.sets.at(entry.bank_req.set_id);
+                auto& line  = set.lines.at(entry.line_id);
+                line.valid  = true;
+                line.tag    = entry.bank_req.tag;
+                // NOTE(review): the line's dirty flag and lru_ctr are not
+                // touched here, so they keep the values of the replaced line;
+                // confirm this is intended
+                --pending_fill_reqs_;
+            } break;
+            case bank_req_t::Replay: {
+                // send core response
+                if (!pipeline_req.write || config_.write_reponse) {
+                    for (const auto& info : pipeline_req.ports) {
+                        if (!info.valid)
+                            continue;
+                        MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                        simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);  
+                        DT(3, simobject_->name() << "-core-" << core_rsp);         
+                    }
+                }
+            } break;
+            case bank_req_t::Core: {        
+                bool hit = false;
+                bool found_free_line = false;            
+                uint32_t hit_line_id = 0;
+                uint32_t repl_line_id = 0;            
+                uint32_t free_line_id = 0;
+                uint32_t max_cnt = 0;
+
+                auto& set = bank.sets.at(pipeline_req.set_id);
+
+                // tag lookup                
+                for (uint32_t i = 0, num_lines = set.lines.size(); i < num_lines; ++i) {
+                    auto& line = set.lines.at(i);
+                    if (line.valid) {
+                        if (line.tag == pipeline_req.tag) {
+                            line.lru_ctr = 0;                        
+                            hit_line_id = i;
+                            hit = true;
+                        } else {
+                            ++line.lru_ctr;
+                        }
+                        // track the valid line with the highest LRU counter
+                        if (max_cnt < line.lru_ctr) {
+                            max_cnt = line.lru_ctr;
+                            repl_line_id = i;
+                        }
+                    } else {                    
+                        found_free_line = true;
+                        free_line_id = i;
+                    }
+                }
+
+                // a free line always takes precedence over replacing a valid
+                // one; without this a valid line visited after the free line
+                // would be chosen and replaced without any write-back
+                if (found_free_line) {
+                    repl_line_id = free_line_id;
+                }
+
+                if (hit) {     
+                    //
+                    // Hit handling   
+                    //                
+                    if (pipeline_req.write) {
+                        // handle write hit
+                        auto& hit_line = set.lines.at(hit_line_id);
+                        if (config_.write_through) {
+                            // forward write request to memory
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                        } else {
+                            // mark line as dirty
+                            hit_line.dirty = true;
+                        }
+                    }
+                    // send core response
+                    if (!pipeline_req.write || config_.write_reponse) {
+                        for (const auto& info : pipeline_req.ports) {     
+                            if (!info.valid)
+                                continue;
+                            MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                            simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
+                            DT(3, simobject_->name() << "-core-" << core_rsp);
+                        }
+                    }
+                } else {     
+                    //
+                    // Miss handling   
+                    //
+                    if (pipeline_req.write)
+                        ++perf_stats_.write_misses;
+                    else
+                        ++perf_stats_.read_misses;
+
+                    if (!found_free_line && !config_.write_through) {
+                        // write back dirty line
+                        auto& repl_line = set.lines.at(repl_line_id);
+                        if (repl_line.dirty) {                       
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                            ++perf_stats_.evictions;
+                        }
+                    }
+
+                    if (pipeline_req.write && config_.write_through) {
+                        // forward write request to memory
+                        {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                        }
+                        // send core response
+                        if (config_.write_reponse) {
+                            for (const auto& info : pipeline_req.ports) {
+                                if (!info.valid)
+                                    continue;       
+                                MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                                simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
+                                DT(3, simobject_->name() << "-core-" << core_rsp);
+                            }
+                        }
+                    } else {
+                        // MSHR lookup
+                        // (must run before allocate(), otherwise the new entry
+                        // would match itself)
+                        auto mshr_pending = bank.mshr.lookup(pipeline_req);
+
+                        // allocate MSHR
+                        auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
+                        
+                        // send fill request
+                        // (only when no entry for this set/tag was already pending)
+                        if (!mshr_pending) {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = false;
+                            mem_req.tag   = mshr_id;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                            ++pending_fill_reqs_;
+                        }
+                    }
+                }
+            } break;
+            }
+        }
+        // calculate memory latency
+        perf_stats_.mem_latency += pending_fill_reqs_;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates config.num_inputs core request/response ports, the memory-side
+// request/response ports, and the implementation object that does the work.
+CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config) 
+    : SimObject<CacheSim>(ctx, name)    
+    , CoreReqPorts(config.num_inputs, this)
+    , CoreRspPorts(config.num_inputs, this)
+    , MemReqPort(this)
+    , MemRspPort(this)
+    , impl_(new Impl(this, config))
+{}
+
+// Releases the implementation object allocated by the constructor.
+CacheSim::~CacheSim() {
+    delete impl_;
+}
+
+// Forwards the reset to the implementation object.
+void CacheSim::reset() {
+    impl_->reset();
+}
+
+// Forwards the per-cycle tick to the implementation object.
+void CacheSim::tick() {
+    impl_->tick();
+}
+
+// Returns the performance counters maintained by the implementation object.
+const CacheSim::PerfStats& CacheSim::perf_stats() const {
+    return impl_->perf_stats();
+}
--- a/sim/simx/cache_sim.h
+++ b/sim/simx/cache_sim.h
@@ -1,13 +1,27 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
-#include "memsim.h"
+#include "mem_sim.h"

 namespace vortex {

-class Cache : public SimObject<Cache> {
+class CacheSim : public SimObject<CacheSim> {
 public:
    struct Config {
+        bool    bypass;         // cache bypass
        uint8_t C;              // log2 cache size
        uint8_t B;              // log2 block size
        uint8_t W;              // log2 word size
@@ -45,6 +59,19 @@ public:
            , mshr_stalls(0)
            , mem_latency(0)
        {}
+
+        // Adds every counter of `rhs` to the matching counter of this object.
+        PerfStats& operator+=(const PerfStats& rhs) {
+            reads           += rhs.reads;
+            writes          += rhs.writes;
+            read_misses     += rhs.read_misses;
+            write_misses    += rhs.write_misses;
+            evictions       += rhs.evictions;
+            pipeline_stalls += rhs.pipeline_stalls;
+            bank_stalls     += rhs.bank_stalls;
+            mshr_stalls     += rhs.mshr_stalls;
+            mem_latency     += rhs.mem_latency;
+            return *this;
+        }
    };

    std::vector<SimPort<MemReq>> CoreReqPorts;
@@ -52,8 +79,8 @@ public:
    SimPort<MemReq>              MemReqPort;
    SimPort<MemRsp>              MemRspPort;

-    Cache(const SimContext& ctx, const char* name, const Config& config);
-    ~Cache();
+    CacheSim(const SimContext& ctx, const char* name, const Config& config);
+    ~CacheSim();

    void reset();
    
--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@@ -0,0 +1,222 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cluster.h"
+
+using namespace vortex;
+
+// Builds one cluster: an L2 cache in front of the cluster memory ports,
+// instruction and data cache clusters that feed the L2, one shared-memory
+// block per core, and the cores themselves with all of their ports bound.
+//
+//   ctx        - simulation context forwarded to SimObject
+//   cluster_id - index of this cluster; also used to derive core ids
+//   processor  - pointer stored as-is and returned by processor()
+//   arch       - supplies the core, warp and barrier counts
+//   dcrs       - configuration registers handed to every core
+Cluster::Cluster(const SimContext& ctx,
+                 uint32_t cluster_id,
+                 ProcessorImpl* processor,
+                 const Arch &arch,
+                 const DCRS &dcrs)
+  : SimObject(ctx, "cluster")
+  , mem_req_port(this)
+  , mem_rsp_port(this)
+  , cluster_id_(cluster_id)
+  , cores_(arch.num_cores())
+  , barriers_(arch.num_barriers(), 0)
+  , sharedmems_(arch.num_cores())
+  , processor_(processor)
+{
+  auto num_cores = arch.num_cores();
+
+  // scratch buffer for object names; ids are unsigned, hence %u
+  char sname[100];
+
+  // L2 cache ---------------------------------------------------------------
+  snprintf(sname, sizeof(sname), "cluster%u-l2cache", cluster_id);
+  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
+    !L2_ENABLED,
+    log2ceil(L2_CACHE_SIZE),  // C
+    log2ceil(MEM_BLOCK_SIZE), // B
+    // W: the way count used to be passed here and A was left at 0, unlike
+    // the icache/dcache configs below. The L1 line is used as the L2 word
+    // because L2 requests come from the L1 caches.
+    // NOTE(review): confirm this word size against CacheSim.
+    log2ceil(L1_LINE_SIZE),   // W
+    log2ceil(L2_NUM_WAYS),    // A
+    XLEN,                     // address bits
+    L2_NUM_BANKS,             // number of banks
+    1,                        // number of ports
+    5,                        // number of inputs (only 0 and 1 are bound below)
+    true,                     // write-through
+    false,                    // write response
+    0,                        // victim size
+    L2_MSHR_SIZE,             // mshr
+    2,                        // pipeline latency
+  });
+
+  // the L2 talks to the outside world through the cluster memory ports
+  l2cache_->MemReqPort.bind(&this->mem_req_port);
+  this->mem_rsp_port.bind(&l2cache_->MemRspPort);
+
+  // instruction caches -> L2 input 0 ------------------------------------------
+  snprintf(sname, sizeof(sname), "cluster%u-icaches", cluster_id);
+  icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
+    !ICACHE_ENABLED,
+    log2ceil(ICACHE_SIZE),      // C
+    log2ceil(L1_LINE_SIZE),     // B
+    log2ceil(sizeof(uint32_t)), // W
+    log2ceil(ICACHE_NUM_WAYS),  // A
+    XLEN,                       // address bits
+    1,                          // number of banks
+    1,                          // number of ports
+    1,                          // number of inputs
+    true,                       // write-through
+    false,                      // write response
+    0,                          // victim size
+    static_cast<uint8_t>(arch.num_warps()), // mshr
+    2,                          // pipeline latency
+  });
+
+  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
+  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
+
+  // data caches -> L2 input 1 -------------------------------------------------
+  snprintf(sname, sizeof(sname), "cluster%u-dcaches", cluster_id);
+  dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
+    !DCACHE_ENABLED,
+    log2ceil(DCACHE_SIZE),      // C
+    log2ceil(L1_LINE_SIZE),     // B
+    log2ceil(sizeof(Word)),     // W
+    log2ceil(DCACHE_NUM_WAYS),  // A
+    XLEN,                       // address bits
+    DCACHE_NUM_BANKS,           // number of banks
+    1,                          // number of ports
+    DCACHE_NUM_BANKS,           // number of inputs
+    true,                       // write-through
+    false,                      // write response
+    0,                          // victim size
+    DCACHE_MSHR_SIZE,           // mshr
+    4,                          // pipeline latency
+  });
+
+  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
+  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
+
+  ///////////////////////////////////////////////////////////////////////////
+
+  // one shared-memory block per core
+  for (uint32_t i = 0; i < num_cores; ++i) {
+    snprintf(sname, sizeof(sname), "cluster%u-shared_mem%u", cluster_id, i);
+    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
+      (1 << SMEM_LOG_SIZE),
+      sizeof(Word),
+      NUM_LSU_LANES,
+      NUM_LSU_LANES,
+      false
+    });
+  }
+
+  // create the cores and wire their instruction/data ports
+  for (uint32_t i = 0; i < num_cores; ++i) {
+    // core ids are numbered consecutively across clusters
+    uint32_t core_id = cluster_id * num_cores + i;
+    cores_.at(i) = Core::Create(core_id,
+                                this,
+                                arch,
+                                dcrs,
+                                sharedmems_.at(i));
+    auto& core = cores_.at(i);
+
+    core->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
+    icaches_->CoreRspPorts.at(i).at(0).bind(&core->icache_rsp_ports.at(0));
+
+    // each LSU lane goes through a demux that splits its traffic between
+    // the data cache and the core's shared memory
+    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
+      snprintf(sname, sizeof(sname), "cluster%u-smem_demux%u_%u", cluster_id, i, j);
+      auto smem_demux = SMemDemux::Create(sname);
+
+      core->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
+      smem_demux->RspIn.bind(&core->dcache_rsp_ports.at(j));
+
+      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
+      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
+
+      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
+      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
+    }
+  }
+}
+
+// Destructor: performs no explicit teardown; the body is intentionally empty.
+Cluster::~Cluster() {
+  //--
+}
+
+// Clears every barrier mask of this cluster.
+void Cluster::reset() {
+  for (auto& mask : barriers_)
+    mask.reset();
+}
+
+// The cluster itself does no work in tick(); the body is intentionally empty.
+void Cluster::tick() {
+  //--
+}
+
+// Forwards the RAM pointer to every core of this cluster.
+void Cluster::attach_ram(RAM* ram) {
+  // iterate by reference (as running()/check_exit() do) so the core
+  // handles are not copied on every iteration
+  for (auto& core : cores_) {
+    core->attach_ram(ram);
+  }
+}
+
+// Reports whether at least one core of this cluster is still running.
+bool Cluster::running() const {
+  bool any_running = false;
+  for (auto& core : cores_) {
+    if (core->running()) {
+      any_running = true;
+      break; // one running core is enough
+    }
+  }
+  return any_running;
+}
+
+// Queries every core for its exit status.
+// Returns true only when all cores report an exit. *exitcode always
+// receives the bitwise OR of the codes of the cores that did exit
+// (0 when none has).
+bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
+  Word merged_code = 0;
+  bool all_exited = true;
+  for (auto& core : cores_) {
+    Word core_code;
+    const bool exited = core->check_exit(&core_code, riscv_test);
+    if (!exited) {
+      all_exited = false;
+      continue; // keep polling the remaining cores
+    }
+    merged_code |= core_code;
+  }
+  *exitcode = merged_code;
+  return all_exited;
+}
+
+// Registers the arrival of a core at cluster barrier `bar_id`.
+// Once `count` cores have arrived, every core recorded in the barrier mask
+// is resumed and the mask is cleared. This function only records the
+// arrival and logs it; it does not suspend the caller itself.
+//
+//   bar_id  - index into barriers_ (bounds-checked by at())
+//   count   - number of arrivals that releases the barrier
+//   core_id - id of the arriving core; reduced modulo the number of cores
+//             in this cluster to obtain its slot in the mask
+void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
+  auto& barrier = barriers_.at(bar_id);
+
+  uint32_t local_core_id = core_id % cores_.size();
+  barrier.set(local_core_id);
+
+  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
+
+  if (barrier.count() != (size_t)count)
+    return; // still waiting for other cores
+
+  // resume all suspended cores
+  for (uint32_t i = 0; i < cores_.size(); ++i) {
+    if (!barrier.test(i))
+      continue;
+    auto& core = cores_.at(i);
+    // log the core's own id so the trace matches the "Suspend" message
+    // above, which prints the id passed in rather than the local index
+    DP(3, "*** Resume core #" << core->id() << " at barrier #" << bar_id);
+    core->resume();
+  }
+  barrier.reset();
+}
+
+// Returns the processor pointer that was passed to the constructor.
+ProcessorImpl* Cluster::processor() const {
+  return processor_;
+}
+
+// Collects the cache and shared-memory counters of this cluster.
+// Shared-memory counters are summed over all per-core blocks.
+Cluster::PerfStats Cluster::perf_stats() const {
+  Cluster::PerfStats perf;
+  perf.icache = icaches_->perf_stats();
+  perf.dcache = dcaches_->perf_stats();
+
+  // The constructor does not create tcaches_/ocaches_/rcaches_, so these
+  // handles can be empty. Skip an empty handle instead of dereferencing
+  // it; the corresponding counters keep their default-constructed value.
+  if (tcaches_) {
+    perf.tcache = tcaches_->perf_stats();
+  }
+  if (ocaches_) {
+    perf.ocache = ocaches_->perf_stats();
+  }
+  if (rcaches_) {
+    perf.rcache = rcaches_->perf_stats();
+  }
+
+  perf.l2cache = l2cache_->perf_stats();
+
+  // by reference: no need to copy each handle just to read its counters
+  for (const auto& sharedmem : sharedmems_) {
+    perf.sharedmem += sharedmem->perf_stats();
+  }
+
+  return perf;
+}
--- a/sim/simx/cluster.h
+++ b/sim/simx/cluster.h
@@ -0,0 +1,92 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "dcrs.h"
+#include "arch.h"
+#include "cache_cluster.h"
+#include "shared_mem.h"
+#include "core.h"
+#include "constants.h"
+
+namespace vortex {
+
+class ProcessorImpl;
+
+// A cluster groups a set of cores with their instruction/data caches,
+// per-core shared memories and an L2 cache, and exposes a single pair of
+// memory request/response ports.
+class Cluster : public SimObject<Cluster> {
+public:
+  // Cache and shared-memory counters of one cluster.
+  struct PerfStats {
+    CacheSim::PerfStats   icache;
+    CacheSim::PerfStats   dcache;
+    SharedMem::PerfStats  sharedmem;
+    CacheSim::PerfStats   l2cache;
+    CacheSim::PerfStats   tcache;
+    CacheSim::PerfStats   ocache;
+    CacheSim::PerfStats   rcache;
+
+    // Adds every counter group of `rhs` onto this object.
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->icache      += rhs.icache;
+      this->dcache      += rhs.dcache;
+      this->sharedmem   += rhs.sharedmem;
+      this->l2cache     += rhs.l2cache;
+      this->tcache      += rhs.tcache;
+      this->ocache      += rhs.ocache;
+      this->rcache      += rhs.rcache;
+      return *this;
+    }
+  };
+
+  // Cluster-level memory ports; the constructor binds the L2 cache to them.
+  SimPort<MemReq> mem_req_port;
+  SimPort<MemRsp> mem_rsp_port;
+
+  // Creates the caches, shared memories and cores and binds their ports.
+  Cluster(const SimContext& ctx, 
+          uint32_t cluster_id,
+          ProcessorImpl* processor, 
+          const Arch &arch, 
+          const DCRS &dcrs);
+
+  ~Cluster();
+
+  // Clears all barrier masks.
+  void reset();
+
+  void tick();
+
+  // Forwards the RAM pointer to every core.
+  void attach_ram(RAM* ram);
+
+  // True while at least one core is running.
+  bool running() const;
+
+  // True when every core reports an exit; *exitcode receives the OR of the
+  // exit codes of the cores that exited.
+  bool check_exit(Word* exitcode, bool riscv_test) const;  
+
+  // Records a core's arrival at barrier `bar_id`; resumes the recorded
+  // cores once `count` of them have arrived.
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
+
+  // Returns the processor pointer given to the constructor.
+  ProcessorImpl* processor() const;
+
+  Cluster::PerfStats perf_stats() const;
+  
+private:
+  // Members are initialised in declaration order; keep this order in sync
+  // with the constructor's initialiser list.
+  uint32_t                     cluster_id_;  
+  std::vector<Core::Ptr>       cores_;  
+  std::vector<CoreMask>        barriers_;   // one arrival mask per barrier
+  CacheSim::Ptr                l2cache_;
+  CacheCluster::Ptr            icaches_;
+  CacheCluster::Ptr            dcaches_;
+  std::vector<SharedMem::Ptr>  sharedmems_; // one block per core
+  // NOTE(review): the constructor in cluster.cpp does not assign the three
+  // handles below; confirm where they are meant to be created.
+  CacheCluster::Ptr            tcaches_;
+  CacheCluster::Ptr            ocaches_;
+  CacheCluster::Ptr            rcaches_;
+  ProcessorImpl*               processor_;
+};
+
+} // namespace vortex
--- a/sim/simx/constants.h
+++ b/sim/simx/constants.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #ifndef RAM_PAGE_SIZE
@@ -10,14 +23,4 @@

 #ifndef MEMORY_BANKS
 #define MEMORY_BANKS 2
-#endif
-
-namespace vortex {
-
-enum Constants {
-
-    SMEM_BANK_OFFSET = log2ceil(sizeof(uint32_t)) + log2ceil(STACK_SIZE / sizeof(uint32_t)),
-
-};
-
-}
+#endif
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <string>
@@ -11,101 +24,104 @@
 #include <simobject.h>
 #include "debug.h"
 #include "types.h"
-#include "archdef.h"
+#include "arch.h"
 #include "decode.h"
 #include "mem.h"
 #include "warp.h"
 #include "pipeline.h"
-#include "cache.h"
-#include "sharedmem.h"
+#include "cache_sim.h"
+#include "shared_mem.h"
 #include "ibuffer.h"
 #include "scoreboard.h"
-#include "exeunit.h"
-#include "tex_unit.h"
+#include "operand.h"
+#include "dispatcher.h"
+#include "exe_unit.h"
+#include "dcrs.h"

 namespace vortex {

+class Cluster;
+
 class Core : public SimObject<Core> {
 public:
  struct PerfStats {
+    uint64_t cycles;
    uint64_t instrs;
    uint64_t ibuf_stalls;
    uint64_t scrb_stalls;
    uint64_t alu_stalls;
    uint64_t lsu_stalls;
-    uint64_t csr_stalls;
    uint64_t fpu_stalls;
-    uint64_t gpu_stalls;
+    uint64_t sfu_stalls;
+    uint64_t ifetches;
    uint64_t loads;
    uint64_t stores;
-    uint64_t branches;
-    uint64_t mem_reads;
-    uint64_t mem_writes;
-    uint64_t mem_latency;
-    uint64_t tex_reads;
-    uint64_t tex_latency;
+    uint64_t ifetch_latency;
+    uint64_t load_latency;

    PerfStats() 
-      : instrs(0)
+      : cycles(0)
+      , instrs(0)
      , ibuf_stalls(0)
      , scrb_stalls(0)
      , alu_stalls(0)
      , lsu_stalls(0)
-      , csr_stalls(0)
      , fpu_stalls(0)
-      , gpu_stalls(0)
+      , sfu_stalls(0)
+      , ifetches(0)
      , loads(0)
      , stores(0)
-      , branches(0)
-      , mem_reads(0)
-      , mem_writes(0)
-      , mem_latency(0)
-      , tex_reads(0)
-      , tex_latency(0)
+      , ifetch_latency(0)
+      , load_latency(0)
    {}
  };

-  SimPort<MemRsp> MemRspPort;
-  SimPort<MemReq> MemReqPort;
+  std::vector<SimPort<MemReq>> icache_req_ports;
+  std::vector<SimPort<MemRsp>> icache_rsp_ports;
+
+  std::vector<SimPort<MemReq>> dcache_req_ports;
+  std::vector<SimPort<MemRsp>> dcache_rsp_ports;
+
+  Core(const SimContext& ctx, 
+       uint32_t core_id, 
+       Cluster* cluster,
+       const Arch &arch, 
+       const DCRS &dcrs,
+       SharedMem::Ptr  sharedmem);

-  Core(const SimContext& ctx, const ArchDef &arch, uint32_t id);
  ~Core();

-  void attach_ram(RAM* ram);
-
-  bool running() const;
-
  void reset();

  void tick();

+  void attach_ram(RAM* ram);
+
+  bool running() const;
+
+  void resume();
+
  uint32_t id() const {
-    return id_;
+    return core_id_;
  }

-  const Decoder& decoder() {
-    return decoder_;
-  }
-
-  const ArchDef& arch() const {
+  const Arch& arch() const {
    return arch_;
  }

-  const PerfStats& perf_stats() const {
-    return perf_stats_;
-  } 
-
-  uint32_t getIRegValue(int reg) const {
-    return warps_.at(0)->getIRegValue(reg);
+  const DCRS& dcrs() const {
+    return dcrs_;
  }

  uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
  
  void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);

-  WarpMask wspawn(uint32_t num_warps, uint32_t nextPC);
+  void wspawn(uint32_t num_warps, Word nextPC);
  
-  WarpMask barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
+
+  AddrType get_addr_type(uint64_t addr);

  void icache_read(void* data, uint64_t addr, uint32_t size);

@@ -113,19 +129,22 @@ public:

  void dcache_write(const void* data, uint64_t addr, uint32_t size);

-  uint32_t tex_read(uint32_t unit, uint32_t lod, uint32_t u, uint32_t v, std::vector<mem_addr_size_t>* mem_addrs);
+  void dcache_amo_reserve(uint64_t addr);
+
+  bool dcache_amo_check(uint64_t addr);

  void trigger_ecall();

  void trigger_ebreak();

-  bool check_exit() const;
+  bool check_exit(Word* exitcode, bool riscv_test) const;

 private:

  void schedule();
  void fetch();
  void decode();
+  void issue();
  void execute();
  void commit();
  
@@ -133,49 +152,55 @@ private:

  void cout_flush();

-  uint32_t id_;
-  const ArchDef arch_;
+  uint32_t core_id_;
+  const Arch& arch_;
+  const DCRS &dcrs_;
+  
  const Decoder decoder_;
  MemoryUnit mmu_;
-  RAM smem_;
-  std::vector<TexUnit> tex_units_;

  std::vector<std::shared_ptr<Warp>> warps_;  
-  std::vector<WarpMask> barriers_;  
-  std::vector<uint32_t> csrs_;
+  std::vector<WarpMask> barriers_;
  std::vector<Byte> fcsrs_;
  std::vector<IBuffer> ibuffers_;
  Scoreboard scoreboard_;
+  std::vector<Operand::Ptr> operands_;
+  std::vector<Dispatcher::Ptr> dispatchers_;
  std::vector<ExeUnit::Ptr> exe_units_;
-  Cache::Ptr icache_;
-  Cache::Ptr dcache_;
-  SharedMem::Ptr shared_mem_;
-  Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
-  std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
+  SharedMem::Ptr sharedmem_;

  PipelineLatch fetch_latch_;
  PipelineLatch decode_latch_;
  
  HashTable<pipeline_trace_t*> pending_icache_;
+  std::vector<pipeline_trace_t*> committed_traces_;
  WarpMask active_warps_;
  WarpMask stalled_warps_;
-  uint32_t last_schedule_wid_;
  uint64_t issued_instrs_;
  uint64_t committed_instrs_;
-  uint32_t csr_tex_unit_;
-  bool ecall_;
-  bool ebreak_;
+  bool exited_;
+
+  uint64_t pending_ifetches_;

  std::unordered_map<int, std::stringstream> print_bufs_;
+
+  std::vector<std::vector<CSRs>> csrs_;
  
  PerfStats perf_stats_;
-  uint64_t perf_mem_pending_reads_;
+  
+  Cluster* cluster_;

+  uint32_t commit_exe_;
+
+  friend class Warp;
  friend class LsuUnit;
  friend class AluUnit;
-  friend class CsrUnit;
  friend class FpuUnit;
-  friend class GpuUnit;
+  friend class SfuUnit;
+  friend class TexUnit;
+  friend class RasterAgent;
+  friend class RopAgent;
+  friend class TexAgent;
 };

-} // namespace vortex
+} // namespace vortex
--- a/sim/simx/dcrs.cpp
+++ b/sim/simx/dcrs.cpp
@@ -0,0 +1,28 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dcrs.h"
+#include <iostream>
+
+using namespace vortex;
+
+// Writes `value` to the device configuration register at `addr`.
+// Only the base-state address range is accepted; any other address is
+// reported on standard output and the process is aborted.
+void DCRS::write(uint32_t addr, uint32_t value) {
+  const bool in_base_range = (addr >= VX_DCR_BASE_STATE_BEGIN)
+                          && (addr < VX_DCR_BASE_STATE_END);
+  if (!in_base_range) {
+    std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl;
+    std::abort();
+  }
+
+  base_dcrs.write(addr, value);
+}
--- a/sim/simx/dcrs.h
+++ b/sim/simx/dcrs.h
@@ -0,0 +1,45 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <util.h>
+#include <VX_types.h>
+#include <array>
+
+namespace vortex {
+
+// Storage for the base device configuration registers (DCRs).
+// An address is translated to a slot index with VX_DCR_BASE_STATE();
+// std::array::at() throws std::out_of_range when that index is outside
+// the table.
+class BaseDCRS {
+public:
+    // Returns the value last written to `addr`, or 0 if it was never written.
+    uint32_t read(uint32_t addr) const {
+        uint32_t state = VX_DCR_BASE_STATE(addr);
+        return states_.at(state);
+    }
+
+    // Stores `value` in the slot that corresponds to `addr`.
+    void write(uint32_t addr, uint32_t value) {
+        uint32_t state = VX_DCR_BASE_STATE(addr);
+        states_.at(state) = value;
+    }
+
+private:
+    // Value-initialised so that a read preceding any write yields 0 instead
+    // of an indeterminate value.
+    std::array<uint32_t, VX_DCR_BASE_STATE_COUNT> states_{};
+};
+
+// Top-level device configuration register file. Writes are dispatched by
+// address (see DCRS::write in dcrs.cpp); reads go through the public
+// register group directly.
+class DCRS {
+public:
+    // Stores `value` at `addr`; aborts the process on an unknown address.
+    void write(uint32_t addr, uint32_t value);
+    
+    // Registers of the base-state address range.
+    BaseDCRS base_dcrs;
+};
+
+}
--- a/sim/simx/debug.h
+++ b/sim/simx/debug.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #ifndef DEBUG_LEVEL
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <string>
 #include <stdlib.h>
@@ -9,41 +22,36 @@
 #include "debug.h"
 #include "types.h"
 #include "decode.h"
-#include "archdef.h"
+#include "arch.h"
 #include "instr.h"

 using namespace vortex;

-struct InstTableEntry_t {
-  bool controlFlow;
-  InstType iType;
-};
-
-static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable = {
-  {Opcode::NOP,        {false, InstType::N_TYPE}},
-  {Opcode::R_INST,     {false, InstType::R_TYPE}},
-  {Opcode::L_INST,     {false, InstType::I_TYPE}},
-  {Opcode::I_INST,     {false, InstType::I_TYPE}},
-  {Opcode::S_INST,     {false, InstType::S_TYPE}},
-  {Opcode::B_INST,     {true , InstType::B_TYPE}},
-  {Opcode::LUI_INST,   {false, InstType::U_TYPE}},
-  {Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
-  {Opcode::JAL_INST,   {true , InstType::J_TYPE}},
-  {Opcode::JALR_INST,  {true , InstType::I_TYPE}},
-  {Opcode::SYS_INST,   {true , InstType::I_TYPE}},
-  {Opcode::FENCE,      {true , InstType::I_TYPE}},
-  {Opcode::FL,         {false, InstType::I_TYPE}},
-  {Opcode::FS,         {false, InstType::S_TYPE}},
-  {Opcode::FCI,        {false, InstType::R_TYPE}}, 
-  {Opcode::FMADD,      {false, InstType::R4_TYPE}},
-  {Opcode::FMSUB,      {false, InstType::R4_TYPE}},
-  {Opcode::FMNMADD,    {false, InstType::R4_TYPE}},
-  {Opcode::FMNMSUB,    {false, InstType::R4_TYPE}},  
-  {Opcode::VSET,       {false, InstType::V_TYPE}}, 
-  {Opcode::GPGPU,      {false, InstType::R_TYPE}},
-  {Opcode::GPU,        {false, InstType::R4_TYPE}},
-  {Opcode::R_INST_W,   {false, InstType::R_TYPE}},
-  {Opcode::I_INST_W,   {false, InstType::I_TYPE}},
+// Maps each supported opcode to the instruction format (InstType) that
+// determines how its operand fields are decoded.
+static const std::unordered_map<Opcode, InstType> sc_instTable = {
+  {Opcode::R_INST,     InstType::R_TYPE},
+  {Opcode::L_INST,     InstType::I_TYPE},
+  {Opcode::I_INST,     InstType::I_TYPE},
+  {Opcode::S_INST,     InstType::S_TYPE},
+  {Opcode::B_INST,     InstType::B_TYPE},
+  {Opcode::LUI_INST,   InstType::U_TYPE},
+  {Opcode::AUIPC_INST, InstType::U_TYPE},
+  {Opcode::JAL_INST,   InstType::J_TYPE},
+  {Opcode::JALR_INST,  InstType::I_TYPE},
+  {Opcode::SYS_INST,   InstType::I_TYPE},
+  {Opcode::FENCE,      InstType::I_TYPE},
+  {Opcode::AMO,        InstType::R_TYPE},
+  {Opcode::FL,         InstType::I_TYPE},
+  {Opcode::FS,         InstType::S_TYPE},
+  {Opcode::FCI,        InstType::R_TYPE}, 
+  {Opcode::FMADD,      InstType::R4_TYPE},
+  {Opcode::FMSUB,      InstType::R4_TYPE},
+  {Opcode::FMNMADD,    InstType::R4_TYPE},
+  {Opcode::FMNMSUB,    InstType::R4_TYPE},  
+  {Opcode::VSET,       InstType::V_TYPE},
+  {Opcode::EXT1,       InstType::R_TYPE},
+  {Opcode::EXT2,       InstType::R4_TYPE},
+  {Opcode::R_INST_W,   InstType::R_TYPE},
+  {Opcode::I_INST_W,   InstType::I_TYPE},
 };

 enum Constants {
@@ -58,6 +66,8 @@ enum Constants {
  width_i_imm = 12,
  width_j_imm = 20,
  width_v_imm = 11,
+  width_aq    = 1,
+  width_rl    = 1,

  shift_opcode= 0,
  shift_rd    = width_opcode,
@@ -72,15 +82,15 @@ enum Constants {
  shift_func6 = shift_func7 + width_vmask,
  shift_vset  = shift_func7 + width_func6,

-  mask_opcode = (1<<width_opcode)-1,  
-  mask_reg    = (1<<width_reg)-1,
-  mask_func2  = (1<<width_func2)-1,
-  mask_func3  = (1<<width_func3)-1,
-  mask_func6  = (1<<width_func6)-1,
-  mask_func7  = (1<<width_func7)-1,
-  mask_i_imm  = (1<<width_i_imm)-1,
-  mask_j_imm  = (1<<width_j_imm)-1,
-  mask_v_imm  = (1<<width_v_imm)-1,
+  mask_opcode = (1 << width_opcode) - 1,  
+  mask_reg    = (1 << width_reg)   - 1,
+  mask_func2  = (1 << width_func2) - 1,
+  mask_func3  = (1 << width_func3) - 1,
+  mask_func6  = (1 << width_func6) - 1,
+  mask_func7  = (1 << width_func7) - 1,
+  mask_i_imm  = (1 << width_i_imm) - 1,
+  mask_j_imm  = (1 << width_j_imm) - 1,
+  mask_v_imm  = (1 << width_v_imm) - 1,
 };

 static const char* op_string(const Instr &instr) {
@@ -92,7 +102,6 @@ static const char* op_string(const Instr &instr) {
  auto imm    = instr.getImm();

  switch (opcode) {
-  case Opcode::NOP:        return "NOP";
  case Opcode::LUI_INST:   return "LUI";
  case Opcode::AUIPC_INST: return "AUIPC";
  case Opcode::R_INST:
@@ -116,7 +125,7 @@ static const char* op_string(const Instr &instr) {
      case 2: return "SLT";
      case 3: return "SLTU";
      case 4: return "XOR";
-      case 5: return func7 ? "SRA" : "SRL";
+      case 5: return (func7 & 0x20) ? "SRA" : "SRL";
      case 6: return "OR";
      case 7: return "AND";
      default:
@@ -130,7 +139,7 @@ static const char* op_string(const Instr &instr) {
    case 2: return "SLTI";
    case 3: return "SLTIU";
    case 4: return "XORI";
-    case 5: return func7 ? "SRAI" : "SRLI";
+    case 5: return (func7 & 0x20) ? "SRAI" : "SRLI";
    case 6: return "ORI";
    case 7: return "ANDI";
    default:
@@ -151,8 +160,8 @@ static const char* op_string(const Instr &instr) {
  case Opcode::JALR_INST:  return "JALR";
  case Opcode::L_INST:
    switch (func3) {
-    case 0: return "LBI";
-    case 1: return "LHI";
+    case 0: return "LB";
+    case 1: return "LH";
    case 2: return "LW";
    case 3: return "LD";
    case 4: return "LBU";
@@ -192,11 +201,11 @@ static const char* op_string(const Instr &instr) {
    }
  case Opcode::I_INST_W:
    switch (func3) {
-      case 0: return "ADDIW";
-      case 1: return "SLLIW";
-      case 5: return func7 ? "SRAIW" : "SRLIW";
-      default:
-        std::abort();
+    case 0: return "ADDIW";
+    case 1: return "SLLIW";
+    case 5: return func7 ? "SRAIW" : "SRLIW";
+    default:
+      std::abort();
    }
  case Opcode::SYS_INST: 
    switch (func3) {
@@ -222,20 +231,59 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FENCE: return "FENCE";
  case Opcode::FL: 
    switch (func3) {
-      case 0x1: return "VL";
-      case 0x2: return "FLW";
-      case 0x3: return "FLD";
-      default: 
-        std::abort();
+    case 0x1: return "VL";
+    case 0x2: return "FLW";
+    case 0x3: return "FLD";
+    default: 
+      std::abort();
    }
  case Opcode::FS: 
    switch (func3) {
-      case 0x1: return "VS";
-      case 0x2: return "FSW";
-      case 0x3: return "FSD";
+    case 0x1: return "VS";
+    case 0x2: return "FSW";
+    case 0x3: return "FSD";
+    default: 
+      std::abort();
+    }
+  case Opcode::AMO: {
+    auto amo_type = func7 >> 2;
+    switch (func3) {
+      case 0x2:
+        switch (amo_type) {
+        case 0x00: return "AMOADD.W";
+        case 0x01: return "AMOSWAP.W";
+        case 0x02: return "LR.W";
+        case 0x03: return "SC.W";
+        case 0x04: return "AMOXOR.W";
+        case 0x08: return "AMOOR.W";
+        case 0x0c: return "AMOAND.W";
+        case 0x10: return "AMOMIN.W";
+        case 0x14: return "AMOMAX.W";
+        case 0x18: return "AMOMINU.W";
+        case 0x1c: return "AMOMAXU.W";
+        default:
+          std::abort();
+        }
+      case 0x3:
+        switch (amo_type) {
+        case 0x00: return "AMOADD.D";
+        case 0x01: return "AMOSWAP.D";
+        case 0x02: return "LR.D";
+        case 0x03: return "SC.D";
+        case 0x04: return "AMOXOR.D";
+        case 0x08: return "AMOOR.D";
+        case 0x0c: return "AMOAND.D";
+        case 0x10: return "AMOMIN.D";
+        case 0x14: return "AMOMAX.D";
+        case 0x18: return "AMOMINU.D";
+        case 0x1c: return "AMOMAXU.D";
+        default:
+          std::abort();
+        }
      default: 
        std::abort();
    }
+  }
  case Opcode::FCI: 
    switch (func7) {
    case 0x00: return "FADD.S";
@@ -332,9 +380,9 @@ static const char* op_string(const Instr &instr) {
      default:
        std::abort();
      }
-    case 0x70: return func3 ? "FCLASS.S" : "FMV.X.W";
+    case 0x70: return func3 ? "FCLASS.S" : "FMV.X.S";
    case 0x71: return func3 ? "FCLASS.D" : "FMV.X.D";
-    case 0x78: return "FMV.W.X";
+    case 0x78: return "FMV.S.X";
    case 0x79: return "FMV.D.X";
    default:
      std::abort();
@@ -344,23 +392,36 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
  case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
  case Opcode::VSET:    return "VSET";
-  case Opcode::GPGPU:
-    switch (func3) {            
-    case 0: return "TMC";
-    case 1: return "WSPAWN";
-    case 2: return "SPLIT";
-    case 3: return "JOIN";
-    case 4: return "BAR";
-    case 5: return "PREFETCH";
+  case Opcode::EXT1:
+    switch (func7) {
+    case 0:
+      switch (func3) {            
+      case 0: return "TMC";
+      case 1: return "WSPAWN";
+      case 2: return "SPLIT";
+      case 3: return "JOIN";
+      case 4: return "BAR";
+      case 5: return "PRED";
+      default:
+        std::abort();
+      }
+    case 1:
+      switch (func3) {
+      case 0: return "RASTER";      
+      default:
+        std::abort();
+      }
    default:
      std::abort();
    }
-  case Opcode::GPU:
+  case Opcode::EXT2:
    switch (func3) {
-    case 0: return "TEX";
+    case 0:
+      return "TEX";
    case 1: {
      switch (func2) {
      case 0: return "CMOV";
+      case 1: return "ROP";      
      default:
        std::abort();
      }
@@ -375,43 +436,36 @@ static const char* op_string(const Instr &instr) {

 namespace vortex {
+// Prints an instruction as its mnemonic followed by its operands: the
+// destination register (if any), each source register whose type is not
+// RegType::None, the immediate (if any) and, for SYS_INST with func3 >= 5,
+// source field 0 as a hexadecimal value.
+// Note: std::hex is left set on the stream after a hexadecimal operand
+// has been printed.
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {  
-  auto opcode = instr.getOpcode();    
-  auto func2  = instr.getFunc2();
+  auto opcode = instr.getOpcode();
   auto func3  = instr.getFunc3();
 
-  os << op_string(instr) << ": ";
-
-  if (opcode == S_INST 
-   || opcode == FS) {     
-     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
-     os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
-  } else 
-  if (opcode == L_INST 
-   || opcode == FL) {     
-     os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
-     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
-  } else {
-    if (instr.getRDType() != RegType::None) {
-      os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
-    }
-    uint32_t i = 0;
-    for (; i < instr.getNRSrc(); ++i) {    
-      if (i) os << ", ";
-      os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
-    }    
-    if (instr.hasImm()) {
-      if (i) os << ", ";
-      os << "imm=0x" << std::hex << instr.getImm();
-    }
-    if (opcode == GPU && func3 == 0) {
-      os << ", unit=" << std::dec << func2;
-    }
+  os << op_string(instr);
+  
+  // sep counts the operands printed so far: the first operand is preceded
+  // by a single space, every later one by ", "
+  int sep = 0;
+  if (instr.getRDType() != RegType::None) {
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << instr.getRDType() << std::dec << instr.getRDest();
+  }
+  for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {    
+    if (instr.getRSType(i) == RegType::None)
+      continue;
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
+  }
+  if (instr.hasImm()) {
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << "0x" << std::hex << instr.getImm();
+  }
+  if (opcode == Opcode::SYS_INST && func3 >= 5) {
+    // CSRs with immediate values
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << "0x" << std::hex << instr.getRSrc(0);
   }
   return os;
 }
 }

-Decoder::Decoder(const ArchDef&) {}
+// The architecture argument is accepted but not used by the decoder.
+Decoder::Decoder(const Arch&) {}

 std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {  
  auto instr = std::make_shared<Instr>();
@@ -434,7 +488,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    return nullptr;
  }

-  auto iType = op_it->second.iType;
+  auto iType = op_it->second;
  if (op == Opcode::FL || op == Opcode::FS) { 
    if (func3 != 0x2 && func3 != 0x3) {
      iType = InstType::V_TYPE;
@@ -442,57 +496,97 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  }

  switch (iType) {
-  case InstType::N_TYPE:
-    break;
-
  case InstType::R_TYPE:
-    if (op == Opcode::FCI) {
-      switch (func7) {      
+    switch (op) {
+    case Opcode::FCI:
+      switch (func7) {  
+      case 0x2c: // FSQRT.S
+      case 0x2d: // FSQRT.D
+        instr->setDestReg(rd, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
+        break;    
      case 0x50: // FLE.S, FLT.S, FEQ.S
      case 0x51: // FLE.D, FLT.D, FEQ.D
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::Float);
        break;
      case 0x60: // FCVT.W.D, FCVT.WU.D, FCVT.L.D, FCVT.LU.D
      case 0x61: // FCVT.WU.S, FCVT.W.S, FCVT.L.S, FCVT.LU.S
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::None);
        break;
      case 0x68: // FCVT.S.W, FCVT.S.WU, FCVT.S.L, FCVT.S.LU
      case 0x69: // FCVT.D.W, FCVT.D.WU, FCVT.D.L, FCVT.D.LU
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Integer);
-        instr->setSrcReg(rs2, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Integer);
+        instr->addSrcReg(rs2, RegType::None);
        break;
-      case 0x70: // FCLASS.S, FMV.X.W
+      case 0x70: // FCLASS.S, FMV.X.S
      case 0x71: // FCLASS.D, FMV.X.D        
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
        break;
-      case 0x78: // FMV.W.X
+      case 0x78: // FMV.S.X
      case 0x79: // FMV.D.X        
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Integer);
        break;
      default:
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Float);        
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::Float);        
        break;
      }
-    } else {
+      break;
+    case Opcode::EXT1:
+      switch (func7) {
+      case 0:
+        switch (func3) {         
+        case 0: // TMC
+        case 3: // JOIN
+          instr->addSrcReg(rs1, RegType::Integer);
+          break;
+        case 1: // WSPAWN        
+        case 4: // BAR
+        case 5: // PRED
+          instr->addSrcReg(rs1, RegType::Integer);
+          instr->addSrcReg(rs2, RegType::Integer);
+          break;
+        case 2: // SPLIT
+          instr->setDestReg(rd, RegType::Integer);
+          instr->addSrcReg(rs1, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      case 1:
+        switch (func3) {
+        case 0: // RASTER
+          instr->setDestReg(rd, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      default:
+        std::abort();
+      }
+      break;
+    default:
      instr->setDestReg(rd, RegType::Integer);
-      instr->setSrcReg(rs1, RegType::Integer);
-      instr->setSrcReg(rs2, RegType::Integer);
+      instr->addSrcReg(rs1, RegType::Integer);
+      instr->addSrcReg(rs2, RegType::Integer);
+      break;
    }
    instr->setFunc3(func3);
    instr->setFunc7(func7);
    break;

  case InstType::I_TYPE: {
-    instr->setSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
    if (op == Opcode::FL) {
      instr->setDestReg(rd, RegType::Float);      
    } else {
@@ -503,15 +597,23 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    switch (op) {
    case Opcode::SYS_INST:
      if (func3 != 0) {
-        // RV32I: CSR*
-        instr->setDestReg(rd, RegType::Integer);
-      }
+        // RV32I: CSR
+        if (func3 >= 5) {
+          // rs1 holds zimm
+          instr->setSrcReg(0, rs1, RegType::None);
+        }        
+      } else {        
+        instr->setDestReg(rd, RegType::None);
+        instr->setSrcReg(0, rs1, RegType::None);
+      }      
      // uint12
      instr->setImm(code >> shift_rs2);
      break;
    case Opcode::FENCE:
      // uint12
      instr->setImm(code >> shift_rs2);
+      instr->setDestReg(rd, RegType::None);
+      instr->setSrcReg(0, rs1, RegType::None);
      break;
    case Opcode::I_INST:
    case Opcode::I_INST_W:
@@ -538,11 +640,11 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    }
  } break;
  case InstType::S_TYPE: {    
-    instr->setSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
    if (op == Opcode::FS) {
-      instr->setSrcReg(rs2, RegType::Float);
+      instr->addSrcReg(rs2, RegType::Float);
    } else {
-      instr->setSrcReg(rs2, RegType::Integer);
+      instr->addSrcReg(rs2, RegType::Integer);
    }
    instr->setFunc3(func3);
    auto imm = (func7 << width_reg) | rd;
@@ -550,8 +652,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  } break;

  case InstType::B_TYPE: {
-    instr->setSrcReg(rs1, RegType::Integer);
-    instr->setSrcReg(rs2, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs2, RegType::Integer);
    instr->setFunc3(func3);
    auto bit_11   = rd & 0x1;
    auto bits_4_1 = rd >> 1;
@@ -581,8 +683,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  case InstType::V_TYPE:
    switch (op) {
    case Opcode::VSET: {
-      instr->setDestVReg(rd);
-      instr->setSrcVReg(rs1);
+      instr->setDestReg(rd, RegType::Vector);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setFunc3(func3);
      if (func3 == 7) {
        instr->setImm(!(code >> shift_vset));
@@ -593,20 +695,20 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
          instr->setVediv((immed >> 4) & 0x3);
          instr->setVsew((immed >> 2) & 0x3);
        } else {
-          instr->setSrcVReg(rs2);
+          instr->addSrcReg(rs2, RegType::Vector);
        }
      } else {
-        instr->setSrcVReg(rs2);
+        instr->addSrcReg(rs2, RegType::Vector);
        instr->setVmask((code >> shift_func7) & 0x1);
        instr->setFunc6(func6);
      }
    } break;

    case Opcode::FL:
-      instr->setDestVReg(rd);
-      instr->setSrcVReg(rs1);
+      instr->setDestReg(rd, RegType::Vector);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setVlsWidth(func3);
-      instr->setSrcVReg(rs2);
+      instr->addSrcReg(rs2, RegType::Vector);
      instr->setVmask(code >> shift_func7);
      instr->setVmop((code >> shift_vmop) & mask_func3);
      instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -614,9 +716,9 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {

    case Opcode::FS:
      instr->setVs3(rd);
-      instr->setSrcVReg(rs1);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setVlsWidth(func3);
-      instr->setSrcVReg(rs2);
+      instr->addSrcReg(rs2, RegType::Vector);
      instr->setVmask(code >> shift_func7);
      instr->setVmop((code >> shift_vmop) & mask_func3);
      instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -627,16 +729,28 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    }
    break;
  case R4_TYPE:
-    if (op == Opcode::GPU) {
-      instr->setDestReg(rd, RegType::Integer);
-      instr->setSrcReg(rs1, RegType::Integer);
-      instr->setSrcReg(rs2, RegType::Integer);
-      instr->setSrcReg(rs3, RegType::Integer);
+    if (op == Opcode::EXT2) {
+      switch (func3) {
+      case 1:
+        switch (func2) {
+        case 0: // CMOV
+          instr->setDestReg(rd, RegType::Integer);
+          instr->addSrcReg(rs1, RegType::Integer);
+          instr->addSrcReg(rs2, RegType::Integer);
+          instr->addSrcReg(rs3, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      default:
+        std::abort();
+      }
    } else {
      instr->setDestReg(rd, RegType::Float);
-      instr->setSrcReg(rs1, RegType::Float);
-      instr->setSrcReg(rs2, RegType::Float);
-      instr->setSrcReg(rs3, RegType::Float);
+      instr->addSrcReg(rs1, RegType::Float);
+      instr->addSrcReg(rs2, RegType::Float);
+      instr->addSrcReg(rs3, RegType::Float);
    }
    instr->setFunc2(func2);
    instr->setFunc3(func3);
--- a/sim/simx/decode.h
+++ b/sim/simx/decode.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <vector>
@@ -5,12 +18,12 @@

 namespace vortex {

-class ArchDef;
+class Arch;
 class Instr;

 class Decoder {
 public:
-  Decoder(const ArchDef &);    
+  Decoder(const Arch &);    
  
  std::shared_ptr<Instr> decode(uint32_t code) const;
 };
--- a/sim/simx/dispatcher.h
+++ b/sim/simx/dispatcher.h
@@ -0,0 +1,141 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+// Simulation object that buffers pipeline traces per issue slot and forwards
+// them to the Outputs ports, one batch of block_size_ slots at a time.
+// When arch.num_threads() / num_lanes is not 1, a trace is forwarded as a
+// series of copies, each restricted to one group of num_lanes_ threads.
+class Dispatcher : public SimObject<Dispatcher> {
+public:
+    // One output port per issue slot (ISSUE_WIDTH entries).
+    std::vector<SimPort<pipeline_trace_t*>> Outputs;
+
+    // buf_size   : maximum number of traces held in each per-slot queue (see push()).
+    // block_size : number of issue slots serviced together in one batch.
+    // num_lanes  : number of threads covered by each forwarded trace copy.
+    Dispatcher(const SimContext& ctx, const Arch& arch, uint32_t buf_size, uint32_t block_size, uint32_t num_lanes) 
+        : SimObject<Dispatcher>(ctx, "Dispatcher") 
+        , Outputs(ISSUE_WIDTH, this)
+        , Inputs_(ISSUE_WIDTH, this)
+        , arch_(arch)
+        , queues_(ISSUE_WIDTH, std::queue<pipeline_trace_t*>())
+        , buf_size_(buf_size)        
+        , block_size_(block_size)        
+        , num_lanes_(num_lanes)        
+        , batch_count_(ISSUE_WIDTH / block_size)
+        , pid_count_(arch.num_threads() / num_lanes)
+        , batch_idx_(0)
+        , start_p_(block_size, 0)
+    {}
+    
+    virtual ~Dispatcher() {}
+
+    // Returns to the first batch and clears the per-slot split progress.
+    // The per-slot queues and ports are not touched here.
+    virtual void reset() {
+        batch_idx_ = 0;
+        for (uint32_t b = 0; b < block_size_; ++b) {
+            start_p_.at(b) = 0;
+        }
+    }
+
+    // One simulation step:
+    //  1. move at most one trace from every per-slot queue into its Inputs_ port;
+    //  2. service the slots of the current batch, forwarding traces to Outputs;
+    //  3. once every slot of the batch is done, advance to the next batch.
+    virtual void tick() {
+        for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+            auto& queue = queues_.at(i);
+            if (queue.empty())
+                continue;
+            auto trace = queue.front();
+            Inputs_.at(i).send(trace, 1);
+            queue.pop();
+        }
+
+        // Number of slots in this batch that need no further work this round.
+        uint32_t block_sent = 0;
+        for (uint32_t b = 0; b < block_size_; ++b) {
+            uint32_t i = batch_idx_ * block_size_ + b;
+            auto& input = Inputs_.at(i);            
+            if (input.empty()) {
+                // nothing to dispatch on this slot: counts as done
+                ++block_sent;
+                continue;
+            }
+            auto& output = Outputs.at(i);
+            auto trace = input.front();
+            if (pid_count_ != 1) {
+                // start_p is the first thread group still to be scanned for this slot;
+                // -1 marks a slot whose trace was fully forwarded earlier in this batch.
+                auto start_p = start_p_.at(b);
+                if (start_p == -1) {
+                    ++block_sent;
+                    continue;       
+                }             
+                // Find the first and last active threads at or after the scan position.
+                int start(-1), end(-1);
+                for (uint32_t j = start_p * num_lanes_, n = arch_.num_threads(); j < n; ++j) {
+                    if (!trace->tmask.test(j))
+                        continue;
+                    if (start == -1)
+                        start = j;
+                    end = j;
+                }                
+                // Convert thread indices into thread-group indices.
+                // NOTE(review): this assumes at least one active thread was found;
+                // with start == -1 the division operates on a wrapped value - confirm
+                // callers never enqueue a trace with an empty remaining mask.
+                start /= num_lanes_;
+                end /= num_lanes_;
+                // Forward a copy of the trace whose mask only keeps the threads of group `start`.
+                auto new_trace = new pipeline_trace_t(*trace);
+                new_trace->tmask.reset();
+                for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
+                    new_trace->tmask[j] = trace->tmask[j];
+                }                
+                new_trace->pid = start;
+                // sop is set only when the scan for this trace began at group 0
+                new_trace->sop = (start_p == 0);
+                if (start == end) {
+                    // last active group: flag eop, retire the original trace and mark the slot done
+                    new_trace->eop = 1;
+                    start_p_.at(b) = -1;
+                    input.pop();
+                    ++block_sent;
+                    delete trace;
+                } else {
+                    // more active groups remain: resume after this one on the next tick
+                    new_trace->eop = 0;
+                    start_p_.at(b) = start + 1;
+                }                
+                output.send(new_trace, 1);
+                DT(3, "pipeline-dispatch: " << *new_trace);
+            } else {
+                // single thread group: forward the original trace unchanged with pid 0
+                trace->pid = 0;
+                input.pop();
+                output.send(trace, 1);
+                DT(3, "pipeline-dispatch: " << *trace);
+                ++block_sent;
+            }            
+        }
+        // Every slot of the batch is done: move to the next batch and restart the scans.
+        if (block_sent == block_size_) {
+            batch_idx_ = (batch_idx_ + 1) % batch_count_;
+            for (uint32_t b = 0; b < block_size_; ++b) {
+                start_p_.at(b) = 0;
+            }
+        }
+    };
+
+    // Enqueues a trace on the given issue slot.
+    // Returns false (and does not enqueue) when that slot's queue already holds
+    // buf_size_ or more traces; returns true otherwise.
+    bool push(uint32_t issue_index, pipeline_trace_t* trace) {
+        auto& queue = queues_.at(issue_index);
+        if (queue.size() >= buf_size_)
+            return false;
+        queue.push(trace);        
+        return true;
+    }
+
+private:
+    std::vector<SimPort<pipeline_trace_t*>> Inputs_;    // internal ports fed from queues_ with a delay of 1
+    const Arch& arch_;
+    std::vector<std::queue<pipeline_trace_t*>> queues_; // per-slot buffers filled by push()
+    uint32_t buf_size_;
+    uint32_t block_size_;
+    uint32_t num_lanes_;
+    uint32_t batch_count_;  // ISSUE_WIDTH / block_size
+    uint32_t pid_count_;    // arch.num_threads() / num_lanes
+    uint32_t batch_idx_;    // batch currently being serviced by tick()
+    std::vector<int> start_p_; // per-slot scan position within the current trace, -1 when done
+};
+
+}
--- a/sim/simx/exe_unit.cpp
+++ b/sim/simx/exe_unit.cpp
@@ -0,0 +1,341 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "exe_unit.h"
+#include <iostream>
+#include <iomanip>
+#include <string.h>
+#include <assert.h>
+#include <util.h>
+#include "debug.h"
+#include "core.h"
+#include "constants.h"
+#include "cache_sim.h"
+
+using namespace vortex;
+
+// Creates the ALU execution unit; it is registered with the base ExeUnit
+// under the name "ALU".
+AluUnit::AluUnit(const SimContext& ctx, Core* core)
+    : ExeUnit(ctx, core, "ALU")
+{}
+    
+// Services every issue slot once: the trace at the head of a non-empty input
+// is sent to the same slot's output with a delay chosen by its ALU operation
+// class, the warp is un-stalled if required, and the input is popped.
+void AluUnit::tick() {
+    for (uint32_t slot = 0; slot < ISSUE_WIDTH; ++slot) {
+        auto& in_port = Inputs.at(slot);
+        if (in_port.empty())
+            continue;
+        auto& out_port = Outputs.at(slot);
+        auto trace = in_port.front();
+
+        // Select the completion delay for this operation class.
+        // NOTE(review): ARITH, BRANCH and SYSCALL share the IMUL delay here - confirm intended.
+        uint32_t delay = 0;
+        switch (trace->alu_type) {
+        case AluType::IDIV:
+            delay = XLEN+1;
+            break;
+        case AluType::ARITH:
+        case AluType::BRANCH:
+        case AluType::SYSCALL:
+        case AluType::IMUL:
+            delay = LATENCY_IMUL+1;
+            break;
+        default:
+            std::abort();
+        }
+        out_port.send(trace, delay);
+
+        DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
+
+        // A trace flagged with both fetch_stall and eop releases its stalled warp.
+        if (trace->fetch_stall && trace->eop) {
+            assert(core_->stalled_warps_.test(trace->wid));
+            core_->stalled_warps_.reset(trace->wid);
+        }
+
+        // pop() hands back a cycle stamp; its distance to the current cycle is accounted as ALU stall.
+        auto stamp = in_port.pop();
+        core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - stamp);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the FPU execution unit; it is registered with the base ExeUnit
+// under the name "FPU".
+FpuUnit::FpuUnit(const SimContext& ctx, Core* core)
+    : ExeUnit(ctx, core, "FPU")
+{}
+    
+// Services every issue slot once: the trace at the head of a non-empty input
+// is sent to the same slot's output with a delay chosen by its FPU operation
+// class, then the input is popped and the stall counter updated.
+void FpuUnit::tick() {
+    for (uint32_t slot = 0; slot < ISSUE_WIDTH; ++slot) {
+        auto& in_port = Inputs.at(slot);
+        if (in_port.empty())
+            continue;
+        auto& out_port = Outputs.at(slot);
+        auto trace = in_port.front();
+
+        // Select the completion delay for this operation class.
+        uint32_t delay = 0;
+        switch (trace->fpu_type) {
+        case FpuType::FNCP:  delay = 2;               break;
+        case FpuType::FMA:   delay = LATENCY_FMA+1;   break;
+        case FpuType::FDIV:  delay = LATENCY_FDIV+1;  break;
+        case FpuType::FSQRT: delay = LATENCY_FSQRT+1; break;
+        case FpuType::FCVT:  delay = LATENCY_FCVT+1;  break;
+        default:
+            std::abort();
+        }
+        out_port.send(trace, delay);
+
+        DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
+
+        // pop() hands back a cycle stamp; its distance to the current cycle is accounted as FPU stall.
+        auto stamp = in_port.pop();
+        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - stamp);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the LSU execution unit, registered with the base ExeUnit under the
+// name "LSU". The pending-request table is sized to LSUQ_SIZE entries and the
+// unit handles NUM_LSU_LANES lanes per request.
+// fence_state_ is explicitly nulled so the pointer never holds an
+// indeterminate value before the first fence is seen.
+LsuUnit::LsuUnit(const SimContext& ctx, Core* core) 
+    : ExeUnit(ctx, core, "LSU")
+    , pending_rd_reqs_(LSUQ_SIZE)
+    , num_lanes_(NUM_LSU_LANES)     
+    , fence_state_(nullptr)
+    , pending_loads_(0)
+    , fence_lock_(false)
+    , input_idx_(0)
+{}
+
+// Restores the unit to its freshly constructed state: no pending requests,
+// no outstanding loads, no fence in progress, and slot scanning restarted at
+// slot 0 so that behavior after a reset does not depend on earlier activity.
+void LsuUnit::reset() {
+    pending_rd_reqs_.clear();
+    pending_loads_ = 0;
+    fence_state_ = nullptr;
+    fence_lock_ = false;
+    input_idx_ = 0;
+}
+
+// Advances the LSU by one cycle:
+//  1. consumes at most one response per lane from the dcache and from shared
+//     memory, completing a trace once all of its addresses have answered;
+//  2. while a fence is locked, waits for the pending table to drain and then
+//     forwards the fence trace;
+//  3. accepts at most one trace from the issue-slot inputs (scanned from a
+//     rotating start index) and issues its memory requests.
+void LsuUnit::tick() {    
+    // every tick, the number of outstanding requests is accumulated into load_latency
+    core_->perf_stats_.load_latency += pending_loads_;
+
+    // handle dcache response    
+    for (uint32_t t = 0; t < num_lanes_; ++t) {
+        auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
+        if (dcache_rsp_port.empty())
+            continue;
+        auto& mem_rsp = dcache_rsp_port.front();
+        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
+        auto trace = entry.trace;
+        DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type 
+            << ", tid=" << t << ", " << *trace);  
+        assert(entry.count);
+        --entry.count; // track remaining addresses 
+        if (0 == entry.count) {
+            // last outstanding address: forward the trace on its warp's issue slot
+            int iw = trace->wid % ISSUE_WIDTH;
+            auto& output = Outputs.at(iw);
+            output.send(trace, 1);
+            pending_rd_reqs_.release(mem_rsp.tag);
+        } 
+        dcache_rsp_port.pop();
+        --pending_loads_;
+    }
+
+    // handle shared memory response
+    for (uint32_t t = 0; t < num_lanes_; ++t) {
+        auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
+        if (smem_rsp_port.empty())
+            continue;
+        auto& mem_rsp = smem_rsp_port.front();
+        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
+        auto trace = entry.trace;
+        DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
+        assert(entry.count);
+        --entry.count; // track remaining addresses 
+        if (0 == entry.count) {
+            int iw = trace->wid % ISSUE_WIDTH;
+            auto& output = Outputs.at(iw);
+            output.send(trace, 1);
+            pending_rd_reqs_.release(mem_rsp.tag);
+        } 
+        smem_rsp_port.pop();  
+        --pending_loads_;
+    }
+
+    if (fence_lock_) {
+        // wait for all pending memory operations to complete
+        if (!pending_rd_reqs_.empty())
+            return;
+        int iw = fence_state_->wid % ISSUE_WIDTH;
+        auto& output = Outputs.at(iw);
+        output.send(fence_state_, 1);
+        fence_lock_ = false;
+        // log the trace itself (as the fence-lock message does), not its pointer value
+        DT(3, "fence-unlock: " << *fence_state_);
+    }    
+
+    // check input queue
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        int iw = (input_idx_ + i) % ISSUE_WIDTH;
+        auto& input = Inputs.at(iw);
+        if (input.empty())
+            continue;
+        auto& output = Outputs.at(iw);
+        auto trace = input.front();
+        auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
+
+        // index of the first thread covered by this trace's lane group
+        auto t0 = trace->pid * num_lanes_;
+
+        if (trace->lsu_type == LsuType::FENCE) {
+            // schedule fence lock
+            fence_state_ = trace;
+            fence_lock_ = true;        
+            DT(3, "fence-lock: " << *trace);
+            // remove input
+            auto time = input.pop(); 
+            core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+            break;
+        }
+
+        // check pending queue capacity    
+        if (pending_rd_reqs_.full()) {
+            if (!trace->log_once(true)) {
+                DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
+            }
+            break;
+        } else {
+            trace->log_once(false);
+        }
+        
+        bool is_write = (trace->lsu_type == LsuType::STORE);
+
+        // duplicates detection: all active lanes target the same 4-byte-aligned address
+        bool is_dup = false;
+        if (trace->tmask.test(t0)) {
+            uint64_t addr_mask = sizeof(uint32_t)-1;
+            // keep the full width of the masked address; narrowing it to 32 bits
+            // would make the comparison below fail for addresses above 4 GiB
+            auto addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
+            uint32_t matches = 1;
+            for (uint32_t t = 1; t < num_lanes_; ++t) {
+                if (!trace->tmask.test(t0 + t))
+                    continue;
+                auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
+                matches += (addr0 == mem_addr);
+            }
+            is_dup = (matches == trace->tmask.count());
+        }
+
+        // a duplicated access is issued once; otherwise one request per active thread
+        uint32_t addr_count;
+        if (is_dup) {
+            addr_count = 1;
+        } else {
+            addr_count = trace->tmask.count();
+        }
+
+        auto tag = pending_rd_reqs_.allocate({trace, addr_count});
+
+        for (uint32_t t = 0; t < num_lanes_; ++t) {
+            if (!trace->tmask.test(t0 + t))
+                continue;
+            
+            auto& dcache_req_port = core_->dcache_req_ports.at(t);
+            auto mem_addr = trace_data->mem_addrs.at(t);
+            auto type = core_->get_addr_type(mem_addr.addr);
+
+            MemReq mem_req;
+            mem_req.addr  = mem_addr.addr;
+            mem_req.write = is_write;
+            mem_req.type  = type; 
+            mem_req.tag   = tag;
+            mem_req.cid   = trace->cid;
+            mem_req.uuid  = trace->uuid;        
+                
+            dcache_req_port.send(mem_req, 2);
+            DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag 
+                << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
+
+            ++pending_loads_;
+            ++core_->perf_stats_.loads;        
+            if (is_dup)
+                break;
+        }
+
+        // do not wait on writes
+        if (is_write) {
+            pending_rd_reqs_.release(tag);
+            output.send(trace, 1);
+            ++core_->perf_stats_.stores;
+        }
+
+        // remove input
+        auto time = input.pop();
+        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+
+        break; // single block
+    }
+    // rotate the slot scanned first on the next tick
+    ++input_idx_;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the SFU execution unit, registered with the base ExeUnit under the
+// name "SFU". input_idx_ (the rotating slot index read by tick()) is zeroed
+// here so the first tick does not read an indeterminate value.
+SfuUnit::SfuUnit(const SimContext& ctx, Core* core) 
+    : ExeUnit(ctx, core, "SFU")
+    , input_idx_(0)
+{}
+    
+// Advances the SFU by one cycle: forwards ready traces from the pending
+// response ports, then services at most one issue slot, scanning the slots
+// from a start index that rotates every tick.
+void SfuUnit::tick() {
+    // Forward pending responses whose trace belongs to this core.
+    for (auto rsp_port : pending_rsps_) {
+        if (rsp_port->empty())
+            continue;
+        auto rsp_trace = rsp_port->front();
+        if (rsp_trace->cid != core_->id())
+            continue;
+        int rsp_slot = rsp_trace->wid % ISSUE_WIDTH;
+        Outputs.at(rsp_slot).send(rsp_trace, 1);
+        rsp_port->pop();
+    }
+
+    // Handle the first non-empty issue slot, starting from the rotating index.
+    for (uint32_t n = 0; n < ISSUE_WIDTH; ++n) {
+        int slot = (input_idx_ + n) % ISSUE_WIDTH;
+        auto& in_port = Inputs.at(slot);
+        if (in_port.empty())
+            continue;
+        auto& out_port = Outputs.at(slot);
+        auto trace = in_port.front();
+
+        // A trace flagged fetch_stall releases its warp below; barriers opt out.
+        bool release_warp = trace->fetch_stall;
+
+        switch (trace->sfu_type) {
+        case SfuType::CMOV:
+            out_port.send(trace, 3);
+            break;
+        case SfuType::BAR: {
+            out_port.send(trace, 1);
+            auto bar_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
+            if (trace->eop) {
+                core_->barrier(bar_data->bar.id, bar_data->bar.count, trace->wid);
+            }
+            release_warp = false;
+        }   break;
+        case SfuType::TMC:
+        case SfuType::WSPAWN:
+        case SfuType::SPLIT:
+        case SfuType::JOIN:
+        case SfuType::PRED:
+        case SfuType::CSRRW:
+        case SfuType::CSRRS:
+        case SfuType::CSRRC:
+            out_port.send(trace, 1);
+            break;
+        default:
+            std::abort();
+        }
+
+        DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
+
+        if (release_warp && trace->eop) {
+            assert(core_->stalled_warps_.test(trace->wid));
+            core_->stalled_warps_.reset(trace->wid);
+        }
+
+        // pop() hands back a cycle stamp; its distance to the current cycle is accounted as SFU stall.
+        auto stamp = in_port.pop();
+        core_->perf_stats_.sfu_stalls += (SimPlatform::instance().cycles() - stamp);
+
+        break; // single block
+    }
+    ++input_idx_;
+}
--- a/sim/simx/exe_unit.h
+++ b/sim/simx/exe_unit.h
@@ -1,8 +1,21 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
 #include "pipeline.h"
-#include "cache.h"
+#include "cache_sim.h"

 namespace vortex {

@@ -10,13 +23,13 @@ class Core;

 class ExeUnit : public SimObject<ExeUnit> {
 public:
-    SimPort<pipeline_trace_t*> Input;
-    SimPort<pipeline_trace_t*> Output;
+    std::vector<SimPort<pipeline_trace_t*>> Inputs;
+    std::vector<SimPort<pipeline_trace_t*>> Outputs;

    ExeUnit(const SimContext& ctx, Core* core, const char* name) 
        : SimObject<ExeUnit>(ctx, name) 
-        , Input(this)
-        , Output(this)
+        , Inputs(ISSUE_WIDTH, this)
+        , Outputs(ISSUE_WIDTH, this)
        , core_(core)
    {}
    
@@ -32,28 +45,25 @@ protected:

 ///////////////////////////////////////////////////////////////////////////////

-class NopUnit : public ExeUnit {
-public:
-    NopUnit(const SimContext& ctx, Core*);
-    
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
 class LsuUnit : public ExeUnit {
-private:    
-    uint32_t num_threads_;
-    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
-    pipeline_trace_t* fence_state_;
-    bool fence_lock_;
-
 public:
    LsuUnit(const SimContext& ctx, Core*);

    void reset();

    void tick();
+
+private:    
+    struct pending_req_t {
+      pipeline_trace_t* trace;
+      uint32_t count;
+    };
+    HashTable<pending_req_t> pending_rd_reqs_;    
+    uint32_t num_lanes_;
+    pipeline_trace_t* fence_state_;
+    uint64_t pending_loads_;
+    bool fence_lock_;
+    uint32_t input_idx_;
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -67,15 +77,6 @@ public:

 ///////////////////////////////////////////////////////////////////////////////

-class CsrUnit : public ExeUnit {
-public:
-    CsrUnit(const SimContext& ctx, Core*);
-    
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
 class FpuUnit : public ExeUnit {
 public:
    FpuUnit(const SimContext& ctx, Core*);
@@ -85,19 +86,15 @@ public:

 ///////////////////////////////////////////////////////////////////////////////

-class GpuUnit : public ExeUnit {
-private:
-    uint32_t num_threads_;
-    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
-
-    bool processTexRequest(pipeline_trace_t* trace);
-    
+class SfuUnit : public ExeUnit {
 public:
-    GpuUnit(const SimContext& ctx, Core*);
-
-    void reset();
+    SfuUnit(const SimContext& ctx, Core*);
    
    void tick();
+
+private:
+  std::vector<SimPort<pipeline_trace_t*>*> pending_rsps_;
+  uint32_t input_idx_;
 };

 }
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
--- a/sim/simx/exeunit.cpp
+++ b/sim/simx/exeunit.cpp
@@ -1,383 +0,0 @@
-#include "exeunit.h"
-#include <iostream>
-#include <iomanip>
-#include <string.h>
-#include <assert.h>
-#include <util.h>
-#include "debug.h"
-#include "core.h"
-#include "constants.h"
-
-using namespace vortex;
-
-NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
-    
-void NopUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    Output.send(trace, 1);
-    Input.pop();
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-LsuUnit::LsuUnit(const SimContext& ctx, Core* core) 
-    : ExeUnit(ctx, core, "LSU")
-    , num_threads_(core->arch().num_threads()) 
-    , pending_rd_reqs_(LSUQ_SIZE)
-    , fence_lock_(false)
-{}
-
-void LsuUnit::reset() {
-    pending_rd_reqs_.clear();
-    fence_lock_ = false;
-}
-
-void LsuUnit::tick() {
-    // handle dcache response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
-        if (dcache_rsp_port.empty())
-            continue;
-        auto& mem_rsp = dcache_rsp_port.front();
-        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
-        auto trace = entry.first;
-        DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
-            << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_rd_reqs_.release(mem_rsp.tag);
-        } 
-        dcache_rsp_port.pop();  
-    }
-
-    // handle shared memory response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
-        if (smem_rsp_port.empty())
-            continue;
-        auto& mem_rsp = smem_rsp_port.front();
-        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
-        auto trace = entry.first;
-        DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
-            << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_rd_reqs_.release(mem_rsp.tag);
-        } 
-        smem_rsp_port.pop();  
-    }
-
-    if (fence_lock_) {
-        // wait for all pending memory operations to complete
-        if (!pending_rd_reqs_.empty())
-            return;
-        Output.send(fence_state_, 1);
-        fence_lock_ = false;
-        DT(3, "fence-unlock: " << fence_state_);
-    }
-
-    // check input queue
-    if (Input.empty())
-        return;
-
-    auto trace = Input.front();
-
-    if (trace->lsu.type == LsuType::FENCE) {
-        // schedule fence lock
-        fence_state_ = trace;
-        fence_lock_ = true;        
-        DT(3, "fence-lock: " << *trace);
-        // remove input
-        auto time = Input.pop(); 
-        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
-        return;
-    }
-
-    // check pending queue capacity    
-    if (pending_rd_reqs_.full()) {
-        if (!trace->suspend()) {
-            DT(3, "*** lsu-queue-stall: " << *trace);
-        }
-        return;
-    } else {
-        trace->resume();
-    }
-    
-    bool is_write = (trace->lsu.type == LsuType::STORE);
-
-    // duplicates detection
-    bool is_dup = false;
-    if (trace->tmask.test(0)) {
-        uint64_t addr_mask = sizeof(uint32_t)-1;
-        uint32_t addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
-        uint32_t matches = 1;
-        for (uint32_t t = 1; t < num_threads_; ++t) {
-            if (!trace->tmask.test(t))
-                continue;
-            auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
-            matches += (addr0 == mem_addr);
-        }
-        is_dup = (matches == trace->tmask.count());
-    }
-
-    uint32_t valid_addrs = 0;
-    if (is_dup) {
-        valid_addrs = 1;
-    } else {
-        for (auto& mem_addr : trace->mem_addrs) {
-            valid_addrs += mem_addr.size();
-        }
-    }
-
-    auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
-
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        if (!trace->tmask.test(t))
-            continue;
-        
-        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);        
-        auto mem_addr = trace->mem_addrs.at(t).at(0);
-        auto type = get_addr_type(mem_addr.addr, mem_addr.size);
-
-        MemReq mem_req;
-        mem_req.addr  = mem_addr.addr;
-        mem_req.write = is_write;
-        mem_req.non_cacheable = (type == AddrType::IO); 
-        mem_req.tag   = tag;
-        mem_req.core_id = trace->cid;
-        mem_req.uuid = trace->uuid;
-        
-        if (type == AddrType::Shared) {
-            core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
-            DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
-        } else {            
-            dcache_req_port.send(mem_req, 2);
-            DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
-        }        
-        
-        if (is_dup)
-            break;
-    }
-
-    // do not wait on writes
-    if (is_write) {        
-        pending_rd_reqs_.release(tag);
-        Output.send(trace, 1);
-    }
-
-    // remove input
-    auto time = Input.pop();
-    core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
-    
-void AluUnit::tick() {    
-    if (Input.empty())
-        return;
-    auto trace = Input.front();    
-    switch (trace->alu.type) {
-    case AluType::ARITH:        
-    case AluType::BRANCH:
-    case AluType::SYSCALL:
-    case AluType::CMOV:
-        Output.send(trace, 1);
-        break;
-    case AluType::IMUL:
-        Output.send(trace, LATENCY_IMUL+1);
-        break;
-    case AluType::IDIV:
-        Output.send(trace, XLEN+1);
-        break;
-    default:
-        std::abort();
-    }
-    DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
-    if (trace->fetch_stall) {
-        core_->stalled_warps_.reset(trace->wid);
-    }
-    auto time = Input.pop();
-    core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
-    
-void CsrUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    Output.send(trace, 1);
-    auto time = Input.pop();
-    core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
-    DT(3, "pipeline-execute: op=CSR, " << *trace);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
-    
-void FpuUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    switch (trace->fpu.type) {
-    case FpuType::FNCP:
-        Output.send(trace, 2);
-        break;
-    case FpuType::FMA:
-        Output.send(trace, LATENCY_FMA+1);
-        break;
-    case FpuType::FDIV:
-        Output.send(trace, LATENCY_FDIV+1);
-        break;
-    case FpuType::FSQRT:
-        Output.send(trace, LATENCY_FSQRT+1);
-        break;
-    case FpuType::FCVT:
-        Output.send(trace, LATENCY_FCVT+1);
-        break;
-    default:
-        std::abort();
-    }    
-    DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
-    auto time = Input.pop();
-    core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-GpuUnit::GpuUnit(const SimContext& ctx, Core* core) 
-    : ExeUnit(ctx, core, "GPU")
-    , num_threads_(core->arch().num_threads()) 
-    , pending_tex_reqs_(TEXQ_SIZE)
-{}
-
-void GpuUnit::reset() {
-    pending_tex_reqs_.clear();
-}
-    
-void GpuUnit::tick() {
-#ifdef EXT_TEX_ENABLE
-    // handle memory response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
-        if (dcache_rsp_port.empty())
-            continue;
-        auto& mem_rsp = dcache_rsp_port.front();
-        auto& entry = pending_tex_reqs_.at(mem_rsp.tag);  
-        auto trace = entry.first;
-        DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_tex_reqs_.release(mem_rsp.tag);
-        }   
-        dcache_rsp_port.pop();
-    }
-#endif
-
-    // check input queue
-    if (Input.empty())
-        return;
-
-    auto trace = Input.front();
-
-    bool issued = false;
-
-    switch  (trace->gpu.type) {
-    case GpuType::TMC:
-        Output.send(trace, 1);
-        core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
-        issued = true;
-        break;
-    case GpuType::WSPAWN:
-        Output.send(trace, 1);
-        core_->active_warps_ = trace->gpu.active_warps;        
-        issued = true;
-        break;
-    case GpuType::SPLIT:
-    case GpuType::JOIN:
-        Output.send(trace, 1);
-        issued = true;
-        break;
-    case GpuType::BAR:
-        Output.send(trace, 1);
-        if (trace->gpu.active_warps != 0) 
-            core_->active_warps_ |= trace->gpu.active_warps;
-        else
-            core_->active_warps_.reset(trace->wid);
-        issued = true;
-        break;
-    case GpuType::TEX:
-        if (this->processTexRequest(trace))
-           issued = true;
-        break;
-    default:
-        std::abort();
-    }
-
-    if (issued) {    
-        DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
-        if (trace->fetch_stall)  {
-            core_->stalled_warps_.reset(trace->wid);
-        }
-        auto time = Input.pop();
-        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
-    }
-}
-
-bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {    
-    // check pending queue capacity    
-    if (pending_tex_reqs_.full()) {
-        if (!trace->suspend()) {
-            DT(3, "*** tex-queue-stall: " << *trace);
-        }
-        return false;
-    } else {
-        trace->resume();
-    }
-
-    // send memory request
-
-    uint32_t valid_addrs = 0;
-    for (auto& mem_addr : trace->mem_addrs) {
-        valid_addrs += mem_addr.size();
-    }
-
-    auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
-
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        if (!trace->tmask.test(t))
-            continue;
-
-        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
-        for (auto& mem_addr : trace->mem_addrs.at(t)) {
-            MemReq mem_req;
-            mem_req.addr  = mem_addr.addr;
-            mem_req.write = (trace->lsu.type == LsuType::STORE);
-            mem_req.tag   = tag;
-            mem_req.core_id = core_->id();
-            mem_req.uuid = trace->uuid;
-            dcache_req_port.send(mem_req, 3);
-            DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", tid=" << t << ", "<< trace);
-            ++ core_->perf_stats_.tex_reads;
-            ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
-        }
-    }
-
-    return true;
-}
--- a/sim/simx/ibuffer.h
+++ b/sim/simx/ibuffer.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "pipeline.h"
@@ -6,10 +19,6 @@
 namespace vortex {

 class IBuffer {
-private:
-    std::queue<pipeline_trace_t*> entries_;
-    uint32_t capacity_;
-
 public:    
    IBuffer(uint32_t size) 
        : capacity_(size)
@@ -39,6 +48,10 @@ public:
        std::queue<pipeline_trace_t*> empty;
        std::swap(entries_, empty );
    }
+
+private:
+    std::queue<pipeline_trace_t*> entries_;
+    uint32_t capacity_;
 };

 }
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "types.h"
@@ -7,7 +20,7 @@ namespace vortex {
 class Warp;

 enum Opcode {   
-  NOP       = 0,    
+  NONE      = 0,    
  R_INST    = 0x33,
  L_INST    = 0x3,
  I_INST    = 0x13,
@@ -19,6 +32,7 @@ enum Opcode {
  JALR_INST = 0x67,
  SYS_INST  = 0x73,
  FENCE     = 0x0f,
+  AMO       = 0x2f,
  // F Extension
  FL        = 0x7,
  FS        = 0x27,
@@ -26,19 +40,20 @@ enum Opcode {
  FMADD     = 0x43,
  FMSUB     = 0x47,
  FMNMSUB   = 0x4b,
-  FMNMADD   = 0x4f,
-  // Vector Extension  
-  VSET      = 0x57,
-  // GPGPU Extension
-  GPGPU     = 0x6b,
-  GPU       = 0x5b,
-  // RV64 Standard Extensions
+  FMNMADD   = 0x4f,  
+  // RV64 Standard Extension
  R_INST_W  = 0x3b,
  I_INST_W  = 0x1b,
+  // Vector Extension  
+  VSET      = 0x57,
+  // Custom Extensions
+  EXT1      = 0x0b,
+  EXT2      = 0x2b,
+  EXT3      = 0x5b,
+  EXT4      = 0x7b
 };

-enum InstType { 
-  N_TYPE, 
+enum InstType {
  R_TYPE, 
  I_TYPE, 
  S_TYPE, 
@@ -52,25 +67,45 @@ enum InstType {
 class Instr {
 public:
  Instr() 
-    : opcode_(Opcode::NOP)
+    : opcode_(Opcode::NONE)
    , num_rsrcs_(0)
    , has_imm_(false)
    , rdest_type_(RegType::None)
+    , imm_(0)
    , rdest_(0)
    , func2_(0)
    , func3_(0)
    , func6_(0)
-    , func7_(0) {
+    , func7_(0)
+    , vmask_(0)
+    , vlsWidth_(0)
+    , vMop_(0)
+    , vNf_(0)
+    , vs3_(0)
+    , vlmul_(0)
+    , vsew_(0)
+    , vediv_(0)   {
    for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
       rsrc_type_[i] = RegType::None;
+       rsrc_[i] = 0;
    }
  }

  void setOpcode(Opcode opcode)  { opcode_ = opcode; }
-  void setDestReg(uint32_t destReg, RegType type) { rdest_type_ = type; rdest_ = destReg; }
-  void setSrcReg(uint32_t srcReg, RegType type) { rsrc_type_[num_rsrcs_] = type; rsrc_[num_rsrcs_++] = srcReg; }
-  void setDestVReg(uint32_t destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
-  void setSrcVReg(uint32_t srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg;  }
+  void setDestReg(uint32_t destReg, RegType type) { 
+    rdest_type_ = type; 
+    rdest_ = destReg; 
+  }
+  void addSrcReg(uint32_t srcReg, RegType type) { 
+    rsrc_type_[num_rsrcs_] = type; 
+    rsrc_[num_rsrcs_] = srcReg; 
+    ++num_rsrcs_;
+  }
+  void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) { 
+    rsrc_type_[index] = type; 
+    rsrc_[index] = srcReg; 
+    num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1); 
+  }
  void setFunc2(uint32_t func2) { func2_ = func2; }
  void setFunc3(uint32_t func3) { func3_ = func3; }
  void setFunc7(uint32_t func7) { func7_ = func7; }
@@ -85,17 +120,17 @@ public:
  void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; }
  void setFunc6(uint32_t func6) { func6_ = func6; }

-  Opcode getOpcode() const { return opcode_; }
+  Opcode   getOpcode() const { return opcode_; }
  uint32_t getFunc2() const { return func2_; }
  uint32_t getFunc3() const { return func3_; }
  uint32_t getFunc6() const { return func6_; }
  uint32_t getFunc7() const { return func7_; }
  uint32_t getNRSrc() const { return num_rsrcs_; }
  uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
-  RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
+  RegType  getRSType(uint32_t i) const { return rsrc_type_[i]; }
  uint32_t getRDest() const { return rdest_; }  
-  RegType getRDType() const { return rdest_type_; }  
-  bool hasImm() const { return has_imm_; }
+  RegType  getRDType() const { return rdest_type_; }  
+  bool     hasImm() const { return has_imm_; }
  uint32_t getImm() const { return imm_; }
  uint32_t getVlsWidth() const { return vlsWidth_; }
  uint32_t getVmop() const { return vMop_; }
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -1,98 +1,132 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <iomanip>
 #include <string>
 #include <sstream>
 #include <fstream>
 #include <stdlib.h>
+#include <unistd.h>
 #include <sys/stat.h>
 #include "processor.h"
-#include "archdef.h"
 #include "mem.h"
 #include "constants.h"
 #include <util.h>
-#include "args.h"
 #include "core.h"

 using namespace vortex;

+static void show_usage() { // prints the one-line command-line synopsis to stdout
+   std::cout << "Usage: [-c <cores>] [-w <warps>] [-t <threads>] [-g <clusters>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
+}
+
+uint32_t num_threads = NUM_THREADS;   // -t <threads>; starts at the NUM_THREADS build default
+uint32_t num_warps = NUM_WARPS;       // -w <warps>; starts at the NUM_WARPS build default
+uint32_t num_cores = NUM_CORES;       // -c <cores>; starts at the NUM_CORES build default
+uint32_t num_clusters = NUM_CLUSTERS; // -g <clusters>; starts at the NUM_CLUSTERS build default
+bool showStats = false;               // -s; NOTE(review): no reader of this flag is visible in this excerpt
+bool riscv_test = false;              // -r; handed to processor.run() by main
+const char* program = nullptr;        // first non-option argument: path of the image to load
+
+static void parse_args(int argc, char **argv) { // fills the file-level option variables via getopt; exits on help, bad usage or missing <program>
+  	int c;
+  	while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
+    	switch (c) {
+      case 't':
+        num_threads = atoi(optarg); // NOTE(review): atoi result is stored unchecked (non-numeric text yields 0) - confirm acceptable
+        break;
+      case 'w':
+        num_warps = atoi(optarg);
+        break;
+		  case 'c':
+        num_cores = atoi(optarg);
+        break;
+		  case 'g':
+        num_clusters = atoi(optarg);
+        break;
+      case 'r':
+        riscv_test = true;
+        break;
+      case 's':
+        showStats = true;
+        break;
+    	case 'h':
+    	case '?': // NOTE(review): getopt also returns '?' for invalid options, so those exit with status 0 too - confirm intended
+      		show_usage();
+      		exit(0);
+    		break; // not reached: exit() does not return
+    	default:
+      		show_usage();
+      		exit(-1);
+    	}
+	}
+
+	if (optind < argc) { // the first remaining argument is taken as the program image
+		program = argv[optind];
+    std::cout << "Running " << program << "..." << std::endl;
+	} else { // no program given: print usage and fail
+		show_usage();
+    exit(-1);
+	}
+}
+
 int main(int argc, char **argv) {
  int exitcode = 0;

-  std::string imgFileName;
-  int num_cores(NUM_CORES * NUM_CLUSTERS);
-  int num_warps(NUM_WARPS);
-  int num_threads(NUM_THREADS);  
-  bool showHelp(false);
-  bool showStats(false);
-  bool riscv_test(false);
+  parse_args(argc, argv);

-  // parse the command line arguments
-  CommandLineArgFlag fh("-h", "--help", "show command line options", showHelp);
-  CommandLineArgSetter<std::string> fi("-i", "--image", "program binary", imgFileName);
-  CommandLineArgSetter<int> fc("-c", "--cores", "number of cores", num_cores);
-  CommandLineArgSetter<int> fw("-w", "--warps", "number  of warps", num_warps);
-  CommandLineArgSetter<int> ft("-t", "--threads", "number of threads", num_threads);
-  CommandLineArgFlag fr("-r", "--riscv", "enable riscv tests", riscv_test);
-  CommandLineArgFlag fs("-s", "--stats", "show stats", showStats);
-
-  CommandLineArg::readArgs(argc - 1, argv + 1);
-
-  if (showHelp || imgFileName.empty()) {
-    std::cout << "Vortex emulator command line arguments:\n"
-                 "  -i, --image <filename> Program RAM image\n"
-                 "  -c, --cores <num> Number of cores\n"
-                 "  -w, --warps <num> Number of warps\n"
-                 "  -t, --threads <num> Number of threads\n"
-                 "  -r, --riscv riscv test\n"
-                 "  -s, --stats Print stats on exit.\n";
-    return 0;
-  }
-
-  std::cout << "Running " << imgFileName << "..." << std::endl;
-  
  {
    // create processor configuation
-    ArchDef arch(num_cores, num_warps, num_threads);
+    Arch arch(num_threads, num_warps, num_cores, num_clusters);

    // create memory module
    RAM ram(RAM_PAGE_SIZE);

+    // create processor
+    Processor processor(arch);
+  
+    // attach memory module
+    processor.attach_ram(&ram); 
+
+	  // setup base DCRs
+    const uint64_t startup_addr(STARTUP_ADDR);
+    processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
+  #if (XLEN == 64)
+    processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
+  #endif
+	  processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
+
    // load program
-    {
-      std::string program_ext(fileExtension(imgFileName.c_str()));
+    {      
+      std::string program_ext(fileExtension(program));
      if (program_ext == "bin") {
-        ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
+        ram.loadBinImage(program, startup_addr);
      } else if (program_ext == "hex") {
-        ram.loadHexImage(imgFileName.c_str());
+        ram.loadHexImage(program);
      } else {
        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
        return -1;
      }
    }

-    // create processor
-    Processor processor(arch);
-  
-    // attach memory module
-    processor.attach_ram(&ram);   
-
    // run simulation
-    exitcode = processor.run();
+    exitcode = processor.run(riscv_test);
+  }   

+  if (exitcode != 0) {
+    std::cout << "*** error: exitcode=" << exitcode << std::endl;
  } 

-  if (riscv_test) {
-    if (1 == exitcode) {
-      std::cout << "Passed." << std::endl;
-      exitcode = 0;
-    } else {
-      std::cout << "Failed." << std::endl;
-    }
-  } else {
-    if (exitcode != 0) {
-      std::cout << "*** error: exitcode=" << exitcode << std::endl;
-    }
-  }  
-
  return exitcode;
 }
--- a/sim/simx/mem_sim.cpp
+++ b/sim/simx/mem_sim.cpp
@@ -1,4 +1,17 @@
-#include "memsim.h"
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mem_sim.h"
 #include <vector>
 #include <queue>
 #include <stdlib.h>
@@ -83,7 +96,7 @@ public:
            mem_req.addr,
            mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
            std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
-            mem_req.core_id
+            mem_req.cid
        );

        if (!dram_->send(dram_req))
--- a/sim/simx/mem_sim.h
+++ b/sim/simx/mem_sim.h
@@ -1,8 +1,20 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
 #include "types.h"
-#include <vector>

 namespace vortex {

--- a/sim/simx/operand.h
+++ b/sim/simx/operand.h
@@ -0,0 +1,61 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+class Operand : public SimObject<Operand> { // "pipeline-operands" stage: forwards each trace after a register-count-based delay
+public:
+    SimPort<pipeline_trace_t*> Input;  // incoming traces
+    SimPort<pipeline_trace_t*> Output; // outgoing traces, sent with the computed delay
+
+    Operand(const SimContext& ctx) 
+        : SimObject<Operand>(ctx, "Operand") 
+        , Input(this)
+        , Output(this)
+    {}
+    
+    virtual ~Operand() {}
+
+    virtual void reset() {} // intentionally empty
+
+    // Sends the front trace of Input to Output with delay = 1 + number of register indices it uses (one trace per call).
+    virtual void tick() {
+        if (Input.empty())
+            return;
+        auto trace = Input.front();
+
+        int delay = 1;
+        for (int i = 0; i < MAX_NUM_REGS; ++i) {
+            // integer register 0 is never charged, but index 0 of the float
+            // or vector file still is, even when x0 is flagged on the same trace
+            bool is_iregs = (i != 0) && trace->used_iregs.test(i);
+            bool is_fregs = trace->used_fregs.test(i);
+            bool is_vregs = trace->used_vregs.test(i);
+            if (is_iregs || is_fregs || is_vregs)
+                ++delay; // an index counts once, however many files flag it
+        }
+
+        Output.send(trace, delay);
+        
+        DT(3, "pipeline-operands: " << *trace);
+
+        Input.pop();
+    }
+};
+
+}
--- a/sim/simx/pipeline.h
+++ b/sim/simx/pipeline.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+

 #pragma once

@@ -5,14 +18,38 @@
 #include <iostream>
 #include <util.h>
 #include "types.h"
-#include "archdef.h"
+#include "arch.h"
 #include "debug.h"

 namespace vortex {

+class ITraceData { // polymorphic base for the payload types derived from it in this header
+public:
+    using Ptr = std::shared_ptr<ITraceData>; // shared-ownership handle to a payload
+    ITraceData() {}
+    virtual ~ITraceData() {} // virtual so derived payloads are destroyed correctly through a base pointer
+};
+
+struct LsuTraceData : public ITraceData { // trace payload holding a vector of mem_addr_size_t records
+  using Ptr = std::shared_ptr<LsuTraceData>;
+  std::vector<mem_addr_size_t> mem_addrs; // the constructor sizes this to num_threads entries
+  explicit LsuTraceData(uint32_t num_threads) : mem_addrs(num_threads) {} // explicit: a bare integer must not silently convert to a payload
+};
+
+struct SFUTraceData : public ITraceData { // trace payload holding an id/count pair (presumably a barrier - confirm)
+  using Ptr = std::shared_ptr<SFUTraceData>;
+  struct {
+    uint32_t id;    // initialized from the constructor's bar_id
+    uint32_t count; // initialized from the constructor's bar_count
+  } bar;
+  SFUTraceData(uint32_t bar_id, uint32_t bar_count) : bar{bar_id, bar_count} {}
+};
+
 struct pipeline_trace_t {
+public:
  //--
-  uint64_t    uuid;
+  const uint64_t uuid;
+  const Arch&    arch;
  
  //--
  uint32_t    cid;
@@ -21,12 +58,9 @@ struct pipeline_trace_t {
  Word        PC;

  //--
-  bool        fetch_stall;
-
-  //--
-  bool        wb;  
-  RegType     rdest_type;
  uint32_t    rdest;
+  RegType     rdest_type;
+  bool        wb;

  //--
  RegMask     used_iregs;
@@ -36,73 +70,104 @@ struct pipeline_trace_t {
  //- 
  ExeType     exe_type; 

-  //--
-  std::vector<std::vector<mem_addr_size_t>> mem_addrs;
-  
  //--
  union {
-    struct {        
-      LsuType type;
-    } lsu;
-    struct {
-      AluType type;
-    } alu;
-    struct {
-      FpuType type;
-    } fpu;
-    struct {
-      GpuType type;
-      WarpMask active_warps;
-    } gpu;
+    uint32_t unit_type;
+    LsuType  lsu_type;
+    AluType  alu_type;
+    FpuType  fpu_type;
+    SfuType  sfu_type;
  };

-  bool stalled;
+  ITraceData::Ptr data;

-  pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
-    uuid = uuid_;
-    cid = 0;
-    wid = 0;
-    tmask.reset();
-    PC = 0;
-    fetch_stall = false;
-    wb  = false;
-    rdest = 0;
-    rdest_type = RegType::None;
-    used_iregs.reset();
-    used_fregs.reset();
-    used_vregs.reset();
-    exe_type = ExeType::NOP;
-    mem_addrs.resize(arch.num_threads());
-    stalled = false;
-  }
+  int pid;
+  bool sop;
+  bool eop;

-  bool suspend() {
-    bool old = stalled;
-    stalled = true;
+  bool fetch_stall;
+
+  pipeline_trace_t(uint64_t uuid, const Arch& arch) 
+    : uuid(uuid)
+    , arch(arch)
+    , cid(0)
+    , wid(0)
+    , tmask(0)
+    , PC(0)    
+    , rdest(0)
+    , rdest_type(RegType::None)
+    , wb(false)
+    , used_iregs(0)
+    , used_fregs(0)
+    , used_vregs(0)
+    , exe_type(ExeType::ALU)
+    , unit_type(0)
+    , data(nullptr)
+    , pid(-1)
+    , sop(true)
+    , eop(true)
+    , fetch_stall(false)
+    , log_once_(false) 
+  {}
+
+  pipeline_trace_t(const pipeline_trace_t& rhs) 
+    : uuid(rhs.uuid)
+    , arch(rhs.arch)
+    , cid(rhs.cid)
+    , wid(rhs.wid)
+    , tmask(rhs.tmask)
+    , PC(rhs.PC)    
+    , rdest(rhs.rdest)
+    , rdest_type(rhs.rdest_type)
+    , wb(rhs.wb)    
+    , used_iregs(rhs.used_iregs)
+    , used_fregs(rhs.used_fregs)
+    , used_vregs(rhs.used_vregs)
+    , exe_type(rhs.exe_type)
+    , unit_type(rhs.unit_type)
+    , data(rhs.data)
+    , pid(rhs.pid)
+    , sop(rhs.sop)
+    , eop(rhs.eop)
+    , fetch_stall(rhs.fetch_stall)
+    , log_once_(false) 
+  {}
+  
+  ~pipeline_trace_t() {}
+
+  bool log_once(bool enable) {
+    bool old = log_once_;
+    log_once_ = enable;
    return old;
  }

-  void resume() {
-    stalled = false;
-  }
+private:
+  bool log_once_;
 };

 inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
-  os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
+  os << "cid=" << state.cid;
+  os << ", wid=" << state.wid;
+  os << ", tmask=";
+  for (uint32_t i = 0, n = state.arch.num_threads(); i < n; ++i) {
+      os << state.tmask.test(i);
+  }  
+  os << ", PC=0x" << std::hex << state.PC;
  os << ", wb=" << state.wb;
  if (state.wb) {
     os << ", rd=" << state.rdest_type << std::dec << state.rdest;
  }
  os << ", ex=" << state.exe_type;
+  if (state.pid != -1) {
+    os << ", pid=" << state.pid;
+    os << ", sop=" << state.sop;
+    os << ", eop=" << state.eop;
+  }
  os << " (#" << std::dec << state.uuid << ")";
  return os;
 }

 class PipelineLatch {
-protected:
-  const char* name_;
-  std::queue<pipeline_trace_t*> queue_;
-
 public:
  PipelineLatch(const char* name = nullptr) 
    : name_(name) 
@@ -132,6 +197,10 @@ public:
    std::queue<pipeline_trace_t*> empty;
    std::swap(queue_, empty );
  }
+
+protected:
+  const char* name_;
+  std::queue<pipeline_trace_t*> queue_;
 };

 }
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -1,168 +1,141 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "processor.h"
-#include "core.h"
-#include "constants.h"
+#include "processor_impl.h"

 using namespace vortex;

-class Processor::Impl {
-private:
-  std::vector<Core::Ptr> cores_;
-  std::vector<Cache::Ptr> l2caches_;
-  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
-  Cache::Ptr l3cache_;
-  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
+ProcessorImpl::ProcessorImpl(const Arch& arch) // builds memsim <- L3 cache <- clusters and installs the memory perf callbacks
+  : arch_(arch)
+  , clusters_(arch.num_clusters())
+{
+  SimPlatform::instance().initialize();

-public:
-  Impl(const ArchDef& arch) 
-    : cores_(arch.num_cores())
-    , l2caches_(NUM_CLUSTERS)
-    , l2_mem_switches_(NUM_CLUSTERS)
-  {
-    SimPlatform::instance().initialize();
+  // create memory simulator
+  memsim_ = MemSim::Create("dram", MemSim::Config{
+    MEMORY_BANKS,
+    uint32_t(arch.num_cores()) * arch.num_clusters()
+  });

-    uint32_t num_cores = arch.num_cores();
-    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
-
-    // create cores
-    for (uint32_t i = 0; i < num_cores; ++i) {
-        cores_.at(i) = Core::Create(arch, i);
+  // create L3 cache (one core-side request port per cluster)
+  l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
+    !L3_ENABLED,              // NOTE(review): presumably a bypass flag for a disabled L3 - confirm against CacheSim::Config
+    log2ceil(L3_CACHE_SIZE),  // C
+    log2ceil(MEM_BLOCK_SIZE), // B
+    log2ceil(L3_NUM_WAYS),  // W
+    0,                      // A
+    XLEN,                   // address bits
+    L3_NUM_BANKS,           // number of banks
+    1,                      // number of ports
+    uint8_t(arch.num_clusters()), // request size
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    L3_MSHR_SIZE,           // mshr
+    2,                      // pipeline latency
    }
+  );
+
+  // connect L3 memory ports
+  l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
+  memsim_->MemRspPort.bind(&l3cache_->MemRspPort);

-     // setup memory simulator
-    auto memsim = MemSim::Create("dram", MemSim::Config{
-      MEMORY_BANKS,
-      arch.num_cores()
-    });
-    
-    std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
-    std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
-
-    if (L3_ENABLE) {
-      l3cache_ = Cache::Create("l3cache", Cache::Config{
-        log2ceil(L3_CACHE_SIZE),  // C
-        log2ceil(MEM_BLOCK_SIZE), // B
-        2,                      // W
-        0,                      // A
-        32,                     // address bits  
-        L3_NUM_BANKS,           // number of banks
-        L3_NUM_PORTS,           // number of ports
-        NUM_CLUSTERS,           // request size 
-        true,                   // write-through
-        false,                  // write response
-        0,                      // victim size
-        L3_MSHR_SIZE,           // mshr
-        2,                      // pipeline latency
-        }
-      );        
-      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
-      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
-        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
-      }
-    } else if (NUM_CLUSTERS > 1) {
-      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
-      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
-      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
-        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
-      }
-    }
-
-    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-      std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster); 
-      std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
-
-      if (L2_ENABLE) {
-        auto& l2cache = l2caches_.at(i);
-        l2cache = Cache::Create("l2cache", Cache::Config{
-          log2ceil(L2_CACHE_SIZE),  // C
-          log2ceil(MEM_BLOCK_SIZE), // B
-          2,                      // W
-          0,                      // A
-          32,                     // address bits  
-          L2_NUM_BANKS,           // number of banks
-          L2_NUM_PORTS,           // number of ports
-          (uint8_t)cores_per_cluster, // request size 
-          true,                   // write-through
-          false,                  // write response
-          0,                      // victim size
-          L2_MSHR_SIZE,           // mshr
-          2,                      // pipeline latency
-        });
-        l2cache->MemReqPort.bind(mem_req_ports.at(i));
-        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
-
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
-          cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
-        }
-      } else {
-        auto& l2_mem_switch = l2_mem_switches_.at(i);
-        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
-        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
-        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
-
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
-          cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
-        }
-      }
-
-      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-        auto& core = cores_.at((i * cores_per_cluster) + j);
-        core->MemReqPort.bind(cluster_mem_req_ports.at(j));
-        cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
-      }
-    }
+  // create clusters
+  for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
+    clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
+    // connect L3 core ports: cluster i uses L3 core port i in both directions
+    clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i));
+    l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
  }

-  ~Impl() {
-    SimPlatform::instance().finalize();
-  }
+  // set up memory perf recording; the callbacks outlive this constructor, so capture only `this`
+  memsim_->MemReqPort.tx_callback([this](const MemReq& req, uint64_t cycle){
+    __unused (cycle);
+    perf_mem_reads_   += !req.write;
+    perf_mem_writes_  += req.write;
+    perf_mem_pending_reads_ += !req.write; // a read stays pending until a response is seen below
+  });
+  memsim_->MemRspPort.tx_callback([this](const MemRsp&, uint64_t cycle){
+    __unused (cycle);
+    --perf_mem_pending_reads_;
+  });

-  void attach_ram(RAM* ram) {
-    for (auto core : cores_) {
-      core->attach_ram(ram);
-    }
-  }
+  this->reset(); // start with all memory perf counters at zero
+}

-  int run() {
-    SimPlatform::instance().reset();
-    bool running;
-    int exitcode = 0;
-    do {
-      SimPlatform::instance().tick();
-      running = false;
-      for (auto& core : cores_) {
-        if (core->running()) {
-          running = true;
-        }
-        if (core->check_exit()) {
-          exitcode = core->getIRegValue(3);
-          running = false;
-          break;
+ProcessorImpl::~ProcessorImpl() { // counterpart of the SimPlatform initialize() call made in the constructor
+  SimPlatform::instance().finalize();
+}
+
+void ProcessorImpl::attach_ram(RAM* ram) { // hands the same RAM pointer to every cluster
+  const size_t count = clusters_.size();
+  for (size_t idx = 0; idx < count; ++idx)
+    clusters_[idx]->attach_ram(ram);
+}
+
+int ProcessorImpl::run(bool riscv_test) { // ticks the platform until no running cluster is left un-exited; returns the OR of exit codes
+  SimPlatform::instance().reset();
+  this->reset();
+
+  bool done;
+  Word exitcode = 0;
+  do {
+    SimPlatform::instance().tick();
+    done = true;
+    for (const auto& cluster : clusters_) { // by reference: no shared_ptr refcount traffic on every tick
+      if (cluster->running()) {
+        Word ec = 0; // defined value even if check_exit() returns true without writing it
+        if (cluster->check_exit(&ec, riscv_test)) {
+          exitcode |= ec;
+        } else {
+          done = false;
        }
      }
-    } while (running);
+    }
+    perf_mem_latency_ += perf_mem_pending_reads_; // each tick adds the number of reads still pending
+  } while (!done);

-    return exitcode;
-  }
-};
+  return static_cast<int>(exitcode); // Word may be wider than int; make the narrowing explicit
+}
+ 
+void ProcessorImpl::reset() { // zeroes every memory perf counter kept by the processor
+  perf_mem_pending_reads_ = 0; // reads issued but not yet answered
+  perf_mem_latency_ = 0;       // per-tick accumulation of the pending-read count
+  perf_mem_writes_ = 0;
+  perf_mem_reads_ = 0;
+}
+
+void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) { // stores `value` at `addr` in dcrs_, the object also passed to each Cluster::Create
+  dcrs_.write(addr, value);
+}
+
+ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { // snapshot of the memory, L3 and summed cluster counters
+  PerfStats stats;
+  stats.l3cache     = l3cache_->perf_stats();
+  stats.mem_latency = perf_mem_latency_;
+  stats.mem_writes  = perf_mem_writes_;
+  stats.mem_reads   = perf_mem_reads_;
+  for (const auto& cluster : clusters_) {
+    stats.clusters += cluster->perf_stats(); // accumulate per-cluster stats into one total
+  }
+  return stats;
+}

 ///////////////////////////////////////////////////////////////////////////////

-Processor::Processor(const ArchDef& arch) 
-  : impl_(new Impl(arch))
+Processor::Processor(const Arch& arch) // creates the implementation object for the given architecture
+  : impl_(new ProcessorImpl(arch)) // heap-allocated ProcessorImpl held through the raw impl_ pointer
 {}

 Processor::~Processor() {
@@ -173,6 +146,10 @@ void Processor::attach_ram(RAM* mem) {
  impl_->attach_ram(mem);
 }

-int Processor::run() {
-  return impl_->run();
+int Processor::run(bool riscv_test) { // forwards to ProcessorImpl::run and returns its result unchanged
+  return impl_->run(riscv_test);
+}
+
+void Processor::write_dcr(uint32_t addr, uint32_t value) { // forwards the DCR write to the implementation object
+  impl_->write_dcr(addr, value);
 }
--- a/sim/simx/processor.h
+++ b/sim/simx/processor.h
@@ -1,22 +1,39 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

+#include <stdint.h>
+
 namespace vortex {

-class ArchDef;
+class Arch;
 class RAM;
+class ProcessorImpl;

 class Processor {
 public:
-  Processor(const ArchDef& arch);
+  Processor(const Arch& arch); // allocates the ProcessorImpl for `arch` (see processor.cpp)
  ~Processor();

  void attach_ram(RAM* mem);

-  int run();
+  int run(bool riscv_test); // forwards to ProcessorImpl::run(riscv_test) and returns its result
+  // Writes `value` to the DCR at `addr` through the implementation object.
+  void write_dcr(uint32_t addr, uint32_t value);

 private:
-  class Impl;
-  Impl* impl_;
+  ProcessorImpl* impl_; // created with `new` in the constructor
 };

-}
+}
--- a/sim/simx/processor_impl.h
+++ b/sim/simx/processor_impl.h
@@ -0,0 +1,66 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "mem_sim.h"
+#include "cache_sim.h"
+#include "constants.h"
+#include "dcrs.h"
+#include "cluster.h"
+
+namespace vortex {
+
+class ProcessorImpl { // implementation behind Processor: memory simulator, L3 cache, clusters and memory perf counters
+public:
+  struct PerfStats { // snapshot returned by perf_stats()
+    uint64_t mem_reads;   // requests sent on the memsim request port with write == false
+    uint64_t mem_writes;  // requests sent on the memsim request port with write == true
+    uint64_t mem_latency; // pending-read count accumulated once per tick in run()
+    CacheSim::PerfStats l3cache;
+    Cluster::PerfStats clusters; // sum over all clusters
+
+    PerfStats()
+      : mem_reads(0)
+      , mem_writes(0)
+      , mem_latency(0)
+    {}
+  };
+
+  ProcessorImpl(const Arch& arch);
+  ~ProcessorImpl();
+  // Passes `mem` to every cluster.
+  void attach_ram(RAM* mem);
+  // Resets the platform and counters, then ticks until done; returns the OR-ed exit codes.
+  int run(bool riscv_test);
+  // Writes `value` at `addr` into dcrs_.
+  void write_dcr(uint32_t addr, uint32_t value);
+  // Builds a PerfStats from the current counters, the L3 cache and all clusters.
+  ProcessorImpl::PerfStats perf_stats() const;
+
+private:
+  // Zeroes the four perf_mem_* counters below.
+  void reset();
+
+  const Arch& arch_; // reference member: the Arch passed to the constructor must outlive this object
+  std::vector<std::shared_ptr<Cluster>> clusters_; // one entry per arch.num_clusters()
+  DCRS dcrs_; // passed to every Cluster::Create call; written by write_dcr()
+  MemSim::Ptr   memsim_;
+  CacheSim::Ptr l3cache_;
+  uint64_t perf_mem_reads_;
+  uint64_t perf_mem_writes_;
+  uint64_t perf_mem_latency_;
+  uint64_t perf_mem_pending_reads_; // incremented per read request, decremented per memory response
+};
+
+}
--- a/sim/simx/scoreboard.h
+++ b/sim/simx/scoreboard.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "pipeline.h"
@@ -6,20 +19,15 @@
 namespace vortex {

 class Scoreboard {
-private:
+public:
+
    struct reg_use_t {
        RegType  type;
        uint32_t reg;        
        uint64_t owner;
    };
-
-    std::vector<RegMask> in_use_iregs_;
-    std::vector<RegMask> in_use_fregs_;
-    std::vector<RegMask> in_use_vregs_;
-    std::unordered_map<uint32_t, uint64_t> owners_; 
-
-public:    
-    Scoreboard(const ArchDef &arch) 
+        
+    Scoreboard(const Arch &arch) 
        : in_use_iregs_(arch.num_warps())
        , in_use_fregs_(arch.num_warps())
        , in_use_vregs_(arch.num_warps())
@@ -84,8 +92,7 @@ public:
    }
    
    void reserve(pipeline_trace_t* state) {
-        if (!state->wb)
-            return;  
+        assert(state->wb);  
        switch (state->rdest_type) {
        case RegType::Integer:            
            in_use_iregs_.at(state->wid).set(state->rdest);
@@ -105,8 +112,7 @@ public:
    }

    void release(pipeline_trace_t* state) {
-        if (!state->wb)
-            return;       
+        assert(state->wb);      
        switch (state->rdest_type) {
        case RegType::Integer:
            in_use_iregs_.at(state->wid).reset(state->rdest);
@@ -123,6 +129,13 @@ public:
        uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
        owners_.erase(tag);
    }
+
+private:
+
+    std::vector<RegMask> in_use_iregs_;
+    std::vector<RegMask> in_use_fregs_;
+    std::vector<RegMask> in_use_vregs_;
+    std::unordered_map<uint32_t, uint64_t> owners_;
 };

 }
--- a/sim/simx/shared_mem.cpp
+++ b/sim/simx/shared_mem.cpp
@@ -0,0 +1,138 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "shared_mem.h"
+#include "core.h"
+#include <bitmanip.h>
+#include <vector>
+#include "types.h"
+
+using namespace vortex;
+
+class SharedMem::Impl { // private implementation of SharedMem: RAM-backed storage plus a per-tick bank-conflict model
+protected:
+    SharedMem* simobject_;           // SharedMem that owns this Impl; tick() uses its Inputs/Outputs ports
+    Config    config_;               // copy of the configuration given at construction
+    RAM       ram_;                  // backing store, constructed with (capacity, capacity)
+    uint32_t  bank_sel_addr_start_;  // lowest address bit used to select a bank (always 0 here)
+    uint32_t  bank_sel_addr_end_;    // highest address bit used to select a bank
+    PerfStats perf_stats_;
+
+    uint64_t to_local_addr(uint64_t addr) { // keeps only the low log2ceil(capacity / line_size) bits of addr
+        uint32_t total_lines = config_.capacity / config_.line_size; // NOTE(review): width comes from the line count, not capacity - confirm intended
+        uint32_t line_bits = log2ceil(total_lines);
+        // no offset bits at all: return 0 rather than letting the unsigned line_bits-1 wrap around
+        return (line_bits != 0) ? uint32_t(bit_getw(addr, 0, line_bits-1)) : 0u;
+    }
+
+public:
+    Impl(SharedMem* simobject, const Config& config) 
+        : simobject_(simobject)
+        , config_(config)
+        , ram_(config.capacity, config.capacity)
+        , bank_sel_addr_start_(0)
+        , bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1) // wraps (unsigned) if log2ceil() yields 0; tick() only uses it when num_banks > 1
+    {}
+
+    virtual ~Impl() {}
+    // Clears all perf counters.
+    void reset() {
+        perf_stats_ = PerfStats();
+    }
+    // Copies `size` bytes from the backing RAM at the local form of `addr` into `data`.
+    void read(void* data, uint64_t addr, uint32_t size) {
+        auto s_addr = to_local_addr(addr);        
+        DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
+        ram_.read(data, s_addr, size);
+    }
+    // Copies `size` bytes from `data` into the backing RAM at the local form of `addr`.
+    void write(const void* data, uint64_t addr, uint32_t size) {
+        auto s_addr = to_local_addr(addr);        
+        DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
+        ram_.write(data, s_addr, size);
+    }
+    // Serves at most one request per bank per tick; a conflicting request stays queued for a later tick.
+    void tick() {
+        std::vector<bool> in_used_banks(config_.num_banks);
+        for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
+            auto& core_req_port = simobject_->Inputs.at(req_id);            
+            if (core_req_port.empty())
+                continue;
+
+            auto& core_req = core_req_port.front();
+            // the old `start <= end` test was always true for unsigned values with start == 0
+            uint32_t bank_id = 0;
+            if (config_.num_banks > 1) { // a single bank has no select bits: everything maps to bank 0
+                bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
+            }
+
+            // bank conflict check
+            if (in_used_banks.at(bank_id)) {
+                ++perf_stats_.bank_stalls;
+                continue; // request is not popped, so it is retried on a later tick
+            }
+
+            in_used_banks.at(bank_id) = true;
+
+            if (!core_req.write || config_.write_reponse) {
+                // send response
+                MemRsp core_rsp{core_req.tag, core_req.cid};
+                simobject_->Outputs.at(req_id).send(core_rsp, 1);
+            }
+
+            // update perf counters
+            perf_stats_.reads += !core_req.write;            
+            perf_stats_.writes += core_req.write;
+
+            // remove input
+            core_req_port.pop();
+        }
+    }
+
+    const PerfStats& perf_stats() const { 
+        return perf_stats_; 
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+SharedMem::SharedMem(const SimContext& ctx, const char* name, const Config& config) // creates config.num_reqs input and output ports plus the Impl
+    : SimObject<SharedMem>(ctx, name)   
+    , Inputs(config.num_reqs, this)
+    , Outputs(config.num_reqs, this)
+    , impl_(new Impl(this, config)) // released with delete in the destructor
+{}
+
+SharedMem::~SharedMem() { // frees the Impl allocated in the constructor
+    delete impl_;
+}
+
+void SharedMem::reset() { // forwards to Impl::reset(), which clears the perf counters
+    impl_->reset();
+}
+
+void SharedMem::read(void* data, uint64_t addr, uint32_t size) { // forwards to Impl::read(): `size` bytes from shared memory into `data`
+    impl_->read(data, addr, size);
+}
+
+void SharedMem::write(const void* data, uint64_t addr, uint32_t size) { // forwards to Impl::write(): `size` bytes from `data` into shared memory
+    impl_->write(data, addr, size);
+}
+
+void SharedMem::tick() { // forwards to Impl::tick(), which processes the queued requests on the input ports
+    impl_->tick();
+}
+
+const SharedMem::PerfStats& SharedMem::perf_stats() const { // returns a reference to the counters held inside the Impl
+    return impl_->perf_stats();
+}
--- a/sim/simx/shared_mem.h
+++ b/sim/simx/shared_mem.h
@@ -0,0 +1,72 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "types.h"
+
+namespace vortex {
+
+class SharedMem : public SimObject<SharedMem> { // simulation object modelling a banked shared memory (logic lives in Impl)
+public:
+  struct Config {
+    uint32_t capacity;   // size handed to the backing RAM; NOTE(review): presumably bytes - confirm
+    uint32_t line_size;  // capacity / line_size gives the line count used for local addressing
+    uint32_t num_reqs;   // number of input ports and of output ports
+    uint32_t num_banks;  // number of banks tracked for conflicts in tick()
+    bool write_reponse;  // when true, writes are answered with a response as well (reads always are)
+  };
+
+  struct PerfStats {
+    uint64_t reads;       // served read requests
+    uint64_t writes;      // served write requests
+    uint64_t bank_stalls; // requests postponed because their bank was already used in the same tick
+
+    PerfStats() 
+      : reads(0)
+      , writes(0)
+      , bank_stalls(0)
+    {}
+    // Adds the counters of rhs to this object, field by field.
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->reads += rhs.reads;
+      this->writes += rhs.writes;
+      this->bank_stalls += rhs.bank_stalls;
+      return *this;
+    }
+  };
+
+  std::vector<SimPort<MemReq>> Inputs;  // request ports, one per requester (config.num_reqs)
+  std::vector<SimPort<MemRsp>> Outputs; // response ports, same index as the matching input
+
+  SharedMem(const SimContext& ctx, const char* name, const Config& config);    
+  virtual ~SharedMem();
+  // Clears the perf counters.
+  void reset();
+  // Reads `size` bytes at `addr` from the shared memory contents into `data`.
+  void read(void* data, uint64_t addr, uint32_t size);
+  // Writes `size` bytes from `data` into the shared memory contents at `addr`.
+  void write(const void* data, uint64_t addr, uint32_t size);
+  // Processes the requests currently queued on Inputs (one per bank per call).
+  void tick();
+
+  const PerfStats& perf_stats() const;
+
+protected:
+
+  class Impl;
+  Impl* impl_; // allocated in the constructor, deleted in the destructor
+};
+
+}
--- a/sim/simx/sharedmem.h
+++ b/sim/simx/sharedmem.h
@@ -1,96 +0,0 @@
-#pragma once
-
-#include <simobject.h>
-#include <bitmanip.h>
-#include <vector>
-#include "types.h"
-
-namespace vortex {
-
-class Core;
-
-class SharedMem : public SimObject<SharedMem> {
-public:
-    struct Config {
-        uint32_t num_reqs;
-        uint32_t num_banks; 
-        uint32_t bank_offset;
-        uint32_t latency;
-        bool     write_reponse;
-    };
-
-    struct PerfStats {
-        uint64_t reads;
-        uint64_t writes;
-        uint64_t bank_stalls;
-
-        PerfStats() 
-            : reads(0)
-            , writes(0)
-            , bank_stalls(0)
-        {}
-    };
-
-    std::vector<SimPort<MemReq>> Inputs;
-    std::vector<SimPort<MemRsp>> Outputs;
-
-    SharedMem(const SimContext& ctx, const char* name, const Config& config) 
-        : SimObject<SharedMem>(ctx, name)
-        , Inputs(config.num_reqs, this)
-        , Outputs(config.num_reqs, this)
-        , config_(config)
-        , bank_sel_addr_start_(config.bank_offset)
-        , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
-    {}    
-    
-    virtual ~SharedMem() {}
-
-    void reset() {
-        perf_stats_ = PerfStats();
-    }
-
-    void tick() {
-        std::vector<bool> in_used_banks(config_.num_banks);
-        for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
-            auto& core_req_port = this->Inputs.at(req_id);            
-            if (core_req_port.empty())
-                continue;
-
-            auto& core_req = core_req_port.front();
-
-            uint32_t bank_id = (uint32_t)bit_getw(
-                core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
-
-            // bank conflict check
-            if (in_used_banks.at(bank_id))
-                continue;
-
-            in_used_banks.at(bank_id) = true;
-
-            if (!core_req.write || config_.write_reponse) {
-                // send response
-                MemRsp core_rsp{core_req.tag, core_req.core_id};
-                this->Outputs.at(req_id).send(core_rsp, 1);
-            }
-
-            // update perf counters
-            perf_stats_.reads += !core_req.write;            
-            perf_stats_.writes += core_req.write;
-
-            // remove input
-            core_req_port.pop();
-        }
-    }
-
-    const PerfStats& perf_stats() const { 
-        return perf_stats_; 
-    }
-
-protected:
-    Config    config_;
-    uint32_t  bank_sel_addr_start_;
-    uint32_t  bank_sel_addr_end_;
-    PerfStats perf_stats_;
-};
-
-}
--- a/sim/simx/tex_unit.cpp
+++ b/sim/simx/tex_unit.cpp
@@ -1,100 +0,0 @@
-#include "tex_unit.h"
-#include "core.h"
-#include <texturing.h>
-#include <VX_config.h>
-
-using namespace vortex;
-using namespace cocogfx;
-
-enum class FilterMode {
-  Point,
-  Bilinear,
-  Trilinear,
-};
-
-TexUnit::TexUnit(Core* core) : core_(core) {}
-
-TexUnit::~TexUnit() {}
-
-void TexUnit::clear() {
-  for (auto& state : states_) {
-    state = 0;
-  }
-}
-
-uint32_t TexUnit::get_state(uint32_t state) {
-  return states_.at(state);
-}
-  
-void TexUnit::set_state(uint32_t state, uint32_t value) {
-  states_.at(state) = value;
-}
-
-uint32_t TexUnit::read(int32_t u, 
-                       int32_t v, 
-                       int32_t lod, 
-                       std::vector<mem_addr_size_t>* mem_addrs) {
-  //--
-  auto xu = Fixed<TEX_FXD_FRAC>::make(u);
-  auto xv = Fixed<TEX_FXD_FRAC>::make(v);
-  uint32_t base_addr  = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
-  uint32_t log_width  = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
-  uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
-  auto format         = (TexFormat)states_.at(TEX_STATE_FORMAT);    
-  auto filter         = (FilterMode)states_.at(TEX_STATE_FILTER);    
-  auto wrapu          = (WrapMode)states_.at(TEX_STATE_WRAPU);
-  auto wrapv          = (WrapMode)states_.at(TEX_STATE_WRAPV);
-
-  auto stride = Stride(format);
-  
-  switch (filter) {
-  case FilterMode::Bilinear: {
-    // addressing
-    uint32_t offset00, offset01, offset10, offset11;
-    uint32_t alpha, beta;
-    TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, 
-      &offset00, &offset01, &offset10, &offset11, &alpha, &beta);
-
-    uint32_t addr00 = base_addr + offset00 * stride;
-    uint32_t addr01 = base_addr + offset01 * stride;
-    uint32_t addr10 = base_addr + offset10 * stride;
-    uint32_t addr11 = base_addr + offset11 * stride;
-
-    // memory lookup
-    uint32_t texel00(0), texel01(0), texel10(0), texel11(0);
-    core_->dcache_read(&texel00, addr00, stride);
-    core_->dcache_read(&texel01, addr01, stride);
-    core_->dcache_read(&texel10, addr10, stride);
-    core_->dcache_read(&texel11, addr11, stride);
-
-    mem_addrs->push_back({addr00, stride});
-    mem_addrs->push_back({addr01, stride});
-    mem_addrs->push_back({addr10, stride});
-    mem_addrs->push_back({addr11, stride});
-
-    // filtering
-    auto color = TexFilterLinear(
-      format, texel00, texel01, texel10, texel11, alpha, beta);
-    return color;
-  }
-  case FilterMode::Point: {
-    // addressing
-    uint32_t offset;
-    TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
-    
-    uint32_t addr = base_addr + offset * stride;
-
-    // memory lookup
-    uint32_t texel(0);
-    core_->dcache_read(&texel, addr, stride);
-    mem_addrs->push_back({addr, stride});
-
-    // filtering
-    auto color = TexFilterPoint(format, texel);
-    return color;
-  }
-  default:
-    std::abort();
-    return 0;
-  }
-}
--- a/sim/simx/tex_unit.h
+++ b/sim/simx/tex_unit.h
@@ -1,28 +0,0 @@
-#pragma once
-
-#include "types.h"
-
-namespace vortex {
-
-class Core;
-
-class TexUnit {
-public:
-    TexUnit(Core* core);
-    ~TexUnit();
-
-    void clear();
-
-    uint32_t get_state(uint32_t state);
-  
-    void set_state(uint32_t state, uint32_t value);
-
-    uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
-
-private:
-
-    std::array<uint32_t, NUM_TEX_STATES> states_;
-    Core* core_;
-};
-
-}
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <stdint.h>
@@ -5,31 +18,42 @@
 #include <queue>
 #include <unordered_map>
 #include <util.h>
+#include <stringutil.h>
 #include <VX_config.h>
 #include <simobject.h>
+#include "uuid_gen.h"
+#include "debug.h"

 namespace vortex {

 typedef uint8_t Byte;
-#if XLEN == 32
+#if (XLEN == 32)
 typedef uint32_t Word;
 typedef int32_t  WordI;
 typedef uint64_t DWord;
 typedef int64_t  DWordI;
-#elif XLEN == 64
+typedef uint32_t WordF;
+#elif (XLEN == 64)
 typedef uint64_t Word;
 typedef int64_t  WordI;
 typedef __uint128_t DWord;
 typedef __int128_t DWordI;
+typedef uint64_t WordF;
 #else
 #error unsupported XLEN
 #endif

-typedef uint64_t FWord;
+#define MAX_NUM_CORES   1024 // bit width of CoreMask
+#define MAX_NUM_THREADS 32   // bit width of ThreadMask
+#define MAX_NUM_WARPS   32   // bit width of WarpMask
+#define MAX_NUM_REGS    32   // bit width of RegMask

-typedef std::bitset<32> RegMask;
-typedef std::bitset<32> ThreadMask;
-typedef std::bitset<32> WarpMask;
+typedef std::bitset<MAX_NUM_CORES>   CoreMask;   // one bit per core
+typedef std::bitset<MAX_NUM_REGS>    RegMask;    // one bit per register
+typedef std::bitset<MAX_NUM_THREADS> ThreadMask; // one bit per thread
+typedef std::bitset<MAX_NUM_WARPS>   WarpMask;   // one bit per warp
+// Map from a 32-bit key to a 32-bit value; NOTE(review): presumably CSR address -> CSR value - confirm at the use sites.
+typedef std::unordered_map<uint32_t, uint32_t> CSRs;

 ///////////////////////////////////////////////////////////////////////////////

@@ -40,8 +64,8 @@ enum class RegType {
  Vector
 };

-inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
-  switch (clss) {
+inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
+  switch (type) {
  case RegType::None: break;
  case RegType::Integer: os << "x"; break;  
  case RegType::Float:   os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
 ///////////////////////////////////////////////////////////////////////////////

 enum class ExeType {
-  NOP,
  ALU,
  LSU,
-  CSR,
  FPU,
-  GPU,
+  SFU,
  MAX,
 };

 inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
  switch (type) {
-  case ExeType::NOP: os << "NOP"; break;
  case ExeType::ALU: os << "ALU"; break;
  case ExeType::LSU: os << "LSU"; break;
-  case ExeType::CSR: os << "CSR"; break;
  case ExeType::FPU: os << "FPU"; break;
-  case ExeType::GPU: os << "GPU"; break;
+  case ExeType::SFU: os << "SFU"; break;
  case ExeType::MAX: break;
  }
  return os;
@@ -82,8 +102,7 @@ enum class AluType {
  BRANCH,
  SYSCALL,
  IMUL,
-  IDIV,    
-  CMOV,
+  IDIV
 };

 inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
  case AluType::SYSCALL: os << "SYSCALL"; break;
  case AluType::IMUL:    os << "IMUL"; break;
  case AluType::IDIV:    os << "IDIV"; break;
-  case AluType::CMOV:    os << "CMOV"; break;
  }
  return os;
 }
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
 enum class LsuType {
  LOAD,
  STORE,
-  FENCE,
-  PREFETCH,    
+  FENCE
 };

 inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
  switch (type) {
-  case LsuType::LOAD:     os << "LOAD"; break;
-  case LsuType::STORE:    os << "STORE"; break;
-  case LsuType::FENCE:    os << "FENCE"; break;
-  case LsuType::PREFETCH: os << "PREFETCH"; break;
+  case LsuType::LOAD:  os << "LOAD"; break;
+  case LsuType::STORE: os << "STORE"; break;
+  case LsuType::FENCE: os << "FENCE"; break;
  }
  return os;
 }
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
  uint32_t size;
 };

-inline AddrType get_addr_type(Word addr, uint32_t size) {
-  __unused (size);
-  if (SM_ENABLE) {
-    if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
-    &&  addr < SMEM_BASE_ADDR) {      
-      assert((addr + size) <= SMEM_BASE_ADDR);
-      return AddrType::Shared;
-    }
-  }
-  if (addr >= IO_BASE_ADDR) {
-     return AddrType::IO;
-  }
-  return AddrType::Global;
-}
-
 ///////////////////////////////////////////////////////////////////////////////

 enum class FpuType {
@@ -179,23 +180,37 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {

 ///////////////////////////////////////////////////////////////////////////////

-enum class GpuType {
+enum class SfuType { // operation selector for SFU instructions (presumably used when ExeType is SFU — confirm)
  TMC,
  WSPAWN,
  SPLIT,
  JOIN,
  BAR,
+  PRED,
+  CSRRW,  // CSRRW/CSRRS/CSRRC: presumably the RISC-V CSR read-write / read-set / read-clear operations — confirm
+  CSRRS,
+  CSRRC,
  TEX,
+  RASTER,
+  ROP,    
+  CMOV  
 };

-inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
+inline std::ostream &operator<<(std::ostream &os, const SfuType& type) { // streams the enumerator's name and returns os; there is no default case, so an unlisted value writes nothing
  switch (type) {
-  case GpuType::TMC:    os << "TMC"; break;
-  case GpuType::WSPAWN: os << "WSPAWN"; break;
-  case GpuType::SPLIT:  os << "SPLIT"; break;
-  case GpuType::JOIN:   os << "JOIN"; break;
-  case GpuType::BAR:    os << "BAR"; break;
-  case GpuType::TEX:    os << "TEX"; break;
+  case SfuType::TMC:    os << "TMC"; break;
+  case SfuType::WSPAWN: os << "WSPAWN"; break;
+  case SfuType::SPLIT:  os << "SPLIT"; break;
+  case SfuType::JOIN:   os << "JOIN"; break;
+  case SfuType::BAR:    os << "BAR"; break;
+  case SfuType::PRED:   os << "PRED"; break;
+  case SfuType::CSRRW:  os << "CSRRW"; break;
+  case SfuType::CSRRS:  os << "CSRRS"; break;
+  case SfuType::CSRRC:  os << "CSRRC"; break;
+  case SfuType::TEX:    os << "TEX"; break;
+  case SfuType::RASTER: os << "RASTER"; break;
+  case SfuType::ROP:    os << "ROP"; break;
+  case SfuType::CMOV:   os << "CMOV"; break;
  }
  return os;
 }
@@ -218,31 +233,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
 ///////////////////////////////////////////////////////////////////////////////

 struct MemReq {
-    uint64_t addr;
-    bool write;
-    bool non_cacheable;
-    uint32_t tag;
-    uint32_t core_id;    
-    uint64_t uuid;
+  uint64_t addr;  // request address
+  bool write;  // true for a write request, false for a read
+  AddrType type;  // address class of the request; the constructor defaults it to AddrType::Global
+  uint32_t tag;  // NOTE(review): 32-bit, although the constructor takes a 64-bit _tag and MemRsp::tag is 64-bit
+  uint32_t cid;    // presumably the id of the issuing core — confirm
+  uint64_t uuid;  // identifier printed as "(#uuid)" by operator<<

-    MemReq(uint64_t _addr = 0, 
-           bool _write = false,
-           bool _non_cacheable = false,
-           uint64_t _tag = 0, 
-           uint32_t _core_id = 0,
-           uint64_t _uuid = 0
-    )   : addr(_addr)
-        , write(_write)
-        , non_cacheable(_non_cacheable)
-        , tag(_tag)
-        , core_id(_core_id)
-        , uuid(_uuid)
-    {}
+  MemReq(uint64_t _addr = 0, // every argument is defaulted, so MemReq is default-constructible
+          bool _write = false,
+          AddrType _type = AddrType::Global,
+          uint64_t _tag = 0, // narrowed to the 32-bit tag member in the initializer list below
+          uint32_t _cid = 0,
+          uint64_t _uuid = 0
+  ) : addr(_addr)
+    , write(_write)
+    , type(_type)
+    , tag(_tag)
+    , cid(_cid)
+    , uuid(_uuid)
+  {}
 };

 inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
  os << "mem-" << (req.write ? "wr" : "rd") << ": ";
-  os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
+  os << "addr=0x" << std::hex << req.addr << ", type=" << req.type; // std::hex is still active while req.type is streamed
+  os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid; // switches back to decimal; the stream is left in decimal mode on return
  os << " (#" << std::dec << req.uuid << ")";
  return os;
 }
@@ -250,18 +266,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
 ///////////////////////////////////////////////////////////////////////////////

 struct MemRsp {
-    uint64_t tag;    
-    uint32_t core_id;
-    uint64_t uuid;
-    MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
-      : tag (_tag) 
-      , core_id(_core_id)
-      , uuid(_uuid)
-    {}
+  uint64_t tag;    // NOTE(review): 64-bit here, whereas MemReq::tag is 32-bit
+  uint32_t cid;  // presumably the id of the core the response belongs to — confirm
+  uint64_t uuid;  // identifier printed as "(#uuid)" by operator<<
+  
+  MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0) // every argument is defaulted, so MemRsp is default-constructible
+    : tag (_tag) 
+    , cid(_cid)
+    , uuid(_uuid)
+  {}
 };

 inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
-  os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
+  os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid; // NOTE(review): tag and cid are printed in whatever base the stream currently has; std::dec is only applied before uuid
  os << " (#" << std::dec << rsp.uuid << ")";
  return os;
 }
@@ -270,10 +287,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {

 template <typename T>
 class HashTable {
-private:
-  std::vector<std::pair<bool, T>> entries_;
-  uint32_t size_;
-
 public:    
  HashTable(uint32_t capacity)
    : entries_(capacity)
@@ -336,92 +349,180 @@ public:
    }
    size_ = 0;
  }
+
+private:
+  std::vector<std::pair<bool, T>> entries_;
+  uint32_t size_;
 };

 ///////////////////////////////////////////////////////////////////////////////

-template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
+template <typename Req, typename Rsp> // tick() rewrites the 'tag' member of both Req and Rsp, so both types must provide one
 class Switch : public SimObject<Switch<Req, Rsp>> {
-private:
-  ArbiterType type_;
-  uint32_t delay_;  
-  uint32_t cursor_;
-  uint32_t tag_shift_;
-
 public:
+  std::vector<SimPort<Req>>  ReqIn;  // requester side: one request port per input
+  std::vector<SimPort<Rsp>>  RspIn;  // requester side: tick() sends routed responses out through these ports
+
+  std::vector<SimPort<Req>>  ReqOut;  // downstream side: granted requests are sent through these ports
+  std::vector<SimPort<Rsp>>  RspOut;  // downstream side: tick() reads incoming responses from these ports
+
  Switch(
    const SimContext& ctx, 
    const char* name, 
    ArbiterType type, 
-    uint32_t num_inputs, 
+    uint32_t num_inputs = 1, // asserted below: <= 32 and >= num_outputs
+    uint32_t num_outputs = 1, // asserted below: <= 32
    uint32_t delay = 1
  ) 
-    : SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)    
+    : SimObject<Switch<Req, Rsp>>(ctx, name)    
+    , ReqIn(num_inputs,   this)
+    , RspIn(num_inputs,   this)
+    , ReqOut(num_outputs, this)    
+    , RspOut(num_outputs, this)
    , type_(type)
    , delay_(delay)
-    , cursor_(0)
-    , tag_shift_(log2ceil(num_inputs))
-    , ReqIn(num_inputs, this)
-    , ReqOut(this)
-    , RspIn(this)    
-    , RspOut(num_inputs, this)
+    , cursors_(num_outputs, 0) // one arbitration cursor per output port
+    , lg_num_reqs_(log2ceil(num_inputs / num_outputs)) // tag bits used for the slot index; NOTE(review): integer division truncates when num_inputs is not a multiple of num_outputs, which would leave trailing inputs unserved — confirm callers
  {
-    assert(delay_ != 0);
-    assert(num_inputs <= MaxInputs);
-    if (num_inputs == 1) {
-      // bypass
-      ReqIn.at(0).bind(&ReqOut);
-      RspIn.bind(&RspOut.at(0));
+    assert(delay != 0);    
+    assert(num_inputs <= 32);
+    assert(num_outputs <= 32);
+    assert(num_inputs >= num_outputs);
+
+    if (num_inputs == num_outputs) {
+      // bypass mode: equal port counts, so port i is bound straight through and tick() returns early
+      for (uint32_t i = 0; i < num_inputs; ++i) {
+        ReqIn.at(i).bind(&ReqOut.at(i));
+        RspOut.at(i).bind(&RspIn.at(i));
+      }
    }
  }

  void reset() {
-    cursor_ = 0;
+    for (auto& cursor : cursors_) {
+      cursor = 0;
+    }
  }

-  void tick() {  
-    if (ReqIn.size() == 1)
+  void tick() { // per output port: grants at most one request and forwards at most one response
+    uint32_t I = ReqIn.size(); // number of input ports
+    uint32_t O = ReqOut.size(); // number of output ports
+    uint32_t R = 1 << lg_num_reqs_; // input slots per output port (a power of two)
+
+    // skip bypass mode (ports were bound directly in the constructor)
+    if (I == O)
      return;
        
-    // process incomming requests    
-    for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {      
-      uint32_t j = (cursor_ + i) % n;
-      auto& req_in = ReqIn.at(j);      
-      if (!req_in.empty()) {
-        auto& req = req_in.front();
-        if (tag_shift_) {
-          req.tag = (req.tag << tag_shift_) | j;
+    // process incoming requests        
+    for (uint32_t o = 0; o < O; ++o) {
+      for (uint32_t r = 0; r < R; ++r) {
+        uint32_t i = (cursors_.at(o) + r) & (R-1); // slot within output o's group, scanned from the cursor with wrap-around
+        uint32_t j = o * R + i; // global input port index
+        if (j >= I) // this slot has no matching input port
+          continue;
+        
+        auto& req_in = ReqIn.at(j);
+        if (!req_in.empty()) {
+          auto& req = req_in.front();
+          if (lg_num_reqs_ != 0) {
+            req.tag = (req.tag << lg_num_reqs_) | i; // store the slot index in the low tag bits for response routing; NOTE(review): the top lg_num_reqs_ bits of the tag are shifted out
+          }
+          DT(4, this->name() << "-" << req);
+          ReqOut.at(o).send(req, delay_);                
+          req_in.pop();
+          this->update_cursor(o, i);
+          break; // one grant per output port per tick
        }
-        ReqOut.send(req, delay_);                
-        req_in.pop();
-        this->update_cursor(j);
-        break;
      }
-    } 
-
-    // process incoming reponses
-    if (!RspIn.empty()) {
-      auto& rsp = RspIn.front();    
-      uint32_t port_id = 0;
-      if (tag_shift_) {
-        port_id = rsp.tag & ((1 << tag_shift_)-1);
-        rsp.tag >>= tag_shift_;
-      }      
-      RspOut.at(port_id).send(rsp, 1);
-      RspIn.pop();
+      
+      // process incoming responses
+      if (!RspOut.at(o).empty()) {
+        auto& rsp = RspOut.at(o).front();
+        uint32_t i = 0;
+        if (lg_num_reqs_ != 0) {
+          i = rsp.tag & (R-1); // recover the slot index stored by the request path
+          rsp.tag >>= lg_num_reqs_; // strip the slot index from the tag again
+        }      
+        DT(4, this->name() << "-" << rsp);
+        uint32_t j = o * R + i; // input port the response is routed back to
+        RspIn.at(j).send(rsp, 1);      
+        RspOut.at(o).pop();
+      }
    }
  }

-  void update_cursor(uint32_t grant) {
+  void update_cursor(uint32_t index, uint32_t grant) { // index = output port, grant = slot just served; only RoundRobin moves the cursor, otherwise it stays where reset() left it
    if (type_ == ArbiterType::RoundRobin) {
-      cursor_ = grant + 1;
+      cursors_.at(index) = grant + 1; // may equal R; tick() masks the sum with (R-1)
    }
  }

-  std::vector<SimPort<Req>>  ReqIn;
-  SimPort<Req>              ReqOut;
-  SimPort<Rsp>               RspIn;    
-  std::vector<SimPort<Rsp>> RspOut;
+private:
+  ArbiterType type_;  // arbitration policy, consulted in update_cursor()
+  uint32_t delay_;  
+  std::vector<uint32_t> cursors_;  // per-output starting slot for the request scan
+  uint32_t lg_num_reqs_;  // log2 of the slot count per output; also the tag shift amount
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class SMemDemux : public SimObject<SMemDemux> { // routes each MemReq to ReqSm or ReqDc by its AddrType and forwards responses from both sides to RspIn
+public:
+  SimPort<MemReq>  ReqIn;  // incoming requests
+  SimPort<MemRsp>  RspIn;  // tick() sends responses from either side out through this port
+
+  SimPort<MemReq>  ReqSm;  // receives requests whose type is AddrType::Shared
+  SimPort<MemRsp>  RspSm;  // responses arriving on the Sm side
+
+  SimPort<MemReq>  ReqDc;  // receives every request that is not AddrType::Shared
+  SimPort<MemRsp>  RspDc;  // responses arriving on the Dc side
+
+  SMemDemux(
+    const SimContext& ctx, 
+    const char* name, 
+    uint32_t delay = 1 // passed as the second argument of send() for forwarded requests; NOTE(review): not asserted non-zero here, unlike Switch
+  ) : SimObject<SMemDemux>(ctx, name)    
+    , ReqIn(this)
+    , RspIn(this)
+    , ReqSm(this)
+    , RspSm(this)
+    , ReqDc(this)
+    , RspDc(this)
+    , delay_(delay)
+  {}
+
+  void reset() {} // nothing is reset here
+
+  void tick() { // forwards at most one request and at most one response per side on each call
+    // process incoming requests  
+    if (!ReqIn.empty()) {
+      auto& req = ReqIn.front();
+      DT(4, this->name() << "-" << req);
+      if (req.type == AddrType::Shared) {
+        ReqSm.send(req, delay_);
+      } else {
+        ReqDc.send(req, delay_);
+      }
+      ReqIn.pop();
+    }   
+      
+    // process incoming responses (Sm side first, then Dc; both may be sent to RspIn in the same tick)
+    if (!RspSm.empty()) {
+      auto& rsp = RspSm.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspSm.pop();
+    }
+    if (!RspDc.empty()) {
+      auto& rsp = RspDc.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspDc.pop();
+    }
+  }
+
+private:
+  uint32_t delay_;  // value given to send() when forwarding requests
 };

 }
--- a/sim/simx/warp.cpp
+++ b/sim/simx/warp.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,21 +23,25 @@

 using namespace vortex;

-Warp::Warp(Core *core, uint32_t id)
-    : id_(id)
+Warp::Warp(Core *core, uint32_t warp_id) // sizes the per-thread register files from core->arch(), then calls reset()
+    : warp_id_(warp_id)
+    , arch_(core->arch()) // bound to the object returned by core->arch()
    , core_(core)
    , ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
-    , freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
+    , freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs())) // FP registers are stored as raw 64-bit values
    , vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
 {
-  this->clear();
+  this->reset(); // NOTE(review): reset() reads core_->dcrs(); presumably the core's DCRs are already usable at this point — confirm
 }

-void Warp::clear() {
-  active_ = false;
-  PC_ = STARTUP_ADDR;
+void Warp::reset() { // reloads PC_ from the startup-address DCRs and clears the thread mask, instruction counter and registers
+  PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0); // low 32 bits of the startup address
+#if (XLEN == 64)
+  PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_; // upper 32 bits; NOTE(review): the #if needs XLEN to be a preprocessor macro (an unknown identifier evaluates to 0) — confirm where it is defined
+#endif
  tmask_.reset();  
-  for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i) {
+  issued_instrs_ = 0;
+  for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
    for (auto& reg : ireg_file_.at(i)) {
      reg = 0;
    }
@@ -35,31 +52,44 @@ void Warp::clear() {
      reg = 0;
    }
  }
+  uui_gen_.reset(); // also resets the UUID generator used by eval()
 }

-void Warp::eval(pipeline_trace_t *trace) {
+pipeline_trace_t* Warp::eval() { // fetches, decodes and executes the instruction at PC_ and returns a newly allocated trace for it
  assert(tmask_.any());

-  DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
-  for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i)
-    DPN(2, tmask_.test(n-i-1));
-  DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
-
-  /* Fetch and decode. */    
+#ifndef NDEBUG
+  uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
+  uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_; // warp index across all cores
+  uint32_t instr_id  = instr_uuid & 0xffff; // low 16 bits of the generated value
+  uint32_t instr_ref = instr_uuid >> 16; // high 16 bits of the generated value
+  uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id; // layout: [63:32]=instr_ref, [31:16]=g_wid, [15:0]=instr_id; NOTE(review): (g_wid << 16) is a 32-bit shift, so g_wid bits above 15 are dropped
+#else
+  uint64_t uuid = 0; // NDEBUG builds do not generate per-instruction uuids
+#endif
+  
+  DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
+  for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
+    DPN(1, tmask_.test(i)); // thread 0 is printed first
+  DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);

+  // Fetch: read one 32-bit instruction word at PC_
  uint32_t instr_code = 0;
  core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
-  auto instr = core_->decoder().decode(instr_code);
+
+  // Decode: a null result is treated as fatal below
+  auto instr = core_->decoder_.decode(instr_code);
  if (!instr) {
-    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
+    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
    std::abort();
  }  

-  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
+  DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);

-  // Update trace
+  // Create trace
+  auto trace = new pipeline_trace_t(uuid, arch_); // raw allocation that is returned to the caller; NOTE(review): no delete is visible here, so the caller presumably owns and frees it
  trace->cid   = core_->id();
-  trace->wid   = id_;
+  trace->wid   = warp_id_;
  trace->PC    = PC_;
  trace->tmask = tmask_;
  trace->rdest = instr->getRDest();
@@ -68,18 +98,20 @@ void Warp::eval(pipeline_trace_t *trace) {
  // Execute
  this->execute(*instr, trace);

-  DP(4, "Register state:");
-  for (uint32_t i = 0; i < core_->arch().num_regs(); ++i) {
-    DPN(4, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
+  DP(5, "Register state:");
+  for (uint32_t i = 0; i < arch_.num_regs(); ++i) { // one output row per register index, one column per thread
+    DPN(5, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
    // Integer register file
-    for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
-      DPN(4, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
+    for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
+      DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' '); // XLEN/4 hex digits per value
    }
-    DPN(4, '|');
+    DPN(5, '|');
    // Floating point register file
-    for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
-      DPN(4, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
+    for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
+      DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' '); // always 16 hex digits
    }
-    DPN(4, std::endl);
+    DPN(5, std::endl);
  }  
+
+  return trace;
 }
--- a/sim/simx/warp.h
+++ b/sim/simx/warp.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #ifndef __WARP_H
 #define __WARP_H

@@ -7,28 +20,26 @@

 namespace vortex {

+class Arch;
 class Core;
 class Instr;
 class pipeline_trace_t;
+
 struct DomStackEntry {
  DomStackEntry(const ThreadMask &tmask, Word PC) 
    : tmask(tmask)
    , PC(PC)
-    , fallThrough(false)
-    , unanimous(false) 
+    , fallthrough(false) // entry that carries an explicit PC
  {}

-  DomStackEntry(const ThreadMask &tmask)
-      : tmask(tmask)
-      , PC(0)
-      , fallThrough(true)
-      , unanimous(false) 
+  DomStackEntry(const ThreadMask &tmask) // fall-through entry: only the thread mask is supplied
+    : tmask(tmask)
+    , fallthrough(true) // NOTE(review): PC is not initialized by this constructor and has no default member initializer, so it must not be read for fall-through entries
  {}

  ThreadMask tmask;
  Word PC;
-  bool fallThrough;
-  bool unanimous;
+  bool fallthrough;  // true when the entry was built without a PC
 };

 struct vtype {
@@ -40,72 +51,58 @@ struct vtype {

 class Warp {
 public:
-  Warp(Core *core, uint32_t id);
+  Warp(Core *core, uint32_t warp_id);

-  void clear();
-  
-  bool active() const {
-    return active_;
-  }
-
-  void suspend() {
-    active_ = false;
-  }
-
-  void activate() {
-    active_ = true;
-  }
-
-  std::size_t getActiveThreads() const {
-    if (active_)
-      return tmask_.count();
-    return 0;
-  }
+  void reset();

  uint32_t id() const {
-    return id_;
+    return warp_id_;
  }

-  uint32_t getPC() const {
+  Word getPC() const {
    return PC_;
  }

-  void setPC(uint32_t PC) {
+  void setPC(Word PC) {
    PC_ = PC;
  }

  void setTmask(size_t index, bool value) {
    tmask_.set(index, value);
-    active_ = tmask_.any();
  }

-  uint32_t getTmask() const {
-    if (active_)
-      return tmask_.to_ulong();
-    return 0;
+  uint64_t getTmask() const {
+    return tmask_.to_ulong();
  }

-  uint32_t getIRegValue(uint32_t reg) const {
+  Word getIRegValue(uint32_t reg) const {
    return ireg_file_.at(0).at(reg);
  }

-  void eval(pipeline_trace_t *);
+  uint64_t incr_instrs() {
+    return issued_instrs_++;
+  }
+
+  pipeline_trace_t* eval();

 private:

  void execute(const Instr &instr, pipeline_trace_t *trace);
+
+  UUIDGenerator uui_gen_;
  
-  uint32_t id_;
+  uint32_t warp_id_;
+  const Arch& arch_;
  Core *core_;
-  bool active_;
+  uint64_t issued_instrs_;
  
  Word PC_;
-  ThreadMask tmask_;  
-  
-  std::vector<std::vector<Word>> ireg_file_;
-  std::vector<std::vector<FWord>> freg_file_;
-  std::vector<std::vector<Byte>> vreg_file_;
-  std::stack<DomStackEntry> dom_stack_;
+  ThreadMask tmask_;
+
+  std::vector<std::vector<Word>>     ireg_file_;
+  std::vector<std::vector<uint64_t>> freg_file_;
+  std::vector<std::vector<Byte>>     vreg_file_;
+  std::stack<DomStackEntry>          ipdom_stack_;

  struct vtype vtype_;
  uint32_t vl_;