diff --git a/sim/common/simobject.h b/sim/common/simobject.h
index 68bccc87..487d385c 100644
--- a/sim/common/simobject.h
+++ b/sim/common/simobject.h
@@ -11,6 +11,128 @@ namespace vortex {
 
 class SimObjectBase;
 
+///////////////////////////////////////////////////////////////////////////////
+
+class SimPortBase {  // Type-independent base of a port: an owning module plus at most one peer link.
+public:  
+  virtual ~SimPortBase() {}
+  // Module pointer that was handed to the constructor.
+  SimObjectBase* module() const {
+    return module_;
+  }
+  // Port this one is linked to, or nullptr when no link exists.
+  SimPortBase* peer() const {
+    return peer_;
+  }
+  // True while a peer is linked.
+  bool connected() const {
+    return (peer_ != nullptr);
+  }
+
+protected:
+  SimPortBase(SimObjectBase* module)
+    : module_(module)
+    , peer_(nullptr)
+  {}
+  // Links this port to `peer`; the port must not be linked already.
+  void connect(SimPortBase* peer) {
+    assert(peer_ == nullptr);
+    peer_ = peer;
+  }
+  // Drops the link; the port must currently be linked.
+  void disconnect() {
+    assert(peer_ != nullptr);  // only a linked port can be unlinked
+    peer_ = nullptr;
+  }
+
+  SimPortBase& operator=(const SimPortBase&) = delete;
+
+  SimObjectBase* module_;
+  SimPortBase*   peer_;
+
+  template <typename U> friend class SlavePort;
+  template <typename U> friend class MasterPort;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename Pkt>
+class SimPort : public SimPortBase {  // Port carrying packets of type Pkt through a one-entry mailbox (data_ + valid_).
+public:
+  void send(const Pkt& pkt, uint64_t delay) const; // forwards to the linked peer if any, otherwise schedules a write into this port
+  // Moves the pending packet into *out and frees the mailbox; returns false (and leaves *out alone) when nothing is pending.
+  bool read(Pkt* out) {
+    if (!valid_)
+      return false;
+    *out = data_;
+    valid_ = false;
+    return true;
+  }
+
+protected:
+  SimPort(SimObjectBase* module)
+    : SimPortBase(module)
+    , valid_(false)
+  {}
+  // Stores one packet in the mailbox; asserts that the previous packet has already been read.
+  void write(const Pkt& data) {
+    assert(!valid_);
+    data_  = data;
+    valid_ = true;
+  }
+
+  SimPort& operator=(const SimPort&) = delete;
+
+  Pkt data_;  // most recently written packet
+  bool valid_;  // true while data_ holds a packet that has not been read yet
+
+  template <typename U> friend class SimPortEvent;  // SimPortEvent::fire() calls the protected write()
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename Pkt>
+class SlavePort : public SimPort<Pkt> {  // SimPort flavour whose bind() accepts only another SlavePort
+public:
+  SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}  // not explicit: cache.cpp builds port vectors as (count, module)
+
+  void bind(SlavePort<Pkt>* peer) {  // links this port to `peer`; connect() asserts that no link exists yet
+    this->connect(peer);
+  }
+
+  void unbind() {    // drops the current link through disconnect()
+    this->disconnect();
+  }
+
+protected:
+  SlavePort& operator=(const SlavePort&) = delete;  // ports cannot be assigned
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename Pkt>
+class MasterPort : public SimPort<Pkt> {  // SimPort flavour whose bind() accepts a SlavePort or another MasterPort
+public:
+  MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {}  // not explicit: cache.cpp builds port vectors as (count, module)
+
+  void bind(SlavePort<Pkt>* peer) {  // links this port to a slave port; connect() asserts that no link exists yet
+    this->connect(peer);
+  }
+
+  void bind(MasterPort<Pkt>* peer) {  // links this port to another master port under the same single-link rule
+    this->connect(peer);
+  }
+
+  void unbind() {    // drops the current link through disconnect()
+    this->disconnect();
+  }
+
+protected:
+  MasterPort& operator=(const MasterPort&) = delete;  // ports cannot be assigned
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
 class SimEventBase {
 public:
   typedef std::shared_ptr<SimEventBase> Ptr;
@@ -32,16 +154,16 @@ protected:
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename Pkt>
-class SimSimpleEvent : public SimEventBase {
+class SimCallEvent : public SimEventBase {
 public:
   typedef std::function<void (const Pkt&)> Func;
 
   template <typename... Args>
   static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
-    return std::make_shared<SimSimpleEvent>(func, pkt, delay);
+    return std::make_shared<SimCallEvent>(func, pkt, delay);
   }   
 
-  SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay) 
+  SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay) 
     : SimEventBase(delay)
     , func_(func)
     , pkt_(pkt)
@@ -61,167 +183,23 @@ protected:
 template <typename Pkt>
 class SimPortEvent : public SimEventBase {
 public:
-  typedef std::function<void (const Pkt&, uint32_t)> Func;
-
-  template <typename... Args>
-  static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) {
-    return std::make_shared<SimPortEvent>(func, pkt, port_id, delay);
+  static Ptr Create(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
+    return std::make_shared<SimPortEvent>(port, pkt, delay);
   }
 
-  SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) 
+  SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) 
     : SimEventBase(delay) 
-    , func_(func)
+    , port_(port)
     , pkt_(pkt)
-    , port_id_(port_id)
   {}
   
   void fire() const override {
-    func_(pkt_, port_id_);
+    const_cast<SimPort<Pkt>*>(port_)->write(pkt_);
   }
 
 private:  
-  Func     func_;
-  Pkt      pkt_;  
-  uint32_t port_id_;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-class SimPortBase {
-public:
-  typedef std::shared_ptr<SimPortBase> Ptr;  
-
-  virtual ~SimPortBase() {}
-  
-  SimObjectBase* module() const {
-    return module_;
-  }
-  
-  uint32_t port_id() const {
-    return port_id_;
-  }
-
-  SimPortBase* peer() const {
-    return peer_;
-  }
-
-  bool connected() const {
-    return (peer_ != nullptr);
-  }
-
-  bool is_slave() const {
-    return is_slave_;
-  }
-
-protected:
-
-  SimPortBase(SimObjectBase* module, bool is_slave);
-
-  void connect(SimPortBase* peer) {
-    assert(peer_ == nullptr);
-    peer_ = peer;
-  }
-
-  void disconnect() { 
-    assert(peer_ == nullptr);  
-    peer_ = nullptr;
-  }
-
-  SimObjectBase* module_;
-  uint32_t       port_id_;
-  bool           is_slave_;
-  SimPortBase*   peer_;
-
-  template <typename Pkt> friend class MasterPort;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename Pkt>
-class SlavePort : public SimPortBase {
-public:
-  typedef std::shared_ptr<SlavePort<Ptr>> Ptr;
-  typedef std::function<void (const Pkt&, uint32_t)> Func;
-
-  static Ptr Create(SimObjectBase* module, const Func& func) {
-    return std::make_shared<SlavePort<Pkt>>(module, func);
-  }
-
-  template <typename T>
-  static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) {
-    return std::make_shared<SlavePort<Pkt>>(module, obj, entry);
-  } 
-
-  SlavePort(SimObjectBase* module, const Func& func)
-    : SimPortBase(module, true)
-    , func_(func)
-  {}
-
-  template <typename T>
-  SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t))
-    : SimPortBase(module, true)
-    , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2))
-  {}
-
-  SlavePort(SimObjectBase* module, SlavePort* peer) 
-    : SimPortBase(module, false) 
-  {
-    this->connect(peer);
-  }
-
-  void send(const Pkt& pkt, uint64_t delay) const;
-
-  const Func& func() const {
-    return func_;
-  }
-
-protected:
-  SlavePort& operator=(const SlavePort&);
-  Func func_;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename Pkt>
-class MasterPort : public SimPortBase {
-public:
-  typedef std::shared_ptr<MasterPort<Ptr>> Ptr;
-  typedef std::function<void (const Pkt&, uint32_t)> Func;
-
-  static Ptr Create() {
-    return std::make_shared<MasterPort<Ptr>>(module);
-  }  
-
-  MasterPort(SimObjectBase* module) : SimPortBase(module, false) {}
-
-  MasterPort(SimObjectBase* module, MasterPort* peer) 
-    : SimPortBase(module, false) 
-  {
-    peer->connect(this);
-  }
-
-  void bind(SlavePort<Pkt>* peer) {
-    this->connect(peer);
-  }
-
-  void unbind() {    
-    peer_->disconnect();
-    this->disconnect();
-  }
-
-  void send(const Pkt& pkt, uint64_t delay) const {
-    assert(peer_ != nullptr);
-    if (peer_->is_slave()) {
-      auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
-      slave->send(pkt, delay);
-    } else {
-      auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
-      master->send(pkt, delay);
-    }  
-  }
-
-private:
-  MasterPort& operator=(const MasterPort&);
+  const SimPort<Pkt>* port_; 
+  Pkt pkt_;
 };
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -237,25 +215,18 @@ public:
   template <typename T, typename Pkt>
   void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);
 
-  virtual void step(uint64_t cycle) = 0;
-
   const std::string& name() const {
     return name_;
   }
 
 protected:
 
-  SimObjectBase(const SimContext& ctx, const char* name);
+  virtual void step(uint64_t cycle) = 0;
 
-  uint32_t allocate_port(SimPortBase* port) {
-      uint32_t id = ports_.size();
-      ports_.push_back(port);
-      return id;
-  }
+  SimObjectBase(const SimContext& ctx, const char* name);
 
 private:
   std::string name_;
-  std::vector<SimPortBase*> ports_;
 
   friend class SimPlatform;
   friend class SimPortBase;
@@ -320,20 +291,19 @@ public:
   }
 
   template <typename Pkt>
-  void schedule(const typename SimSimpleEvent<Pkt>::Func& callback, 
+  void schedule(const typename SimCallEvent<Pkt>::Func& callback, 
                 const Pkt& pkt, 
                 uint64_t delay) {    
-    auto evt = SimSimpleEvent<Pkt>::Create(callback, pkt, delay);
+    auto evt = SimCallEvent<Pkt>::Create(callback, pkt, delay);
     assert(delay != 0);
     events_.emplace_back(evt);
   }
 
   template <typename Pkt>
-  void schedule(const typename SimPortEvent<Pkt>::Func& callback, 
+  void schedule(const SimPort<Pkt>* port, 
                 const Pkt& pkt, 
-                uint32_t port_id, 
                 uint64_t delay) {
-    auto evt = SimPortEvent<Pkt>::Create(callback, pkt, port_id, delay);
+    auto evt = SimPortEvent<Pkt>::Create(port, pkt, delay);
     assert(delay != 0);
     events_.emplace_back(evt);
   }
@@ -383,13 +353,6 @@ private:
 
 ///////////////////////////////////////////////////////////////////////////////
 
-inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave) 
-  : module_(module)  
-  , port_id_(module->allocate_port(this))
-  , is_slave_(is_slave)
-  , peer_(nullptr) 
-{}
-
 inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) 
   : name_(name) 
 {}
@@ -403,18 +366,11 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
 }
 
 template <typename Pkt>
-void SlavePort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
-  if (func_) {
-    SimPlatform::instance().schedule(func_, pkt, port_id_, delay);
+void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {  // walks the peer chain; the last (unlinked) port receives the packet
+  if (peer_) {
+    static_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);  // bind() only links ports of the same Pkt, so this is a plain downcast
   } else {
-    assert(peer_ != nullptr);
-    if (peer_->is_slave()) {
-      auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
-      slave->send(pkt, delay);
-    } else {
-      auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
-      master->send(pkt, delay);
-    }
+    SimPlatform::instance().schedule(this, pkt, delay);  // queues a SimPortEvent whose fire() write()s pkt into this port
   }  
 }
 
diff --git a/sim/simX/Makefile b/sim/simX/Makefile
index e42464c6..75a4a495 100644
--- a/sim/simX/Makefile
+++ b/sim/simX/Makefile
@@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
 TOP = vx_cache_sim
 
 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp 
-SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp
+SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp
 
 OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
 VPATH := $(sort $(dir $(SRCS)))
diff --git a/sim/simX/cache.cpp b/sim/simX/cache.cpp
index f139cb43..503d32c5 100644
--- a/sim/simX/cache.cpp
+++ b/sim/simX/cache.cpp
@@ -1,5 +1,6 @@
 #include "cache.h"
 #include "debug.h"
+#include "types.h"
 #include <util.h>
 #include <unordered_map>
 #include <vector>
@@ -30,8 +31,7 @@ struct params_t {
         uint32_t offset_bits = config.B - config.W;
         uint32_t log2_bank_size  = config.C - bank_bits;
         uint32_t index_bits  = log2_bank_size - (config.B << config.A);        
-        assert(log2_bank_size >= config.B);
-        
+        assert(log2_bank_size >= config.B);        
         
         this->words_per_block = 1 << offset_bits;
         this->blocks_per_set  = 1 << config.A;
@@ -229,9 +229,10 @@ private:
     CacheConfig config_;
     params_t params_;
     std::vector<bank_t> banks_;
-    std::vector<std::pair<bool, MemReq>> core_reqs_;
-    std::pair<bool, MemRsp> mem_rsp_;
     std::vector<std::queue<uint32_t>> core_rsps_;
+    Switch<MemReq, MemRsp>::Ptr mem_switch_;
+    std::vector<MasterPort<MemReq>> mem_req_ports_;
+    std::vector<SlavePort<MemRsp>>  mem_rsp_ports_;
 
 public:
     Impl(Cache* simobject, const CacheConfig& config) 
@@ -239,16 +240,22 @@ public:
         , config_(config)
         , params_(config)
         , banks_(config.num_banks, {config, params_})
-        , core_reqs_(config.num_inputs)
         , core_rsps_(config.num_inputs)
-    {}    
-
-    void handleMemResponse(const MemRsp& response, uint32_t) {        
-        mem_rsp_ = {true, response};
-    }
-
-    void handleCoreRequest(const MemReq& request, uint32_t port_id) {
-        core_reqs_.at(port_id) = {true, request};
+        , mem_req_ports_(config.num_banks, simobject)
+        , mem_rsp_ports_(config.num_banks, simobject)
+    {
+        if (config.num_banks > 1) {
+            mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
+            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
+                mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
+                mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
+            }    
+            mem_switch_->ReqOut.bind(&simobject->MemReqPort);
+            simobject->MemRspPort.bind(&mem_switch_->RspIn);
+        } else {
+            mem_req_ports_.at(0).bind(&simobject->MemReqPort);
+            simobject->MemRspPort.bind(&mem_rsp_ports_.at(0));
+        }
     }
 
     void step(uint64_t /*cycle*/) {
@@ -269,31 +276,29 @@ public:
                 bank.mshr.try_pop(&active_req);
             }
 
-            // try schedule stall replay
+            // try schedule stall queue if MSHR has space
             if (!active_req.valid 
-             && !bank.stall_buffer.empty()) {            
+             && !bank.stall_buffer.empty()
+             && !bank.mshr.full()) {            
                 active_req = bank.stall_buffer.front();
                 bank.stall_buffer.pop();
             }
         }
 
         // handle memory fills
-        if (mem_rsp_.first) {
-            mem_rsp_.first = false;
-            auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15);
-            auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31);
-            this->processMemoryFill(bank_id, mshr_id);        
+        for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) {
+            MemRsp mem_rsp;
+            if (mem_rsp_ports_.at(i).read(&mem_rsp)) {
+                this->processMemoryFill(i, mem_rsp.tag);
+            }
         }
         
         // handle incoming core requests
-        for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) {
-            auto& entry = core_reqs_.at(i);
-            if (!entry.first)
+        for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) {
+            MemReq core_req;
+            if (!simobject_->CoreReqPorts.at(i).read(&core_req))
                 continue;
-                
-            entry.first = false;
 
-            auto& core_req = entry.second;
             auto bank_id   = params_.addr_bank_id(core_req.addr);
             auto set_id    = params_.addr_set_id(core_req.addr);
             auto tag       = params_.addr_tag(core_req.addr);
@@ -417,7 +422,7 @@ public:
                         mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
                         mem_req.write = true;
                         mem_req.tag   = 0;
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                     } else {
                         // mark block as dirty
                         hit_block.dirty = true;
@@ -438,7 +443,8 @@ public:
                         MemReq mem_req;
                         mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
                         mem_req.write = true;
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req.tag   = 0;
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                     }
                 }
 
@@ -449,7 +455,7 @@ public:
                         mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
                         mem_req.write = true;
                         mem_req.tag   = 0;
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                     }
                     // send core response
                     for (auto& info : active_req.infos) {
@@ -467,9 +473,8 @@ public:
                         MemReq mem_req;
                         mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
                         mem_req.write = active_req.write;
-                        mem_req.tag = bit_setw(0,            0, 15, bank_id);
-                        mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id);
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req.tag   = mshr_id;
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                     }
                 }
             }
@@ -480,12 +485,12 @@ public:
 ///////////////////////////////////////////////////////////////////////////////
 
 Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) 
-    : SimObject<Cache>(ctx, name)
-    , impl_(new Impl(this, config))
-    , CoreReqPorts(config.num_inputs, {this, impl_, &Cache::Impl::handleCoreRequest})
+    : SimObject<Cache>(ctx, name)    
+    , CoreReqPorts(config.num_inputs, this)
     , CoreRspPorts(config.num_inputs, this)
     , MemReqPort(this)
-    , MemRspPort(this, impl_, &Impl::handleMemResponse)
+    , MemRspPort(this)
+    , impl_(new Impl(this, config))
 {}
 
 Cache::~Cache() {
diff --git a/sim/simX/cache.h b/sim/simX/cache.h
index 1c0c82f6..58767d9f 100644
--- a/sim/simX/cache.h
+++ b/sim/simX/cache.h
@@ -20,11 +20,7 @@ struct CacheConfig {
     uint8_t latency;        // pipeline latency 
 };
 
-class Cache : public SimObject<Cache> {
-private:
-    class Impl;
-    Impl* impl_;
-    
+class Cache : public SimObject<Cache> {  
 public:
     Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
     ~Cache();
@@ -35,6 +31,10 @@ public:
     std::vector<MasterPort<MemRsp>> CoreRspPorts;
     MasterPort<MemReq>              MemReqPort;
     SlavePort<MemRsp>               MemRspPort;
+    
+private:
+    class Impl;
+    Impl* impl_;
 };
 
 }
\ No newline at end of file
diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp
index af0a4441..e1333dac 100644
--- a/sim/simX/core.cpp
+++ b/sim/simX/core.cpp
@@ -12,13 +12,13 @@
 
 using namespace vortex;
 
-Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
+Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
     : SimObject(ctx, "Core")
     , id_(id)
     , arch_(arch)
-    , decoder_(decoder)
-    , mem_(mem)
-    , shared_mem_(1, SMEM_SIZE)
+    , decoder_(arch)
+    , mmu_(0, arch.wsize(), true)
+    , shared_mem_(4096)
     , warps_(arch.num_warps())
     , barriers_(arch.num_barriers(), 0)
     , csrs_(arch.num_csrs(), 0)
@@ -54,9 +54,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
         DCACHE_MSHR_SIZE,       // mshr
         2,                      // pipeline latency
       }))
-    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2)) 
-    , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse)
-    , dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast<LsuUnit*>(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse})
+    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
     , fetch_stage_("fetch")
     , decode_stage_("decode")
     , issue_stage_("issue")
@@ -65,36 +63,34 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
     , pending_icache_(arch_.num_warps())
     , stalled_warps_(0)
     , last_schedule_wid_(0)
-    , pending_instrs_(0)
+    , issued_instrs_(0)
+    , committed_instrs_(0)
     , ebreak_(false)   
     , stats_insts_(0)
     , stats_loads_(0)
     , stats_stores_(0)
-    , MemRspPort(this, &l1_mem_switch_->RspIn)
-    , MemReqPort(this, &l1_mem_switch_->ReqOut)    
+    , MemRspPort(this)
+    , MemReqPort(this)    
 {  
   for (int i = 0; i < arch_.num_warps(); ++i) {
     warps_.at(i) = std::make_shared<Warp>(this, i);
   }
 
   // register execute units
+  exe_units_.at((int)ExeType::NOP) = std::make_shared<NopUnit>(this);
   exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
   exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
   exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
   exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);  
   exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
 
-  // connect l1 caches
-  icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_);
-  for (int i = 0; i < arch_.num_threads(); ++i) {
-    dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i));
-  }
-
   // connect l1 switch
   icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
   dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]);
   l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort);  
   l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort);
+  this->MemRspPort.bind(&l1_mem_switch_->RspIn);
+  l1_mem_switch_->ReqOut.bind(&this->MemReqPort);
 
   // activate warp0
   warps_.at(0)->setTmask(0, true);
@@ -109,31 +105,24 @@ Core::~Core() {
   }
 }
 
-void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) {
-  // advance to decode stage
-  uint32_t wid = response.tag;
-  pipeline_state_t state;
-  pending_icache_.remove(wid, &state);
-  auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
-  state.icache_latency = latency;
-  decode_stage_.push(state);
+void Core::attach_ram(RAM* ram) {
+  // bind RAM to memory unit
+  mmu_.attach(*ram, 0, 0xFFFFFFFF);    
 }
 
 void Core::step(uint64_t cycle) {
-    __unused (cycle);
-  D(2, "###########################################################");
-  D(2, std::dec << "Core" << id_ << ": cycle: " << cycle);
-
-  this->commit();
-  this->execute();
-  this->issue();
-  this->decode();
-  this->fetch();
+  this->commit(cycle);
+  this->execute(cycle);
+  this->issue(cycle);
+  this->decode(cycle);
+  this->fetch(cycle);
 
   DPN(2, std::flush);
 }
 
-void Core::warp_scheduler() {
+void Core::warp_scheduler(uint64_t cycle) {
+  __unused (cycle);
+
   bool foundSchedule = false;
   int scheduled_warp = last_schedule_wid_;
 
@@ -159,53 +148,77 @@ void Core::warp_scheduler() {
   stats_insts_ += warp->getActiveThreads();
   
   pipeline_state_t state;
+  state.clear();
+  state.id = (issued_instrs_++ * arch_.num_cores()) + id_;
+
   warp->eval(&state);
 
-  D(4, state);  
+  DT(3, cycle, "pipeline-schedule: " << state);
 
-  // advance to fetch stage
-  ++pending_instrs_;
+  // advance to fetch stage  
   fetch_stage_.push(state);
 }
 
-void Core::fetch() {
-  // schedule icache request
-  pipeline_state_t state;
-  if (fetch_stage_.try_pop(&state)) {
-    state.icache_latency = SimPlatform::instance().cycles();
-    MemReq mem_req;
-    mem_req.addr  = state.PC;
-    mem_req.write = false;
-    mem_req.tag   = pending_icache_.allocate(state);    
-    icache_->CoreReqPorts.at(0).send(mem_req, 1);
+void Core::fetch(uint64_t cycle) {
+  // handle icache response
+  {
+    MemRsp mem_rsp;
+    if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){
+      pipeline_state_t state;
+      pending_icache_.remove(mem_rsp.tag, &state);
+      auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
+      state.icache_latency = latency;
+      decode_stage_.push(state);
+      DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state);
+    }
+  }
+
+  // send icache request
+  {
+    pipeline_state_t state;
+    if (fetch_stage_.try_pop(&state)) {
+      state.icache_latency = SimPlatform::instance().cycles();
+      MemReq mem_req;
+      mem_req.addr  = state.PC;
+      mem_req.write = false;
+      mem_req.tag   = pending_icache_.allocate(state);    
+      icache_->CoreReqPorts.at(0).send(mem_req, 1);
+      DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state);
+    }
   }  
 
   // schedule next warp
-  this->warp_scheduler();  
+  this->warp_scheduler(cycle);  
 }
 
-void Core::decode() {
+void Core::decode(uint64_t cycle) {
+  __unused (cycle);
+
   pipeline_state_t state;
   if (!decode_stage_.try_pop(&state))
     return;    
   
-  if (state.stall_warp) {
-    D(3, "*** warp#" << state.wid << " fetch stalled");
-  } else {
-    // release warp
+  // release warp
+  if (!state.stall_warp) {
     stalled_warps_.reset(state.wid);
   }
+
+  DT(3, cycle, "pipeline-decode: " << state);
   
   // advance to issue stage
   issue_stage_.push(state);
 }
 
-void Core::issue() {
+void Core::issue(uint64_t cycle) {
+  __unused (cycle);
+
   if (!issue_stage_.empty()) {
     // insert to ibuffer 
     auto& state = issue_stage_.top();
     auto& ibuffer = ibuffers_.at(state.wid);
-    if (!ibuffer.full()) {
+    if (ibuffer.full()) {
+      DT(3, cycle, "*** ibuffer-stall: " << state);
+    } else {
       ibuffer.push(state);
       issue_stage_.pop();
     }
@@ -219,8 +232,18 @@ void Core::issue() {
     auto& state = ibuffer.top();
 
     // check scoreboard
-    if (scoreboard_.in_use(state))
+    if (scoreboard_.in_use(state)) {
+      DTH(3, cycle, "*** scoreboard-stall: dependents={");
+      auto owners = scoreboard_.owners(state);
+      for (uint32_t i = 0, n = owners.size(); i < n; ++i) {
+        if (i) DTN(3, ", ");
+        DTN(3, "#" << owners.at(i));  
+      }
+      DTN(3, "}, " << state << std::endl);
       continue;
+    }
+
+    DT(3, cycle, "pipeline-issue: " << state);
 
     // update scoreboard
     scoreboard_.reserve(state);
@@ -233,18 +256,19 @@ void Core::issue() {
   }
 }
 
-void Core::execute() {
+void Core::execute(uint64_t cycle) {
   // process stage inputs
   if (!execute_stage_.empty()) {
     auto& state = execute_stage_.top();
     auto& exe_unit = exe_units_.at((int)state.exe_type);
     exe_unit->push_input(state);
     execute_stage_.pop();
+    DT(3, cycle, "pipeline-execute: " << state);
   }
 
   // advance execute units
   for (auto& exe_unit : exe_units_) {
-    exe_unit->step();
+    exe_unit->step(cycle);
   }  
   
   // commit completed instructions
@@ -255,18 +279,29 @@ void Core::execute() {
         stalled_warps_.reset(state.wid);
       }
       // advance to commit stage
-      commit_stage_.push(state);      
+      commit_stage_.push(state);   
     }
   }
 }
 
-void Core::commit() {
+void Core::commit(uint64_t cycle) {  // retires at most one instruction from the commit stage per call
+  __unused (cycle);  // cycle is otherwise only referenced inside the DT() trace macro
+  
   pipeline_state_t state;
   if (!commit_stage_.try_pop(&state))
     return;
 
+  DT(3, cycle, "pipeline-commit: " << state);
+
   // update scoreboard
   scoreboard_.release(state);
+  // a commit must match an instruction that was issued and is still outstanding
+  assert(committed_instrs_ < issued_instrs_);
+  ++committed_instrs_;
+}
+
+bool Core::running() const {
+  return (committed_instrs_ != issued_instrs_);
 }
 
 Word Core::get_csr(Addr addr, int tid, int wid) {
@@ -349,9 +384,9 @@ void Core::barrier(int bar_id, int count, int warp_id) {
   barrier.reset();
 }
 
-Word Core::icache_fetch(Addr addr) {
+Word Core::icache_read(Addr addr, Size size) {
   Word data;
-  mem_.read(&data, addr, sizeof(Word), 0);
+  mmu_.read(&data, addr, size, 0);
   return data;
 }
 
@@ -365,7 +400,7 @@ Word Core::dcache_read(Addr addr, Size size) {
      return data;
   }
 #endif
-  mem_.read(&data, addr, size, 0);
+  mmu_.read(&data, addr, size, 0);
   return data;
 }
 
@@ -383,11 +418,7 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
      this->writeToStdOut(addr, data);
      return;
   }
-  mem_.write(&data, addr, size, 0);
-}
-
-bool Core::running() const {
-  return pending_instrs_;
+  mmu_.write(&data, addr, size, 0);
 }
 
 void Core::printStats() const {
@@ -399,7 +430,7 @@ void Core::printStats() const {
 
 void Core::writeToStdOut(Addr addr, Word data) {
   uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
-  auto& ss_buf = print_bufs_.at(tid);
+  auto& ss_buf = print_bufs_[tid];
   char c = (char)data;
   ss_buf << c;
   if (c == '\n') {
diff --git a/sim/simX/core.h b/sim/simX/core.h
index 913db4a6..ea1a6582 100644
--- a/sim/simX/core.h
+++ b/sim/simX/core.h
@@ -25,9 +25,11 @@ namespace vortex {
 
 class Core : public SimObject<Core> {
 public:
-  Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
+  Core(const SimContext& ctx, const ArchDef &arch, Word id);
   ~Core();
 
+  void attach_ram(RAM* ram);
+
   bool running() const;
 
   void step(uint64_t cycle);
@@ -64,7 +66,7 @@ public:
 
   void barrier(int bar_id, int count, int warp_id);
 
-  Word icache_fetch(Addr);
+  Word icache_read(Addr, Size);
 
   Word dcache_read(Addr, Size);
 
@@ -76,22 +78,21 @@ public:
 
 private:
 
-  void fetch();
-  void decode();
-  void issue();
-  void execute();
-  void commit();
+  void fetch(uint64_t cycle);
+  void decode(uint64_t cycle);
+  void issue(uint64_t cycle);
+  void execute(uint64_t cycle);
+  void commit(uint64_t cycle);
 
-  void warp_scheduler();
-
-  void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id);
+  void warp_scheduler(uint64_t cycle);
 
   void writeToStdOut(Addr addr, Word data);
 
   Word id_;
-  const ArchDef& arch_;
-  const Decoder& decoder_;
-  MemoryUnit& mem_;
+  const ArchDef arch_;
+  const Decoder decoder_;
+  MemoryUnit mmu_;
+
 #ifdef SM_ENABLE
   RAM shared_mem_;
 #endif 
@@ -106,8 +107,6 @@ private:
   Cache::Ptr icache_;
   Cache::Ptr dcache_;
   Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
-  SlavePort<MemRsp> icache_rsp_port_;
-  std::vector<SlavePort<MemRsp>> dcache_rsp_port_;
 
   PipelineStage fetch_stage_;
   PipelineStage decode_stage_;
@@ -118,10 +117,12 @@ private:
   HashTable<pipeline_state_t> pending_icache_;
   WarpMask stalled_warps_;  
   uint32_t last_schedule_wid_;
-  uint32_t pending_instrs_;
+  uint32_t issued_instrs_;
+  uint32_t committed_instrs_;
   bool ebreak_;
 
   std::unordered_map<int, std::stringstream> print_bufs_;
+  
   uint64_t stats_insts_;
   uint64_t stats_loads_;
   uint64_t stats_stores_;
diff --git a/sim/simX/debug.h b/sim/simX/debug.h
index ad7fd16f..53d2d62a 100644
--- a/sim/simX/debug.h
+++ b/sim/simX/debug.h
@@ -7,14 +7,15 @@
 #define DEBUG_HEADER << "DEBUG "
 //#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "
 
+#define TRACE_HEADER << "TRACE "
+//#define TRACE_HEADER << "TRACE " << __FILE__ << ':' << std::dec << __LINE__ << ": "
+
 #ifndef NDEBUG
 
 #include <iostream>
 #include <iomanip>
 
-#define DX(x) x
-
-#define D(lvl, x) do { \
+#define DP(lvl, x) do { \
   if ((lvl) <= DEBUG_LEVEL) { \
     std::cout DEBUG_HEADER << x << std::endl; \
   } \
@@ -32,12 +33,33 @@
   } \
 } while(0)
 
+#define DT(lvl, t, x) do { /* trace line: TRACE_HEADER, t as decimal in a 10-wide field, ": ", x, then std::endl */ \
+  if ((lvl) <= DEBUG_LEVEL) { /* x is a '<<' chain and cannot be parenthesized; t is a single value and must be */ \
+    std::cout TRACE_HEADER << std::setw(10) << std::dec << (t) << std::setw(0) << ": " << x << std::endl; \
+  } \
+} while(0)
+
+#define DTH(lvl, t, x) do { /* like DT but without the trailing std::endl, so further output can follow on the same line */ \
+  if ((lvl) <= DEBUG_LEVEL) { /* x is a '<<' chain and cannot be parenthesized; t is a single value and must be */ \
+    std::cout TRACE_HEADER << std::setw(10) << std::dec << (t) << std::setw(0) << ": " << x; \
+  } \
+} while(0)
+
+#define DTN(lvl, x) do { /* streams x to std::cout as-is: no header, no timestamp, no newline */ \
+  if ((lvl) <= DEBUG_LEVEL) { \
+    std::cout << x; \
+  } \
+} while(0)
+
+
 #else
 
-#define DX(x)
-#define D(lvl, x) do {} while(0)
+#define DP(lvl, x) do {} while(0)
 #define DPH(lvl, x) do {} while(0)
 #define DPN(lvl, x) do {} while(0)
-#define D_RAW(x) do {} while(0)
+
+#define DT(lvl, t, x) do {} while(0)  /* NDEBUG build: trace macros expand to an empty statement */
+#define DTH(lvl, t, x) do {} while(0) /* the arguments are dropped and never evaluated */
+#define DTN(lvl, x) do {} while(0)
 
 #endif
\ No newline at end of file
diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp
index 3c76231f..6530d223 100644
--- a/sim/simX/decode.cpp
+++ b/sim/simX/decode.cpp
@@ -194,47 +194,26 @@ static const char* op_string(const Instr &instr) {
 namespace vortex {
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {
   os << op_string(instr) << ": ";
-  auto opcode = instr.getOpcode();
-    
-  auto rd_to_string = [&]() {
-    int rdt = instr.getRDType();
-    int rd = instr.getRDest();
-    switch (rdt) {
-    case 1: os << "r" << std::dec << rd << " <- "; break;
-    case 2: os << "fr" << std::dec << rd << " <- "; break;
-    case 3: os << "vr" << std::dec << rd << " <- "; break;
-    default: break;
-    }
-  };
-
-  auto rs_to_string = [&](int i) {
-    int rst = instr.getRSType(i);
-    int rs = instr.getRSrc(i);    
-    switch (rst) {
-    case 1: os << "r" << std::dec << rs; break;
-    case 2: os << "fr" << std::dec << rs; break;
-    case 3: os << "vr" << std::dec << rs; break;
-    default: break;
-    }
-  };
-
+  auto opcode = instr.getOpcode();    
   if (opcode == S_INST 
    || opcode == FS
    || opcode == VS) {     
      os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
-     rs_to_string(1);
+     os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
   } else 
   if (opcode == L_INST 
    || opcode == FL
    || opcode == VL) {     
-     rd_to_string();
+     os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
      os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
   } else {
-    rd_to_string();
+    if (instr.getRDType() != RegType::None) {
+      os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
+    }
     int i = 0;
     for (; i < instr.getNRSrc(); ++i) {    
       if (i) os << ", ";
-      rs_to_string(i);
+      os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
     }    
     if (instr.hasImm()) {
       if (i) os << ", ";
@@ -281,7 +260,7 @@ Decoder::Decoder(const ArchDef &arch) {
   v_imm_mask_  = 0x7ff;  
 }
 
-std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {  
+std::shared_ptr<Instr> Decoder::decode(Word code) const {  
   auto instr = std::make_shared<Instr>();
   Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
   instr->setOpcode(op);
@@ -297,8 +276,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
 
   auto op_it = sc_instTable.find(op);
   if (op_it == sc_instTable.end()) {
-    std::cout << std::hex << "invalid opcode: 0x" << op << ", instruction=0x" << code << ", PC=" << PC << std::endl;
-    std::abort();
+    std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl;
+    return nullptr;
   }
 
   auto iType = op_it->second.iType;
@@ -459,7 +438,5 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
     std::abort();
   }
 
-  D(2, "Instr 0x" << std::hex << code << ": " << *instr << std::flush);
-
   return instr;
 }
diff --git a/sim/simX/decode.h b/sim/simX/decode.h
index d4f9f976..e481cb28 100644
--- a/sim/simX/decode.h
+++ b/sim/simX/decode.h
@@ -13,7 +13,7 @@ class Decoder {
 public:
   Decoder(const ArchDef &);    
   
-  std::shared_ptr<Instr> decode(Word code, Word PC) const;
+  std::shared_ptr<Instr> decode(Word code) const;
 
 private:
 
diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp
index 602f7f3a..ff705d82 100644
--- a/sim/simX/execute.cpp
+++ b/sim/simX/execute.cpp
@@ -75,11 +75,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
   if (num_rsrcs) {              
     for (int i = 0; i < num_rsrcs; ++i) {    
       DPH(2, "Src Reg [" << std::dec << i << "]: ");
-      int type = instr.getRSType(i);
+      auto type = instr.getRSType(i);
       int reg = instr.getRSrc(i);        
       switch (type) {
-      case 1: 
-        DPH(2, "r" << std::dec << reg << "={");
+      case RegType::Integer: 
+        DPN(2, "r" << std::dec << reg << "={");
         for (int t = 0; t < num_threads; ++t) {
           if (t) DPN(2, ", ");
           if (!tmask_.test(t)) {
@@ -91,8 +91,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
         }
         DPN(2, "}" << std::endl);
         break;
-      case 2: 
-        DPH(2, "fr" << std::dec << reg << "={");
+      case RegType::Float: 
+        DPN(2, "fr" << std::dec << reg << "={");
         for (int t = 0; t < num_threads; ++t) {
           if (t) DPN(2, ", ");
           if (!tmask_.test(t)) {
@@ -105,6 +105,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
         DPN(2, "}" << std::endl);
         break;
       default: 
+        std::abort();
         break;
       }      
     }
@@ -415,7 +416,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
     break;
   case L_INST:
     pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.load = 0;    
+    pipeline_state->lsu.type = LsuType::LOAD;
     pipeline_state->used_iregs[rsrc0] = 1;
     pipeline_state->mem_addrs.resize(num_threads);
     for (int t = 0; t < num_threads; ++t) {
@@ -425,7 +426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
       Word shift_by  = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
       Word data_read = core_->dcache_read(memAddr, 4);
       pipeline_state->mem_addrs.at(t) = memAddr;
-      D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
+      DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
       switch (func3) {
       case 0:
         // LBI
@@ -455,7 +456,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
     break;
   case S_INST:     
     pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.store = 1;    
+    pipeline_state->lsu.type = LsuType::STORE;
     pipeline_state->used_iregs[rsrc0] = 1;
     pipeline_state->used_iregs[rsrc1] = 1;
     pipeline_state->mem_addrs.resize(num_threads);
@@ -464,7 +465,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
         continue;
       Word memAddr = rsdata[t][0] + immsrc;
       pipeline_state->mem_addrs.at(t) = memAddr;
-      D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+      DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
       switch (func3) {
       case 0:
         // SB
@@ -543,12 +544,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
     break;
   case FENCE:
     pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.fence = 1;
+    pipeline_state->lsu.type = LsuType::FENCE;
     pipeline_state->stall_warp = true;
     break;
   case (FL | VL):
     pipeline_state->exe_type = ExeType::LSU;       
-    pipeline_state->lsu.load = 1;
+    pipeline_state->lsu.type = LsuType::LOAD;
     pipeline_state->used_iregs[rsrc0] = 1;    
     if (func3 == 0x2) {
       pipeline_state->mem_addrs.resize(num_threads);
@@ -558,14 +559,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
         Word memAddr = rsdata[t][0] + immsrc;
         pipeline_state->mem_addrs.at(t) = memAddr;
         Word data_read = core_->dcache_read(memAddr, 4);        
-        D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
+        DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
         rddata[t] = data_read;
       }
     } else {  
-      D(3, "Executing vector load");      
-      D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
-      D(3, "dest: v" << rdest);
-      D(3, "width" << instr.getVlsWidth());
+      DP(3, "Executing vector load");      
+      DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
+      DP(3, "dest: v" << rdest);
+      DP(3, "width" << instr.getVlsWidth());
       pipeline_state->mem_addrs.resize(vl_);
       auto &vd = vRegFile_.at(rdest);
       switch (instr.getVlsWidth()) {
@@ -574,9 +575,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
         for (int i = 0; i < vl_; i++) {
           Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
           pipeline_state->mem_addrs.at(i) = memAddr;
-          D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+          DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
           Word data_read = core_->dcache_read(memAddr, 4);
-          D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
+          DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
           int *result_ptr = (int *)(vd.data() + i);
           *result_ptr = data_read;            
         }
@@ -590,7 +591,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
     break;
   case (FS | VS):
     pipeline_state->exe_type = ExeType::LSU;       
-    pipeline_state->lsu.store = 1;
+    pipeline_state->lsu.type = LsuType::STORE;
     pipeline_state->used_iregs[rsrc0] = 1;
     pipeline_state->used_iregs[rsrc1] = 1;    
     if (func3 == 0x2) {
@@ -601,20 +602,20 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
         Word memAddr = rsdata[t][0] + immsrc;
         pipeline_state->mem_addrs.at(t) = memAddr;
         core_->dcache_write(memAddr, rsdata[t][1], 4);
-        D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+        DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
       }
     } else {      
       pipeline_state->mem_addrs.resize(vl_);
       for (int i = 0; i < vl_; i++) {
         Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
         pipeline_state->mem_addrs.at(i) = memAddr;
-        D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+        DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
         switch (instr.getVlsWidth()) {
         case 6: {
           //store word and unit strided (not checking for unit stride)          
           uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
           core_->dcache_write(memAddr, value, 4);
-          D(3, "store: " << memAddr << " value:" << value);
+          DP(3, "store: " << memAddr << " value:" << value);
         } break;
         default:
           std::abort();
@@ -705,9 +706,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
         } else {          
           // FMV.X.W
           rddata[t] = rsdata[t][0];
-          pipeline_state->fpu.type = FpuType::FNCP;
-          pipeline_state->used_fregs[rsrc0] = 1;
-        } 
+        }        
+        pipeline_state->fpu.type = FpuType::FNCP;
+        pipeline_state->used_fregs[rsrc0] = 1;
         break;
       case 0x50:             
         switch(func3) {              
@@ -783,132 +784,138 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
     }
     rd_write = true;
     break;
-  case GPGPU:
-    pipeline_state->exe_type = ExeType::GPU; 
+  case GPGPU: {
+    pipeline_state->exe_type = ExeType::GPU;
+    int ts = 0;
     for (int t = 0; t < num_threads; ++t) {
-      if (!tmask_.test(t))
-        continue;
-      switch (func3) {
-      case 0: {
-        // TMC        
-        pipeline_state->gpu.type = GpuType::TMC;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->stall_warp = true;
-        if (rsrc1) {
-          // predicate mode
-          ThreadMask pred;
-          for (int i = 0; i < num_threads; ++i) {
-            pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
-          }
-          if (pred.any()) {
-            tmask_ &= pred;
-          }
-        } else {
-          tmask_.reset();
-          for (int i = 0; i < num_threads; ++i) {
-            tmask_.set(i, rsdata.at(t)[0] & (1 << i));
-          }
-        }
-        D(3, "*** TMC " << tmask_);
-        active_ = tmask_.any();
-        break; // runOnce
-      } break;
-      case 1: {
-        // WSPAWN
-        pipeline_state->gpu.type = GpuType::WSPAWN;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->used_iregs[rsrc1] = 1;
-        pipeline_state->stall_warp = true;
-        int active_warps = std::min<int>(rsdata.at(t)[0], core_->arch().num_warps());
-        D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]);
-        for (int i = 1; i < active_warps; ++i) {
-          Warp &newWarp = core_->warp(i);
-          newWarp.setPC(rsdata[t][1]);
-          newWarp.setTmask(0, true);
-        }
-        break; // runOnce
-      } break;
-      case 2: {
-        // SPLIT    
-        pipeline_state->gpu.type = GpuType::SPLIT;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->stall_warp = true;
-        if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {          
-          ThreadMask tmask;
-          for (int i = 0; i < num_threads; ++i) {
-            tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
-          }
-
-          DomStackEntry e(tmask, nextPC);
-          domStack_.push(tmask_);
-          domStack_.push(e);
-          for (size_t i = 0; i < e.tmask.size(); ++i) {
-            tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
-          }
-          active_ = tmask_.any();
-
-          DPH(3, "*** Split: New TM=");
-          for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
-          DPN(3, ", Pushed TM=");
-          for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
-          DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
-        } else {
-          D(3, "*** Unanimous pred");
-          DomStackEntry e(tmask_);
-          e.unanimous = true;
-          domStack_.push(e);
-        }        
-        break; // runOnce
-      } break;
-      case 3: {
-        // JOIN
-        pipeline_state->gpu.type = GpuType::JOIN;        
-        pipeline_state->stall_warp = true;        
-        if (!domStack_.empty() && domStack_.top().unanimous) {
-          D(3, "*** Uninimous branch at join");
-          tmask_ = domStack_.top().tmask;
-          active_ = tmask_.any();
-          domStack_.pop();
-        } else {
-          if (!domStack_.top().fallThrough) {
-            nextPC = domStack_.top().PC;
-            D(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
-          }
-
-          tmask_ = domStack_.top().tmask;
-          active_ = tmask_.any();
-
-          DPH(3, "*** Join: New TM=");
-          for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
-          DPN(3, "\n");
-
-          domStack_.pop();
-        }        
-        break; // runOnce
-      } break;
-      case 4: {
-        // BAR
-        pipeline_state->gpu.type = GpuType::BAR;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->used_iregs[rsrc1] = 1;
-        pipeline_state->stall_warp = true; 
-        active_ = false;
-        core_->barrier(rsdata[t][0], rsdata[t][1], id_);        
-        break; // runOnce
-      } break;
-      case 6: {
-        // PREFETCH
-        pipeline_state->exe_type = ExeType::LSU; 
-        pipeline_state->lsu.prefetch = 1; 
-        pipeline_state->used_iregs[rsrc0] = 1;
-        int addr = rsdata[t][0];
-        printf("*** PREFETCHED %d ***\n", addr);
-      } break;
-      default:
-        std::abort();
+      if (tmask_.test(t)) {
+        ts = t;
+        break;
       }
     }
-    break;
+    switch (func3) {
+    case 0: {
+      // TMC        
+      pipeline_state->gpu.type = GpuType::TMC;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->stall_warp = true;
+      if (rsrc1) {
+        // predicate mode
+        ThreadMask pred;
+        for (int i = 0; i < num_threads; ++i) {
+          pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
+        }
+        if (pred.any()) {
+          tmask_ &= pred;
+        }
+      } else {
+        tmask_.reset();
+        for (int i = 0; i < num_threads; ++i) {
+          tmask_.set(i, rsdata.at(ts)[0] & (1 << i));
+        }
+      }
+      DPH(3, "*** New TMC: ");
+      for (int i = 0; i < num_threads; ++i)
+        DPN(3, tmask_.test(num_threads-i-1));
+      DPN(3, std::endl);
+
+      active_ = tmask_.any();
+    } break;
+    case 1: {
+      // WSPAWN
+      pipeline_state->gpu.type = GpuType::WSPAWN;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->used_iregs[rsrc1] = 1;
+      pipeline_state->stall_warp = true;
+      int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
+      DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
+      for (int i = 1; i < active_warps; ++i) {
+        Warp &newWarp = core_->warp(i);
+        newWarp.setPC(rsdata[ts][1]);
+        newWarp.setTmask(0, true);
+      }
+    } break;
+    case 2: {
+      // SPLIT    
+      pipeline_state->gpu.type = GpuType::SPLIT;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->stall_warp = true;
+      if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {          
+        ThreadMask tmask;
+        for (int i = 0; i < num_threads; ++i) {
+          tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
+        }
+
+        DomStackEntry e(tmask, nextPC);
+        domStack_.push(tmask_);
+        domStack_.push(e);
+        for (size_t i = 0; i < e.tmask.size(); ++i) {
+          tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
+        }
+        active_ = tmask_.any();
+
+        DPH(3, "*** Split: New TM=");
+        for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
+        DPN(3, ", Pushed TM=");
+        for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
+        DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
+      } else {
+        DP(3, "*** Unanimous pred");
+        DomStackEntry e(tmask_);
+        e.unanimous = true;
+        domStack_.push(e);
+      }        
+    } break;
+    case 3: {
+      // JOIN
+      pipeline_state->gpu.type = GpuType::JOIN;        
+      pipeline_state->stall_warp = true;        
+      if (!domStack_.empty() && domStack_.top().unanimous) {
+        DP(3, "*** Uninimous branch at join");
+        tmask_ = domStack_.top().tmask;
+        active_ = tmask_.any();
+        domStack_.pop();
+      } else {
+        if (!domStack_.top().fallThrough) {
+          nextPC = domStack_.top().PC;
+          DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
+        }
+
+        tmask_ = domStack_.top().tmask;
+        active_ = tmask_.any();
+
+        DPH(3, "*** Join: New TM=");
+        for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
+        DPN(3, "\n");
+
+        domStack_.pop();
+      }        
+    } break;
+    case 4: {
+      // BAR
+      pipeline_state->gpu.type = GpuType::BAR;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->used_iregs[rsrc1] = 1;
+      pipeline_state->stall_warp = true; 
+      active_ = false;
+      core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); 
+    } break;
+    case 6: {
+      // PREFETCH
+      pipeline_state->exe_type = ExeType::LSU; 
+      pipeline_state->lsu.type = LsuType::PREFETCH; 
+      pipeline_state->used_iregs[rsrc0] = 1;
+      for (int t = 0; t < num_threads; ++t) {
+        if (!tmask_.test(t))
+          continue;
+        int addr = rsdata[t][0];
+        printf("*** PREFETCHED %d ***\n", addr);
+      }
+    } break;
+    default:
+      std::abort();
+    }
+    }  break;
   case VSET: {
     int VLEN = core_->arch().vsize() * 8;
     int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@@ -928,7 +935,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
               uint8_t first  = *(uint8_t *)(vr1.data() + i);
               uint8_t second = *(uint8_t *)(vr2.data() + i);
               uint8_t result = first + second;
-              D(3, "Adding " << first << " + " << second << " = " << result);
+              DP(3, "Adding " << first << " + " << second << " = " << result);
               *(uint8_t *)(vd.data() + i) = result;
             }
           }
@@ -940,7 +947,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
               uint16_t first  = *(uint16_t *)(vr1.data() + i);
               uint16_t second = *(uint16_t *)(vr2.data() + i);
               uint16_t result = first + second;
-              D(3, "Adding " << first << " + " << second << " = " << result);
+              DP(3, "Adding " << first << " + " << second << " = " << result);
               *(uint16_t *)(vd.data() + i) = result;
             }
           }
@@ -952,7 +959,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
               uint32_t first  = *(uint32_t *)(vr1.data() + i);
               uint32_t second = *(uint32_t *)(vr2.data() + i);
               uint32_t result = first + second;
-              D(3, "Adding " << first << " + " << second << " = " << result);
+              DP(3, "Adding " << first << " + " << second << " = " << result);
               *(uint32_t *)(vd.data() + i) = result;
             }
           }
@@ -968,7 +975,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first  = *(uint8_t *)(vr1.data() + i);
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (first == second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -976,7 +983,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first  = *(uint16_t *)(vr1.data() + i);
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (first == second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -984,7 +991,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first  = *(uint32_t *)(vr1.data() + i);
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (first == second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
         }
@@ -999,7 +1006,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first  = *(uint8_t *)(vr1.data() + i);
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (first != second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -1007,7 +1014,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first  = *(uint16_t *)(vr1.data() + i);
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (first != second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -1015,7 +1022,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first  = *(uint32_t *)(vr1.data() + i);
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (first != second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
         }
@@ -1030,7 +1037,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first  = *(uint8_t *)(vr1.data() + i);
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -1038,7 +1045,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first  = *(uint16_t *)(vr1.data() + i);
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -1046,7 +1053,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first  = *(uint32_t *)(vr1.data() + i);
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
         }
@@ -1061,7 +1068,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int8_t first  = *(int8_t *)(vr1.data() + i);
             int8_t second = *(int8_t *)(vr2.data() + i);
             int8_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -1069,7 +1076,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int16_t first  = *(int16_t *)(vr1.data() + i);
             int16_t second = *(int16_t *)(vr2.data() + i);
             int16_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(int16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -1077,7 +1084,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int32_t first  = *(int32_t *)(vr1.data() + i);
             int32_t second = *(int32_t *)(vr2.data() + i);
             int32_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(int32_t *)(vd.data() + i) = result;
           }
         }
@@ -1092,7 +1099,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first  = *(uint8_t *)(vr1.data() + i);
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -1100,7 +1107,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first  = *(uint16_t *)(vr1.data() + i);
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -1108,7 +1115,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first  = *(uint32_t *)(vr1.data() + i);
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
         }
@@ -1123,7 +1130,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int8_t first  = *(int8_t *)(vr1.data() + i);
             int8_t second = *(int8_t *)(vr2.data() + i);
             int8_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -1131,7 +1138,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int16_t first  = *(int16_t *)(vr1.data() + i);
             int16_t second = *(int16_t *)(vr2.data() + i);
             int16_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(int16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -1139,7 +1146,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int32_t first  = *(int32_t *)(vr1.data() + i);
             int32_t second = *(int32_t *)(vr2.data() + i);
             int32_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(int32_t *)(vd.data() + i) = result;
           }
         }
@@ -1154,7 +1161,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first  = *(uint8_t *)(vr1.data() + i);
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -1162,7 +1169,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first  = *(uint16_t *)(vr1.data() + i);
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -1170,7 +1177,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first  = *(uint32_t *)(vr1.data() + i);
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
         }
@@ -1185,7 +1192,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int8_t first  = *(int8_t *)(vr1.data() + i);
             int8_t second = *(int8_t *)(vr2.data() + i);
             int8_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 16) {
@@ -1193,7 +1200,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int16_t first  = *(int16_t *)(vr1.data() + i);
             int16_t second = *(int16_t *)(vr2.data() + i);
             int16_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(int16_t *)(vd.data() + i) = result;
           }
         } else if (vtype_.vsew == 32) {
@@ -1201,7 +1208,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             int32_t first  = *(int32_t *)(vr1.data() + i);
             int32_t second = *(int32_t *)(vr2.data() + i);
             int32_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(int32_t *)(vd.data() + i) = result;
           }
         }
@@ -1222,7 +1229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = (first_value & !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }            
           for (int i = vl_; i < VLMAX; i++) {
@@ -1235,7 +1242,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = (first_value & !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1248,7 +1255,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = (first_value & !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1268,7 +1275,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = (first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1281,7 +1288,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = (first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1294,7 +1301,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = (first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1314,7 +1321,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = (first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1327,7 +1334,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = (first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1340,7 +1347,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = (first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1360,7 +1367,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = (first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1373,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = (first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1386,7 +1393,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = (first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1406,7 +1413,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = (first_value | !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1419,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = (first_value | !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1432,7 +1439,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = (first_value | !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1452,7 +1459,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = !(first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1465,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = !(first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1478,7 +1485,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = !(first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1498,7 +1505,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = !(first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1511,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = !(first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1524,7 +1531,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = !(first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1544,7 +1551,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first_value  = (first & 0x1);
             uint8_t second_value = (second & 0x1);
             uint8_t result = !(first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1557,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first_value  = (first & 0x1);
             uint16_t second_value = (second & 0x1);
             uint16_t result = !(first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1570,7 +1577,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first_value  = (first & 0x1);
             uint32_t second_value = (second & 0x1);
             uint32_t result = !(first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1588,7 +1595,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first  = *(uint8_t *)(vr1.data() + i);
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1599,7 +1606,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first  = *(uint16_t *)(vr1.data() + i);
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1610,7 +1617,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first  = *(uint32_t *)(vr1.data() + i);
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1628,7 +1635,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint8_t first  = *(uint8_t *)(vr1.data() + i);
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) += result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1639,7 +1646,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint16_t first  = *(uint16_t *)(vr1.data() + i);
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) += result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1650,7 +1657,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
             uint32_t first  = *(uint32_t *)(vr1.data() + i);
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) += result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1669,7 +1676,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
           for (int i = 0; i < vl_; i++) {
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (rsdata[i][0] + second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1679,7 +1686,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
           for (int i = 0; i < vl_; i++) {
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (rsdata[i][0] + second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1689,7 +1696,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
           for (int i = 0; i < vl_; i++) {
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (rsdata[i][0] + second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1705,7 +1712,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
           for (int i = 0; i < vl_; i++) {
             uint8_t second = *(uint8_t *)(vr2.data() + i);
             uint8_t result = (rsdata[i][0] * second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
             *(uint8_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1715,7 +1722,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
           for (int i = 0; i < vl_; i++) {
             uint16_t second = *(uint16_t *)(vr2.data() + i);
             uint16_t result = (rsdata[i][0] * second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
             *(uint16_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1725,7 +1732,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
           for (int i = 0; i < vl_; i++) {
             uint32_t second = *(uint32_t *)(vr2.data() + i);
             uint32_t result = (rsdata[i][0] * second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
             *(uint32_t *)(vd.data() + i) = result;
           }
           for (int i = vl_; i < VLMAX; i++) {
@@ -1741,7 +1748,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
       vtype_.vsew  = instr.getVsew();
       vtype_.vlmul = instr.getVlmul();
 
-      D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew  << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);
+      DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew  << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);
 
       int s0 = rsdata[0][0];
       if (s0 <= VLMAX) {
@@ -1762,46 +1769,49 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
   }
 
   if (rd_write) {
+    pipeline_state->wb = true;
     DPH(2, "Dest Reg: ");
-    int rdt = instr.getRDType();    
+    auto rdt = instr.getRDType();    
     switch (rdt) {
-    case 1:      
+    case RegType::Integer:      
       if (rdest) {    
-        DPH(2, "r" << std::dec << rdest << "={");    
+        DPN(2, "r" << std::dec << rdest << "={");    
         for (int t = 0; t < num_threads; ++t) {
-          if (!tmask_.test(t))
-            continue;
-          iRegFile_.at(t)[rdest] = rddata[t];  
           if (t) DPN(2, ", ");
+          if (!tmask_.test(t)) {
+            DPN(2, "-");
+            continue;            
+          }
+          iRegFile_.at(t)[rdest] = rddata[t];
           DPN(2, "0x" << std::hex << rddata[t]);         
         }
         DPN(2, "}" << std::endl);
         pipeline_state->used_iregs[rdest] = 1;
       }
       break;
-    case 2:
-      DPH(2, "fr" << std::dec << rdest << "={");
+    case RegType::Float:
+      DPN(2, "fr" << std::dec << rdest << "={");
       for (int t = 0; t < num_threads; ++t) {
-        if (!tmask_.test(t))
-          continue;
-        fRegFile_.at(t)[rdest] = rddata[t];        
         if (t) DPN(2, ", ");
+        if (!tmask_.test(t)) {
+          DPN(2, "-");
+          continue;            
+        }
+        fRegFile_.at(t)[rdest] = rddata[t];        
         DPN(2, "0x" << std::hex << rddata[t]);         
       }
       DPN(2, "}" << std::endl);
       pipeline_state->used_fregs[rdest] = 1;
       break;
-    case 3:
-      pipeline_state->used_vregs[rdest] = 1;
-      break;
     default:
+      std::abort();
       break;
     }
   }
 
   PC_ += core_->arch().wsize();
   if (PC_ != nextPC) {
-    D(3, "*** Next PC: " << std::hex << nextPC << std::dec);
+    DP(3, "*** Next PC: " << std::hex << nextPC << std::dec);
     PC_ = nextPC;
   }
 }
diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp
index 5cdf22f3..ba280812 100644
--- a/sim/simX/exeunit.cpp
+++ b/sim/simX/exeunit.cpp
@@ -9,6 +9,17 @@
 
 using namespace vortex;
 
+NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} // the Core* argument is unnamed and unused; "NOP" is passed to the ExeUnit constructor
+    
+void NopUnit::step(uint64_t /*cycle*/) {
+    pipeline_state_t pending; // receives the popped input, if there is one
+    if (inputs_.try_pop(&pending)) {
+        this->schedule_output(pending, 1); // hand the state to the output without modifying it
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 LsuUnit::LsuUnit(Core* core) 
     : ExeUnit("LSU")
     , core_(core)
@@ -17,61 +28,77 @@ LsuUnit::LsuUnit(Core* core)
     , fence_lock_(false)
 {}
 
-void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) {
-    auto entry = pending_dcache_.at(response.tag);    
-    entry.second.reset(port_id); // track remaining blocks
-    if (!entry.second.any()) {        
-        auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
-        entry.first.dcache_latency = latency;
-        this->schedule_output(entry.first, 1);
-        pending_dcache_.release(response.tag);
-    }
-}
+void LsuUnit::step(uint64_t cycle) {
+    __unused (cycle); // presumably keeps 'cycle' referenced if DT() expands to nothing - TODO confirm
+
+    // handle dcache response
+    for (uint32_t t = 0; t < num_threads_; ++t) { // one response port is polled per thread index
+        MemRsp mem_rsp;
+        if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp))
+            continue; // read() returned false: nothing to handle for this thread
+        auto& entry = pending_dcache_.at(mem_rsp.tag);  // by reference so the mask update below lands in the stored entry
+        DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first);  
+        assert(entry.second.test(t)); // thread t must still be outstanding for this tag
+        entry.second.reset(t); // track remaining blocks        
+        if (!entry.second.any()) {        // every thread recorded in the mask has responded
+            auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); // dcache_latency held the issue-time cycle count
+            entry.first.dcache_latency = latency; // now stores the elapsed cycles instead
+            this->schedule_output(entry.first, 1);
+            pending_dcache_.release(mem_rsp.tag); // 'entry' is not used after this point
+        }   
+    }
 
-void LsuUnit::step() {
     if (fence_lock_) {
         // wait for all pending memory operations to complete
         if (!pending_dcache_.empty())
             return;
         this->schedule_output(fence_state_, 1);
         fence_lock_ = false;
+        DT(3, cycle, "fence-unlock: " << fence_state_); // traced only once no dcache entries remain pending
     }
 
+    // check input queue
     if (inputs_.empty())
         return;
 
     auto state = inputs_.top();
 
-    if (state.lsu.fence) {
+    if (state.lsu.type == LsuType::FENCE) {
         // schedule fence lock
         fence_state_ = state;
         fence_lock_ = true;
         inputs_.pop();
+        DT(3, cycle, "fence-lock: " << state);
         return;
     }
 
-    // send dcache requests
-    if (!pending_dcache_.full()) {   
-        state.dcache_latency = SimPlatform::instance().cycles();
-        auto tag = pending_dcache_.allocate({state, state.tmask});         
-        for (uint32_t t = 0; t < num_threads_; ++t) {
-            if (!state.tmask.test(t))
-                continue;
-            MemReq mem_req;
-            mem_req.addr  = state.mem_addrs.at(t);
-            mem_req.write = state.lsu.store;
-            mem_req.tag   = tag;
-            core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
-        }            
-        inputs_.pop();
+    // check pending queue capacity
+    if (pending_dcache_.full()) {
+        DT(3, cycle, "*** lsu-queue-stall: " << state);
+        return; // the input is left queued (not popped) so a later step() sees it again
     }
+
+    // send dcache request 
+    state.dcache_latency = SimPlatform::instance().cycles(); // issue-time cycle count; turned into a latency when the last response arrives
+    auto tag = pending_dcache_.allocate({state, state.tmask});         
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        if (!state.tmask.test(t))
+            continue; // threads not set in tmask issue no request
+        MemReq mem_req;
+        mem_req.addr  = state.mem_addrs.at(t);
+        mem_req.write = (state.lsu.type == LsuType::STORE);
+        mem_req.tag   = tag; // the same tag is shared by every thread of this instruction
+        core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
+        DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state);
+    }            
+    inputs_.pop(); // consumed only after its requests have been sent
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 
 AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
     
-void AluUnit::step() {
+void AluUnit::step(uint64_t /*cycle*/) {
     pipeline_state_t state;
     if (!inputs_.try_pop(&state))
         return;
@@ -95,7 +122,7 @@ void AluUnit::step() {
 
 CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
     
-void CsrUnit::step() {
+void CsrUnit::step(uint64_t /*cycle*/) {
     pipeline_state_t state;
     if (!inputs_.try_pop(&state))
         return;
@@ -106,7 +133,7 @@ void CsrUnit::step() {
 
 FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
     
-void FpuUnit::step() {
+void FpuUnit::step(uint64_t /*cycle*/) {
     pipeline_state_t state;
     if (!inputs_.try_pop(&state))
         return;
@@ -133,7 +160,7 @@ void FpuUnit::step() {
 
 GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
     
-void GpuUnit::step() {
+void GpuUnit::step(uint64_t /*cycle*/) {
     pipeline_state_t state;
     if (!inputs_.try_pop(&state))
         return;
diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h
index 915089d3..3b2bbf91 100644
--- a/sim/simX/exeunit.h
+++ b/sim/simX/exeunit.h
@@ -43,7 +43,16 @@ public:
         return outputs_.try_pop(state);
     }
 
-    virtual void step() = 0;
+    virtual void step(uint64_t cycle) = 0;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class NopUnit : public ExeUnit { // unit whose step() forwards a popped input state to the output unchanged
+public:
+    NopUnit(Core*);
+    
+    void step(uint64_t cycle) override; // implements the pure-virtual step(uint64_t) declared above
 };
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -59,9 +68,7 @@ private:
 public:
     LsuUnit(Core*);
 
-    void handleCacheReponse(const MemRsp& response, uint32_t port_id);
-
-    void step();
+    void step(uint64_t cycle) override; // overrides the base-class pure-virtual step(uint64_t)
 };
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -70,7 +77,7 @@ class AluUnit : public ExeUnit {
 public:
     AluUnit(Core*);
     
-    void step();
+    void step(uint64_t cycle) override; // overrides the base-class pure-virtual step(uint64_t)
 };
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -79,7 +86,7 @@ class CsrUnit : public ExeUnit {
 public:
     CsrUnit(Core*);
     
-    void step();
+    void step(uint64_t cycle) override; // overrides the base-class pure-virtual step(uint64_t)
 };
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -88,7 +95,7 @@ class FpuUnit : public ExeUnit {
 public:
     FpuUnit(Core*);
     
-    void step();
+    void step(uint64_t cycle) override; // overrides the base-class pure-virtual step(uint64_t)
 };
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -97,7 +104,7 @@ class GpuUnit : public ExeUnit {
 public:
     GpuUnit(Core*);
     
-    void step();
+    void step(uint64_t cycle) override; // overrides the base-class pure-virtual step(uint64_t)
 };
 
 }
\ No newline at end of file
diff --git a/sim/simX/instr.h b/sim/simX/instr.h
index 1a205478..5deace6c 100644
--- a/sim/simX/instr.h
+++ b/sim/simX/instr.h
@@ -53,22 +53,23 @@ public:
     : opcode_(Opcode::NOP)
     , num_rsrcs_(0)
     , has_imm_(false)
+    , rdest_type_(RegType::None)
     , rdest_(0)
     , func3_(0)
     , func7_(0) {
     for (int i = 0; i < MAX_REG_SOURCES; ++i) {
-       rsrc_type_[i] = 0;
+       rsrc_type_[i] = RegType::None;
     }
   }
 
   /* Setters used to "craft" the instruction. */
   void setOpcode(Opcode opcode)  { opcode_ = opcode; }
-  void setDestReg(int destReg) { rdest_type_ = 1; rdest_ = destReg; }
-  void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = 1; rsrc_[num_rsrcs_++] = srcReg; }
-  void setDestFReg(int destReg) { rdest_type_ = 2; rdest_ = destReg; }
-  void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = 2; rsrc_[num_rsrcs_++] = srcReg;  }
-  void setDestVReg(int destReg) { rdest_type_ = 3; rdest_ = destReg; }
-  void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = 3; rsrc_[num_rsrcs_++] = srcReg;  }
+  void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; } // destination is an integer register
+  void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; } // appends an integer source; num_rsrcs_ is not range-checked here
+  void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; } // destination is a float register
+  void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg;  } // appends a float source; num_rsrcs_ is not range-checked here
+  void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; } // destination is a vector register
+  void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg;  } // appends a vector source; num_rsrcs_ is not range-checked here
   void setFunc3(Word func3) { func3_ = func3; }
   void setFunc7(Word func7) { func7_ = func7; }
   void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
@@ -89,9 +90,9 @@ public:
   Word getFunc7() const { return func7_; }
   int getNRSrc() const { return num_rsrcs_; }
   int getRSrc(int i) const { return rsrc_[i]; }
-  int getRSType(int i) const { return rsrc_type_[i]; }
+  RegType getRSType(int i) const { return rsrc_type_[i]; }
   int getRDest() const { return rdest_; }  
-  int getRDType() const { return rdest_type_; }  
+  RegType getRDType() const { return rdest_type_; }  
   bool hasImm() const { return has_imm_; }
   Word getImm() const { return imm_; }
   Word getVlsWidth() const { return vlsWidth_; }
@@ -112,15 +113,15 @@ private:
   Opcode opcode_;
   int num_rsrcs_;
   bool has_imm_;
-  int rdest_type_;
+  RegType rdest_type_;
   Word imm_;
-  int rsrc_type_[MAX_REG_SOURCES];
+  RegType rsrc_type_[MAX_REG_SOURCES];
   int rsrc_[MAX_REG_SOURCES];  
   int rdest_;
   Word func3_;
   Word func6_;
 
-  //Vector
+  // Vector
   Word vmask_;
   Word vlsWidth_;
   Word vMop_;
diff --git a/sim/simX/main.cpp b/sim/simX/main.cpp
index a34ada0e..a0e07faf 100644
--- a/sim/simX/main.cpp
+++ b/sim/simX/main.cpp
@@ -6,12 +6,15 @@
 #include <stdlib.h>
 #include <sys/stat.h>
 #include "processor.h"
+#include <util.h>
 #include "args.h"
 
+#define RAM_PAGE_SIZE 4096
+
 using namespace vortex;
 
 int main(int argc, char **argv) {
-  int ret;
+  int exitcode;
 
   std::string archStr("rv32imf");
   std::string imgFileName;
@@ -53,11 +56,42 @@ int main(int argc, char **argv) {
 
   {
     ArchDef arch(archStr, num_cores, num_warps, num_threads);
+
     Processor processor(arch);
-    ret = processor.run(imgFileName, riscv_test, showStats);
+
+    RAM ram(RAM_PAGE_SIZE);
+
+    {
+      std::string program_ext(fileExtension(imgFileName.c_str()));
+      if (program_ext == "bin") {
+        ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
+      } else if (program_ext == "hex") {
+        ram.loadHexImage(imgFileName.c_str());
+      } else {
+        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
+        return -1;
+      }
+    }
+
+    processor.attach_ram(&ram);
+
+    exitcode = processor.run();
+
+    if (riscv_test) {
+      if (1 == exitcode) {
+        std::cout << "Passed." << std::endl;
+        exitcode = 0;
+      } else {
+        std::cout << "Failed." << std::endl;
+      }
+    } else {
+      if (exitcode != 0) {
+        std::cout << "*** error: exitcode=" << exitcode << std::endl;
+      }
+    }
   }  
 
   SimPlatform::instance().finalize();
 
-  return ret;
+  return exitcode;
 }
diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp
index c377972d..63ba571a 100644
--- a/sim/simX/memsim.cpp
+++ b/sim/simX/memsim.cpp
@@ -8,32 +8,26 @@ using namespace vortex;
 class MemSim::Impl {
 private:
     MemSim* simobject_;
-    std::vector<std::queue<MemReq>> inputs_;
+    uint32_t num_banks_;
     uint32_t latency_;
 
 public:
     Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) 
         : simobject_(simobject)
-        , inputs_(num_banks)
+        , num_banks_(num_banks)
         , latency_(latency)  
     {}
 
-    void handleMemRequest(const MemReq& mem_req, uint32_t port_id) {
-        inputs_.at(port_id).push(mem_req);        
-    }
-
     void step(uint64_t /*cycle*/) {
-        for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) {
-            auto& queue = inputs_.at(i);            
-            if (queue.empty())
+        for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
+            MemReq mem_req;     
+            if (!simobject_->MemReqPorts.at(i).read(&mem_req))
                 continue;
-            auto& entry = queue.front();
-            if (!entry.write) {
+            if (!mem_req.write) {
                 MemRsp mem_rsp;
-                mem_rsp.tag = entry.tag;
+                mem_rsp.tag = mem_req.tag;
                 simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
             }
-            queue.pop();
         }
     }
 };
@@ -45,7 +39,7 @@ MemSim::MemSim(const SimContext& ctx,
                uint32_t latency) 
     : SimObject<MemSim>(ctx, "MemSim")
     , impl_(new Impl(this, num_banks, latency))
-    , MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest}) 
+    , MemReqPorts(num_banks, this) 
     , MemRspPorts(num_banks, this)
 {}
 
diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h
index 82735c2a..b5937b29 100644
--- a/sim/simX/pipeline.h
+++ b/sim/simX/pipeline.h
@@ -10,14 +10,19 @@
 namespace vortex {
 
 struct pipeline_state_t {
-  //--    
+  //--
+  uint64_t    id;
+  
+  //--
+  int         cid;
   int         wid;  
   ThreadMask  tmask;
   Word        PC;
 
   //--
   bool        stall_warp;
-  int         rdest_type;
+  bool        wb;  
+  RegType     rdest_type;
   int         rdest;
   RegMask     used_iregs;
   RegMask     used_fregs;
@@ -30,10 +35,7 @@ struct pipeline_state_t {
   //--
   union {
     struct {        
-      uint8_t load : 1;
-      uint8_t store: 1;
-      uint8_t fence : 1;
-      uint8_t prefetch: 1;
+      LsuType type;
     } lsu;
     struct {
       AluType type;
@@ -49,8 +51,37 @@ struct pipeline_state_t {
   // stats
   uint64_t icache_latency;
   uint64_t dcache_latency;
+
+  void clear() { // resets the fields below; `id` and the per-unit union are left untouched
+    // where the instruction comes from
+    cid = wid = 0;
+    PC = 0;
+    tmask.reset();
+    // write-back and hazard tracking
+    stall_warp = wb = false;
+    rdest_type = RegType::None;
+    rdest = 0;
+    used_iregs.reset();
+    used_fregs.reset();
+    used_vregs.reset();
+    // dispatch target, pending addresses and latency stats
+    exe_type = ExeType::NOP;
+    mem_addrs.clear();
+    icache_latency = dcache_latency = 0;
+  }
 };
 
+// Writes a one-line trace record for `state`: core/warp/PC, the write-back
+// flag (plus the destination register when it is set), the unit and the id.
+inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
+  os << "coreid=" << state.cid << ", wid=" << state.wid;
+  os << ", PC=" << std::hex << state.PC << ", wb=" << state.wb;
+  if (state.wb)
+     os << ", rd=" << state.rdest_type << std::dec << state.rdest;
+  os << ", ex=" << state.exe_type;
+  return os << " (#" << std::dec << state.id << ")";
+}
+
 class PipelineStage : public Queue<pipeline_state_t> {
 protected:
   const char* name_;
@@ -62,15 +93,4 @@ public:
   {}
 };
 
-inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
-  os << "stall_warp="   << state.stall_warp;
-  os << ", wid="        << state.wid;
-  os << ", PC="         << std::hex << state.PC;
-  os << ", used_iregs=" << state.used_iregs;
-  os << ", used_fregs=" << state.used_fregs;
-  os << ", used_vregs=" << state.used_vregs;
-  os << std::endl;
-  return os;
-}
-
 }
\ No newline at end of file
diff --git a/sim/simX/processor.cpp b/sim/simX/processor.cpp
new file mode 100644
index 00000000..be5cd4f4
--- /dev/null
+++ b/sim/simX/processor.cpp
@@ -0,0 +1,141 @@
+#include "processor.h"
+#include "constants.h"
+
+using namespace vortex;
+
+// Creates the cores and wires their memory ports down to the single MemSim bank:
+// MemSim <- (L3 cache | cluster switch) <- per cluster: (L2 cache | core switch) <- cores.
+Processor::Processor(const ArchDef& arch) 
+  : cores_(arch.num_cores())
+  , l2caches_(NUM_CLUSTERS)
+  , l2_mem_switches_(NUM_CLUSTERS)
+{
+  uint32_t num_cores = arch.num_cores();
+  uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
+
+  // create cores
+  for (uint32_t i = 0; i < num_cores; ++i) {
+    cores_.at(i) = Core::Create(arch, i);
+  }
+
+  // connect memory sub-system; mem_*_ports hold the ports the next level down attaches to
+  memsim_ = MemSim::Create(1, MEM_LATENCY);
+  std::vector<SlavePort<MemReq>*>  mem_req_ports(1);
+  std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
+  mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
+  mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
+
+  if (L3_ENABLE) {
+    l3cache_ = Cache::Create("l3cache", CacheConfig{
+      log2ceil(L3_CACHE_SIZE),  // C
+      log2ceil(MEM_BLOCK_SIZE), // B
+      2,                      // W
+      0,                      // A
+      32,                     // address bits
+      L3_NUM_BANKS,           // number of banks
+      L3_NUM_PORTS,           // number of ports
+      NUM_CLUSTERS,           // request size
+      true,                   // write-through
+      0,                      // victim size
+      L3_MSHR_SIZE,           // mshr
+      2,                      // pipeline latency
+    });
+    mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
+    l3cache_->MemReqPort.bind(mem_req_ports.at(0));
+    mem_req_ports.resize(NUM_CLUSTERS);
+    mem_rsp_ports.resize(NUM_CLUSTERS);
+    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+      mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
+      mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
+    }
+  } else if (NUM_CLUSTERS > 1) {
+    l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
+    mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
+    l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
+    mem_req_ports.resize(NUM_CLUSTERS);
+    mem_rsp_ports.resize(NUM_CLUSTERS);
+    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+      mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
+      mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
+    }
+  }
+
+  for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+    // Ports the cores of cluster i attach to. They live in their own vectors because
+    // mem_*_ports must keep the per-cluster upstream ports for the remaining iterations.
+    std::vector<SlavePort<MemReq>*>  core_req_ports(cores_per_cluster);
+    std::vector<MasterPort<MemRsp>*> core_rsp_ports(cores_per_cluster);
+    if (L2_ENABLE) {
+      auto& l2cache = l2caches_.at(i);
+      l2cache = Cache::Create("l2cache", CacheConfig{
+        log2ceil(L2_CACHE_SIZE),  // C
+        log2ceil(MEM_BLOCK_SIZE), // B
+        2,                      // W
+        0,                      // A
+        32,                     // address bits
+        L2_NUM_BANKS,           // number of banks
+        L2_NUM_PORTS,           // number of ports
+        NUM_CORES,              // request size
+        true,                   // write-through
+        0,                      // victim size
+        L2_MSHR_SIZE,           // mshr
+        2,                      // pipeline latency
+      });
+      mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
+      l2cache->MemReqPort.bind(mem_req_ports.at(i));
+      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+        core_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
+        core_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
+      }
+    } else if (cores_per_cluster > 1) {
+      auto& l2_mem_switch = l2_mem_switches_.at(i);
+      l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
+      mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
+      l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
+      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+        core_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
+        core_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
+      }
+    } else if (cores_per_cluster == 1) {
+      // one core per cluster and no L2: the core uses the cluster's upstream ports directly
+      core_req_ports.at(0) = mem_req_ports.at(i);
+      core_rsp_ports.at(0) = mem_rsp_ports.at(i);
+    }
+
+    for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+      auto& core = cores_.at((i * cores_per_cluster) + j); // clusters own consecutive runs of cores
+      core_rsp_ports.at(j)->bind(&core->MemRspPort);
+      core->MemReqPort.bind(core_req_ports.at(j));
+    }
+  }
+}
+
+// Hands the same RAM instance to every core.
+void Processor::attach_ram(RAM* ram) {
+  for (size_t i = 0, n = cores_.size(); i < n; ++i)
+    cores_[i]->attach_ram(ram);
+}
+
+Processor::~Processor() {}
+
+// Runs the simulation to completion.
+//
+// Each iteration advances the platform by one step and then polls the cores in
+// order: the loop ends as soon as a core reports an ebreak (its integer
+// register 3 becomes the exit code) or once no core is running (exit code 0).
+int Processor::run() {
+  for (;;) {
+    SimPlatform::instance().step();
+
+    bool any_running = false;
+    for (auto& core : cores_) {
+      if (core->running())
+        any_running = true;
+      if (core->check_ebreak())
+        return core->getIRegValue(3); // ebreak wins over "still running"
+    }
+
+    if (!any_running)
+      return 0;
+  }
+}
\ No newline at end of file
diff --git a/sim/simX/processor.h b/sim/simX/processor.h
index 50671953..e41fd740 100644
--- a/sim/simX/processor.h
+++ b/sim/simX/processor.h
@@ -1,189 +1,27 @@
 #pragma once
 
-#include "constants.h"
-#include "debug.h"
-#include "types.h"
 #include "core.h"
 
 namespace vortex {
 
 class Processor {
+public:
+  typedef std::shared_ptr<Processor> Ptr;
+  
+  Processor(const ArchDef& arch);
+  ~Processor();
+
+  void attach_ram(RAM* mem);
+
+  int run();
+
 private:
-  ArchDef arch_; 
-  Decoder decoder_;
-  MemoryUnit mu_;
-  RAM ram_;
   std::vector<Core::Ptr> cores_;  
   std::vector<Cache::Ptr> l2caches_;  
   std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
   Cache::Ptr l3cache_;
   Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
   MemSim::Ptr memsim_;
-
-public:
-  Processor(const ArchDef& arch) 
-    : arch_(arch)
-    , decoder_(arch)
-    , mu_(0, arch.wsize(), true)
-    , ram_((1<<12), (1<<20)) 
-    , cores_(arch.num_cores())
-    , l2caches_(NUM_CLUSTERS)
-    , l2_mem_switches_(NUM_CLUSTERS)
-  {
-    uint32_t num_cores = arch.num_cores();
-    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
-    
-    // bind RAM to memory unit
-    mu_.attach(ram_, 0, 0xFFFFFFFF);    
-
-    // create cores
-    for (uint32_t i = 0; i < num_cores; ++i) {
-      cores_.at(i) = Core::Create(arch, decoder_, mu_, i);
-    }
-    
-    // connect memory sub-systen
-    memsim_ = MemSim::Create(1, MEM_LATENCY);
-    std::vector<SlavePort<MemReq>*>  mem_req_ports(1); 
-    std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
-    mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
-    mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
-
-    if (L3_ENABLE) {
-      l3cache_ = Cache::Create("l3cache", CacheConfig{
-        log2ceil(L3_CACHE_SIZE),  // C
-        log2ceil(MEM_BLOCK_SIZE), // B
-        2,                      // W
-        0,                      // A
-        32,                    // address bits    
-        L3_NUM_BANKS,           // number of banks
-        L3_NUM_PORTS,           // number of ports
-        NUM_CLUSTERS,           // request size   
-        true,                   // write-throught
-        0,                      // victim size
-        L3_MSHR_SIZE,           // mshr
-        2,                      // pipeline latency
-      });
-      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
-      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
-        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
-      }
-    } else if (NUM_CLUSTERS > 1) {
-      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
-      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
-      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
-        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
-      }
-    }
-
-    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {      
-      if (L2_ENABLE) {
-        auto& l2cache = l2caches_.at(i);
-        l2cache = Cache::Create("l2cache", CacheConfig{
-          log2ceil(L2_CACHE_SIZE),  // C
-          log2ceil(MEM_BLOCK_SIZE), // B
-          2,                      // W
-          0,                      // A
-          32,                     // address bits    
-          L2_NUM_BANKS,           // number of banks
-          L2_NUM_PORTS,           // number of ports
-          NUM_CORES,              // request size   
-          true,                   // write-throught
-          0,                      // victim size
-          L2_MSHR_SIZE,           // mshr
-          2,                      // pipeline latency
-        });
-        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
-        l2cache->MemReqPort.bind(mem_req_ports.at(i));
-
-        mem_req_ports.resize(cores_per_cluster);
-        mem_rsp_ports.resize(cores_per_cluster);
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
-          mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
-        }
-      } else if (cores_per_cluster > 1) {
-        auto& l2_mem_switch = l2_mem_switches_.at(i);
-        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
-        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
-        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));  
-
-        mem_req_ports.resize(cores_per_cluster);
-        mem_rsp_ports.resize(cores_per_cluster);
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
-          mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
-        }
-      }
-
-      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-        auto& core = cores_.at((i * NUM_CLUSTERS) + j);        
-        mem_rsp_ports.at(i)->bind(&core->MemRspPort);
-        core->MemReqPort.bind(mem_req_ports.at(j));
-      }
-    }
-  }
-
-  ~Processor() {}
-
-  int run(const std::string& program, bool riscv_test, bool /*showStats*/) {
-    {
-      std::string program_ext(fileExtension(program.c_str()));
-      if (program_ext == "bin") {
-        ram_.loadBinImage(program.c_str(), STARTUP_ADDR);
-      } else if (program_ext == "hex") {
-        ram_.loadHexImage(program.c_str());
-      } else {
-        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
-        return -1;
-      }
-    }
-
-    bool running;
-    int exitcode = 0;
-    do {
-      SimPlatform::instance().step();
-      
-      running = false;
-      for (auto& core : cores_) {
-        if (core->running()) {
-          running = true;
-        }
-        if (core->check_ebreak()) {
-          exitcode = core->getIRegValue(3);
-          running = false;
-          break;
-        }
-      }
-    } while (running);
-
-    // get error status
-
-    if (riscv_test) {
-      if (1 == exitcode) {
-        std::cout << "Passed." << std::endl;
-        exitcode = 0;
-      } else {
-        std::cout << "Failed." << std::endl;
-      }
-    } else {
-      if (exitcode != 0) {
-        std::cout << "*** error: exitcode=" << exitcode << std::endl;
-      }
-    }
-
-    return exitcode;
-  }
-
 };
 
 }
\ No newline at end of file
diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h
index 0e0e0577..46bf3bdc 100644
--- a/sim/simX/scoreboard.h
+++ b/sim/simX/scoreboard.h
@@ -10,6 +10,7 @@ private:
     std::vector<RegMask> in_use_iregs_;
     std::vector<RegMask> in_use_fregs_;
     std::vector<RegMask> in_use_vregs_;
+    std::unordered_map<uint32_t, uint64_t> owners_; 
 
 public:    
     Scoreboard(const ArchDef &arch) 
@@ -29,42 +30,87 @@ public:
             || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0
             || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0;
     }
+
+    // Lists the ids of the instructions this one has to wait for.
+    //
+    // For every register that `state` uses and that is still reserved for its
+    // warp, the id recorded by reserve() is appended to the result. Entries are
+    // grouped by register class (integer, float, vector) and ordered by
+    // ascending register index within a class.
+    //
+    // owners_.at() throws std::out_of_range if a reserved register has no
+    // recorded owner, i.e. if the in-use masks and owners_ ever get out of
+    // sync; in_use_*_.at() throws likewise when state.wid is not a valid warp
+    // index.
+    std::vector<uint64_t> owners(const pipeline_state_t& state) const {
+        std::vector<uint64_t> out;
+
+        // Appends the owner of each set bit in `pending` (bit index == register
+        // index) for registers of class `type`.
+        auto collect = [&](RegMask pending, RegType type) {
+            for (uint32_t r = 0; pending.any(); ++r, pending >>= 1) {
+                if (!pending.test(0))
+                    continue;
+                // same key layout as reserve()/release(): register | warp | class
+                uint32_t tag = (r << 16) | (state.wid << 4) | (int)type;
+                out.push_back(owners_.at(tag));
+            }
+        };
+
+        collect(
+            state.used_iregs & in_use_iregs_.at(state.wid),
+            RegType::Integer);
+        collect(
+            state.used_fregs & in_use_fregs_.at(state.wid),
+            RegType::Float);
+        collect(
+            state.used_vregs & in_use_vregs_.at(state.wid),
+            RegType::Vector);
+
+        // Return the local by value: std::move on it would only inhibit copy
+        // elision (NRVO) without saving anything.
+        return out;
+    }
     
     void reserve(const pipeline_state_t& state) {
-        if (!state.rdest)
-            return;
-        
+        if (!state.wb)
+            return;  
         switch (state.rdest_type) {
-        case 1:            
+        case RegType::Integer:            
             in_use_iregs_.at(state.wid).set(state.rdest);
             break;
-        case 2:
+        case RegType::Float:
             in_use_fregs_.at(state.wid).set(state.rdest);
             break;
-        case 3:
+        case RegType::Vector:
             in_use_vregs_.at(state.wid).set(state.rdest);
             break;
         default:  
             break;
-        }
+        }      
+        uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
+        assert(owners_.count(tag) == 0);
+        owners_[tag] = state.id;
     }
 
     void release(const pipeline_state_t& state) {
-        if (!state.rdest)
-            return;
+        if (!state.wb)
+            return;       
         switch (state.rdest_type) {
-        case 1:
+        case RegType::Integer:
             in_use_iregs_.at(state.wid).reset(state.rdest);
             break;
-        case 2:
+        case RegType::Float:
             in_use_fregs_.at(state.wid).reset(state.rdest);
             break;
-        case 3:
+        case RegType::Vector:
             in_use_vregs_.at(state.wid).reset(state.rdest);
             break;
         default:  
             break;
         }      
+        uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
+        owners_.erase(tag);
     }
 };
 
diff --git a/sim/simX/types.h b/sim/simX/types.h
index 3dabfe3e..f53c3754 100644
--- a/sim/simX/types.h
+++ b/sim/simX/types.h
@@ -4,6 +4,7 @@
 #include <bitset>
 #include <queue>
 #include <unordered_map>
+#include <util.h>
 #include <VX_config.h>
 #include <simobject.h>
 
@@ -20,7 +21,25 @@ typedef std::bitset<32> RegMask;
 typedef std::bitset<32> ThreadMask;
 typedef std::bitset<32> WarpMask;
 
+enum class RegType {
+  None,    // prints nothing
+  Integer, // prints as "r"
+  Float,   // prints as "fr"
+  Vector   // prints as "vr"
+};
+
+inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
+  switch (type) { // emits the register-name prefix for the class
+  case RegType::Integer: return os << "r";
+  case RegType::Float:   return os << "fr";
+  case RegType::Vector:  return os << "vr";
+  case RegType::None:    break;
+  }
+  return os;
+}
+
 enum class ExeType {
+  NOP,
   ALU,
   LSU,
   CSR,
@@ -29,6 +48,19 @@ enum class ExeType {
   MAX,
 };
 
+inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
+  switch (type) { // prints the unit mnemonic; the MAX sentinel prints nothing
+  case ExeType::NOP: return os << "NOP";
+  case ExeType::ALU: return os << "ALU";
+  case ExeType::LSU: return os << "LSU";
+  case ExeType::CSR: return os << "CSR";
+  case ExeType::FPU: return os << "FPU";
+  case ExeType::GPU: return os << "GPU";
+  case ExeType::MAX: break;
+  }
+  return os;
+}
+
 enum class AluType {
   ARITH,
   BRANCH,
@@ -36,6 +68,33 @@ enum class AluType {
   IDIV,    
 };
 
+inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
+  switch (type) { // prints the enumerator name; unknown values print nothing
+  case AluType::ARITH:  return os << "ARITH";
+  case AluType::BRANCH: return os << "BRANCH";
+  case AluType::IMUL:   return os << "IMUL";
+  case AluType::IDIV:   return os << "IDIV";
+  }
+  return os;
+}
+
+enum class LsuType {
+  LOAD,
+  STORE,
+  FENCE,
+  PREFETCH,
+};
+
+inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
+  switch (type) { // prints the enumerator name; unknown values print nothing
+  case LsuType::LOAD:     return os << "LOAD";
+  case LsuType::STORE:    return os << "STORE";
+  case LsuType::FENCE:    return os << "FENCE";
+  case LsuType::PREFETCH: return os << "PREFETCH";
+  }
+  return os;
+}
+
 enum class FpuType {
   FNCP,
   FMA,
@@ -44,6 +103,17 @@ enum class FpuType {
   FCVT,
 };
 
+inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
+  switch (type) { // prints the enumerator name; unknown values print nothing
+  case FpuType::FNCP:  return os << "FNCP";
+  case FpuType::FMA:   return os << "FMA";
+  case FpuType::FDIV:  return os << "FDIV";
+  case FpuType::FSQRT: return os << "FSQRT";
+  case FpuType::FCVT:  return os << "FCVT";
+  }
+  return os;
+}
+
 enum class GpuType {
   TMC,
   WSPAWN,
@@ -53,11 +123,31 @@ enum class GpuType {
   TEX,
 };
 
+inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
+  switch (type) { // prints the enumerator name; unknown values print nothing
+  case GpuType::TMC:    return os << "TMC";
+  case GpuType::WSPAWN: return os << "WSPAWN";
+  case GpuType::SPLIT:  return os << "SPLIT";
+  case GpuType::JOIN:   return os << "JOIN";
+  case GpuType::BAR:    return os << "BAR";
+  case GpuType::TEX:    return os << "TEX";
+  }
+  return os;
+}
+
 enum class ArbiterType {
   Priority,
   RoundRobin
 };
 
+inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
+  switch (type) { // prints the enumerator name; unknown values print nothing
+  case ArbiterType::Priority:   return os << "Priority";
+  case ArbiterType::RoundRobin: return os << "RoundRobin";
+  }
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename T>
@@ -65,6 +155,8 @@ class Queue {
 protected:
   std::queue<T> queue_;
 
+  uint32_t count = 0; // incremented by push(); zero-initialized because Queue() {} does not set it
+
 public:
   Queue() {}
 
@@ -77,6 +169,7 @@ public:
   }
 
   void push(const T& value) {
+    ++count;
     queue_.push(value);
   }
 
@@ -141,6 +234,7 @@ public:
         return i;
       }
     }
+    assert(false);
     return -1;
   }
 
@@ -148,6 +242,7 @@ public:
     auto& entry = entries_.at(index);
     assert(entry.first);
     entry.first = false;
+    --capacity_;
   }
 
   void remove(uint32_t index, T* value) {
@@ -155,6 +250,7 @@ public:
     assert(entry.first);
     *value = entry.second;
     entry.first = false;
+    --capacity_;
   }
 };
 
@@ -163,29 +259,21 @@ public:
 template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
 class Switch : public SimObject<Switch<Req, Rsp>> {
 private:
-  struct req_t {  
+  struct req_batch_t {  
     std::vector<Req>       data;
     std::bitset<MaxInputs> valid;
-    req_t() {} 
-    req_t(uint32_t size) : data(size) {} 
+    req_batch_t() {} 
+    req_batch_t(uint32_t size) 
+      : data(size)
+      , valid(0)
+    {} 
   };
 
-  void handleIncomingRequest(const Req& req, uint32_t port_id) {
-    cur_req_.data.at(port_id) = req;
-    cur_req_.valid.set(port_id);
-  }
-
-  void handleIncomingResponse(const Rsp& rsp, uint32_t) {
-    rsps_.push(rsp);
-  }
-
   ArbiterType type_;
-  std::queue<req_t> reqs_;
-  std::queue<Rsp> rsps_;
-  req_t cur_req_; 
+  std::queue<req_batch_t> reqq_;
   uint32_t delay_;  
   uint32_t cursor_;
-  std::unordered_map<uint32_t, uint32_t> addr_table_;
+  uint32_t tag_shift_;
 
 public:
   Switch(
@@ -197,12 +285,12 @@ public:
   ) 
     : SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)    
     , type_(type)
-    , cur_req_(num_inputs)
     , delay_(delay)
     , cursor_(0)
-    , ReqIn(num_inputs, {this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingRequest})
+    , tag_shift_(log2ceil(num_inputs))
+    , ReqIn(num_inputs, this)
     , ReqOut(this)
-    , RspIn(this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingResponse)    
+    , RspIn(this)    
     , RspOut(num_inputs, this)
   {
     assert(delay_ != 0);
@@ -210,36 +298,52 @@ public:
   }
 
   void step(uint64_t /*cycle*/) {    
-    if (cur_req_.valid.any()) {
-      reqs_.push(cur_req_);      
-      cur_req_.valid.reset();
-    }
-
-    while (!reqs_.empty()) {
-      auto& entry = reqs_.front();
-      bool found = false;
-      for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) {
-        auto j = (cursor_ + i) % n;        
-        if (entry.valid.test(j)) {
-          auto& req = entry.data.at(j);
-          addr_table_[req.tag] = j;
-          ReqOut.send(req, delay_);
-          entry.valid.reset(j);
-          this->update_cursor(j);
-          found = true;
-          break;
+    // process incoming requests
+    {
+      req_batch_t req_batch(ReqIn.size());
+      for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
+        Req req;
+        if (ReqIn.at(i).read(&req)) {
+          req_batch.data.at(i) = req;
+          req_batch.valid.set(i);
         }
       }
-      if (found)
-        break;
-      reqs_.pop();
+      if (req_batch.valid.any()) {
+        reqq_.push(req_batch);
+      }
+    }
+
+    // apply arbitration
+    if (!reqq_.empty()) {
+      auto& req_batch = reqq_.front();
+      for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) {
+        auto j = (cursor_ + i) % n;        
+        if (req_batch.valid.test(j)) {
+          auto& req = req_batch.data.at(j);
+          if (tag_shift_) {
+            req.tag = (req.tag << tag_shift_) | j;
+          }
+          ReqOut.send(req, delay_);
+          req_batch.valid.reset(j);
+          this->update_cursor(j);
+          if (!req_batch.valid.any())
+            reqq_.pop(); // pop when empty
+          break;
+        }
+      }      
     } 
 
-    if (!rsps_.empty()) {
-      auto& rsp = rsps_.front();
-      auto port_id = addr_table_.at(rsp.tag);
-      RspOut.at(port_id).send(rsp, 1);
-      rsps_.pop();
+    // process incoming responses
+    {
+      Rsp rsp;
+      if (RspIn.read(&rsp)) {    
+        uint32_t port_id = 0;
+        if (tag_shift_) {
+          port_id = rsp.tag & ((1 << tag_shift_)-1);
+          rsp.tag >>= tag_shift_;
+        }      
+        RspOut.at(port_id).send(rsp, 1);
+      }
     }
   }
 
diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp
index 0c989d0c..89b9cc39 100644
--- a/sim/simX/warp.cpp
+++ b/sim/simX/warp.cpp
@@ -24,30 +24,34 @@ Warp::Warp(Core *core, Word id)
 void Warp::eval(pipeline_state_t *pipeline_state) {
   assert(tmask_.any());
 
-  DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask=");
+  DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
   for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
     DPN(2, tmask_.test(n-i-1));
-  DPN(2, "\n");
+  DPN(2, ", PC=0x" << std::hex << PC_ << std::endl);
 
   /* Fetch and decode. */    
 
-  Word fetched = core_->icache_fetch(PC_);
-  auto instr = core_->decoder().decode(fetched, PC_);
+  Word instr_code = core_->icache_read(PC_, sizeof(Word));
+  auto instr = core_->decoder().decode(instr_code);
+  if (!instr) {
+    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
+    std::abort();
+  }  
+
+  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
 
   // Update state
+  pipeline_state->cid   = core_->id();
   pipeline_state->wid   = id_;
   pipeline_state->PC    = PC_;
   pipeline_state->tmask = tmask_;
   pipeline_state->rdest = instr->getRDest();
   pipeline_state->rdest_type = instr->getRDType();
-  pipeline_state->used_iregs.reset();
-  pipeline_state->used_fregs.reset();
-  pipeline_state->used_vregs.reset();
-  
+    
   // Execute
   this->execute(*instr, pipeline_state);
 
-  D(4, "Register state:");
+  DP(4, "Register state:");
   for (int i = 0; i < core_->arch().num_regs(); ++i) {
     DPN(4, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
     for (int j = 0; j < core_->arch().num_threads(); ++j) {
diff --git a/sim/vlsim/opae_sim.cpp b/sim/vlsim/opae_sim.cpp
index ced1e233..5da617b5 100644
--- a/sim/vlsim/opae_sim.cpp
+++ b/sim/vlsim/opae_sim.cpp
@@ -44,6 +44,8 @@
 #define VERILATOR_RESET_VALUE 2
 #endif
 
+#define RAM_PAGE_SIZE 4096
+
 using namespace vortex;
 
 static uint64_t timestamp = 0;
@@ -136,7 +138,7 @@ opae_sim::opae_sim()
   : stop_(false)
   , host_buffer_ids_(0) {  
   vl_obj_ = new VL_OBJ();
-  ram_ = new RAM((1<<12), (1<<20));
+  ram_ = new RAM(RAM_PAGE_SIZE);
 
   // reset the device
   this->reset();