From c7a81d1493b5e0420546b25c9465a64321418d20 Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Wed, 20 Dec 2023 11:57:44 -0800
Subject: [PATCH] adding sockets support to simx and cache subsystem
 refactoring

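Hardware (hw/rtl):
* Move the L1 memory arbitration out of VX_socket and into VX_cluster.
  Each socket now exposes separate icache_mem_bus_if and
  dcache_mem_bus_if ports; the cluster arbitrates each of them across
  NUM_SOCKETS and feeds the two resulting buses to the L2 cache
  (L2_NUM_REQS is now 2).
* Derive the default NUM_ICACHES/NUM_DCACHES from SOCKET_SIZE instead
  of NUM_CORES.
* Enable the LSU duplicate address check by default; define
  LSU_DUP_DISABLE to turn it off (the LSU_DUP switch is replaced by
  LSU_DUP_ENABLE).
* Remove the per-unit dispatch stall counters (dsp_stalls and the
  VX_CSR_MPM_{ALU,LSU,FPU,SFU}_ST CSRs) and renumber the MPM CSRs that
  followed them.

Runtime:
* vx_dump_perf: read ifetches from VX_CSR_MPM_IFETCHES rather than
  VX_CSR_MPM_LOADS, and drop the removed unit stall counters.
* simx vx_device: the Arch constructor no longer takes the cluster
  count.

SimX (sim/simx):
* Add a Socket class (socket.cpp, socket.h) and make Cluster hold
  sockets instead of cores; icache and dcache requests coming from the
  sockets go through per-cluster round-robin switches before reaching
  the L2 cache.
* Arch initializes its cluster count and socket size from the
  NUM_CLUSTERS and SOCKET_SIZE macros and gains a socket_size()
  accessor.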
---
 hw/rtl/VX_cluster.sv                     |  66 +++++++++-
 hw/rtl/VX_config.vh                      |   9 +-
 hw/rtl/VX_define.vh                      |  18 ++-
 hw/rtl/VX_gpu_pkg.sv                     |   9 +-
 hw/rtl/VX_socket.sv                      |  45 +------
 hw/rtl/VX_types.vh                       |  44 +++----
 hw/rtl/core/VX_core.sv                   |  18 +--
 hw/rtl/core/VX_csr_data.sv               |  15 +--
 hw/rtl/core/VX_issue.sv                  |   2 +-
 hw/rtl/core/VX_lsu_unit.sv               |   8 +-
 hw/rtl/interfaces/VX_pipeline_perf_if.sv |  25 ++--
 runtime/common/utils.cpp                 |  34 +----
 runtime/simx/vortex.cpp                  |   2 +-
 sim/simx/Makefile                        |   2 +-
 sim/simx/arch.h                          |  10 +-
 sim/simx/cluster.cpp                     | 158 ++++++++---------------
 sim/simx/cluster.h                       |  41 +++---
 sim/simx/core.cpp                        | 111 ++++++++--------
 sim/simx/core.h                          |  26 ++--
 sim/simx/exe_unit.cpp                    |  38 +++---
 sim/simx/main.cpp                        |   8 +-
 sim/simx/socket.cpp                      | 146 +++++++++++++++++++++
 sim/simx/socket.h                        |  87 +++++++++++++
 sim/simx/types.h                         |   7 +
 24 files changed, 541 insertions(+), 388 deletions(-)
 create mode 100644 sim/simx/socket.cpp
 create mode 100644 sim/simx/socket.h

diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv
index 90076673..6de47c5f 100644
--- a/hw/rtl/VX_cluster.sv
+++ b/hw/rtl/VX_cluster.sv
@@ -85,8 +85,8 @@ module VX_cluster import VX_gpu_pkg::*; #(
 
     VX_mem_bus_if #(
         .DATA_SIZE (`L1_LINE_SIZE),
-        .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
-    ) per_socket_mem_bus_if[`NUM_SOCKETS]();
+        .TAG_WIDTH (L1_MEM_TAG_WIDTH)
+    ) l1_mem_bus_if[2]();
 
     `RESET_RELAY (l2_reset, reset);
 
@@ -102,7 +102,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
         .MSHR_SIZE      (`L2_MSHR_SIZE),
         .MRSQ_SIZE      (`L2_MRSQ_SIZE),
         .MREQ_SIZE      (`L2_MREQ_SIZE),
-        .TAG_WIDTH      (L1_MEM_ARB_TAG_WIDTH),
+        .TAG_WIDTH      (L1_MEM_TAG_WIDTH),
         .WRITE_ENABLE   (1),
         .UUID_WIDTH     (`UUID_WIDTH),  
         .CORE_OUT_REG   (2),
@@ -115,10 +115,65 @@ module VX_cluster import VX_gpu_pkg::*; #(
     `ifdef PERF_ENABLE
         .cache_perf     (perf_l2cache),
     `endif
-        .core_bus_if    (per_socket_mem_bus_if),
+        .core_bus_if    (l1_mem_bus_if),
         .mem_bus_if     (mem_bus_if)
     );
 
+    VX_mem_bus_if #(
+        .DATA_SIZE (`L1_LINE_SIZE),
+        .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
+    ) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
+
+    VX_mem_bus_if #(
+        .DATA_SIZE (`L1_LINE_SIZE),
+        .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
+    ) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
+
+    VX_mem_bus_if #(
+        .DATA_SIZE (ICACHE_LINE_SIZE),
+        .TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
+    ) icache_mem_bus_if[1]();
+
+    VX_mem_bus_if #(
+        .DATA_SIZE (DCACHE_LINE_SIZE),
+        .TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
+    ) dcache_mem_bus_if[1]();
+
+    `RESET_RELAY (l1_mem_arb_reset, reset);
+
+    VX_mem_arb #(
+        .NUM_INPUTS  (`NUM_SOCKETS),
+        .DATA_SIZE   (`L1_LINE_SIZE),
+        .TAG_WIDTH   (ICACHE_MEM_TAG_WIDTH),
+        .TAG_SEL_IDX (1), // Skip 0 for NC flag
+        .ARBITER     ("R"),
+        .OUT_REG_REQ (2),
+        .OUT_REG_RSP (2)
+    ) icache_mem_arb (
+        .clk        (clk),
+        .reset      (l1_mem_arb_reset),
+        .bus_in_if  (per_socket_icache_mem_bus_if),
+        .bus_out_if (icache_mem_bus_if)
+    );
+
+    VX_mem_arb #(
+        .NUM_INPUTS  (`NUM_SOCKETS),
+        .DATA_SIZE   (`L1_LINE_SIZE),
+        .TAG_WIDTH   (DCACHE_MEM_TAG_WIDTH),
+        .TAG_SEL_IDX (1), // Skip 0 for NC flag
+        .ARBITER     ("R"),
+        .OUT_REG_REQ (2),
+        .OUT_REG_RSP (2)
+    ) dcache_mem_arb (
+        .clk        (clk),
+        .reset      (l1_mem_arb_reset),
+        .bus_in_if  (per_socket_dcache_mem_bus_if),
+        .bus_out_if (dcache_mem_bus_if)
+    );
+
+    `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
+    `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
+
     ///////////////////////////////////////////////////////////////////////////
 
     wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
@@ -155,7 +210,8 @@ module VX_cluster import VX_gpu_pkg::*; #(
             
             .dcr_bus_if     (socket_dcr_bus_if),
 
-            .mem_bus_if     (per_socket_mem_bus_if[i]),
+            .icache_mem_bus_if (per_socket_icache_mem_bus_if[i]),
+            .dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]),
         
         `ifdef GBAR_ENABLE
             .gbar_bus_if    (per_socket_gbar_bus_if[i]),
diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 3af544c6..d35d906b 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -262,7 +262,10 @@
 `endif
 
 // LSU Duplicate Address Check
-`ifdef LSU_DUP
+`ifndef LSU_DUP_DISABLE
+`define LSU_DUP_ENABLE
+`endif
+`ifdef LSU_DUP_ENABLE
 `define LSU_DUP_ENABLED 1
 `else
 `define LSU_DUP_ENABLED 0
@@ -381,7 +384,7 @@
 
 // Number of Cache Units
 `ifndef NUM_ICACHES
-`define NUM_ICACHES `UP(`NUM_CORES / 4)
+`define NUM_ICACHES `UP(`SOCKET_SIZE / 4)
 `endif
 
 // Cache Size
@@ -430,7 +433,7 @@
 
 // Number of Cache Units
 `ifndef NUM_DCACHES
-`define NUM_DCACHES `UP(`NUM_CORES / 4)
+`define NUM_DCACHES `UP(`SOCKET_SIZE / 4)
 `endif
 
 // Cache Size
diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh
index 95d206ce..f39e7fea 100644
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@@ -410,8 +410,22 @@
         assign dst = src; \
     end
 
-`define TO_DISPATCH_DATA(data, tid) \
-    {data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
+`define TO_DISPATCH_DATA(data, tid) { \
+    data.uuid, \
+    data.wis, \
+    data.tmask, \
+    data.op_type, \
+    data.op_mod, \
+    data.wb, \
+    data.use_PC, \
+    data.use_imm, \
+    data.PC, \
+    data.imm, \
+    data.rd, \
+    tid, \
+    data.rs1_data, \
+    data.rs2_data, \
+    data.rs3_data}
 
 ///////////////////////////////////////////////////////////////////////////////
 
diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv
index 668b53ee..b32b9600 100644
--- a/hw/rtl/VX_gpu_pkg.sv
+++ b/hw/rtl/VX_gpu_pkg.sv
@@ -141,8 +141,9 @@ package VX_gpu_pkg;
 
     /////////////////////////////// L1 Parameters /////////////////////////////
 
-    localparam L1_MEM_TAG_WIDTH     = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
-    localparam L1_MEM_ARB_TAG_WIDTH	= (L1_MEM_TAG_WIDTH + `CLOG2(2));
+    localparam ICACHE_MEM_ARB_TAG_WIDTH = (ICACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
+    localparam DCACHE_MEM_ARB_TAG_WIDTH = (DCACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
+    localparam L1_MEM_TAG_WIDTH         = `MAX(ICACHE_MEM_ARB_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
 
     /////////////////////////////// L2 Parameters /////////////////////////////
 
@@ -150,10 +151,10 @@ package VX_gpu_pkg;
     localparam L2_WORD_SIZE	        = `L1_LINE_SIZE;
 
     // Input request size
-    localparam L2_NUM_REQS	        = `NUM_SOCKETS;
+    localparam L2_NUM_REQS	        = 2;
 
     // Core request tag bits
-    localparam L2_TAG_WIDTH	        = L1_MEM_ARB_TAG_WIDTH;
+    localparam L2_TAG_WIDTH	        = L1_MEM_TAG_WIDTH;
 
     // Memory request data bits
     localparam L2_MEM_DATA_WIDTH	= (`L2_LINE_SIZE * 8);
diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv
index 139598d9..74a074d1 100644
--- a/hw/rtl/VX_socket.sv
+++ b/hw/rtl/VX_socket.sv
@@ -30,7 +30,8 @@ module VX_socket import VX_gpu_pkg::*; #(
     VX_dcr_bus_if.slave     dcr_bus_if,
 
     // Memory
-    VX_mem_bus_if.master    mem_bus_if,
+    VX_mem_bus_if.master    icache_mem_bus_if,
+    VX_mem_bus_if.master    dcache_mem_bus_if,
 
 `ifdef GBAR_ENABLE
     // Barrier
@@ -76,47 +77,7 @@ module VX_socket import VX_gpu_pkg::*; #(
     assign mem_perf_tmp_if.mem = mem_perf_if.mem;
 `endif    
 
-    VX_mem_bus_if #(
-        .DATA_SIZE (ICACHE_LINE_SIZE),
-        .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
-    ) icache_mem_bus_if();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (DCACHE_LINE_SIZE),
-        .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
-    ) dcache_mem_bus_if();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (`L1_LINE_SIZE),
-        .TAG_WIDTH (L1_MEM_TAG_WIDTH)
-    ) cache_mem_bus_if[2]();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (`L1_LINE_SIZE),
-        .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
-    ) mem_bus_tmp_if[1]();
-
-    `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
-    `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
-
-    `RESET_RELAY (mem_arb_reset, reset);
-
-    VX_mem_arb #(
-        .NUM_INPUTS   (2),
-        .DATA_SIZE    (`L1_LINE_SIZE),
-        .TAG_WIDTH    (L1_MEM_TAG_WIDTH),
-        .TAG_SEL_IDX  (1), // Skip 0 for NC flag
-        .ARBITER      ("R"),
-        .OUT_REG_REQ  (2),
-        .OUT_REG_RSP  (2)
-    ) mem_arb (
-        .clk        (clk),
-        .reset      (mem_arb_reset),
-        .bus_in_if  (cache_mem_bus_if),
-        .bus_out_if (mem_bus_tmp_if)
-    );
-
-    `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
+    
 
     ///////////////////////////////////////////////////////////////////////////
 
diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 4fb03783..a5044ccf 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -78,33 +78,25 @@
 `define VX_CSR_MPM_IBUF_ST_H            12'hB85
 `define VX_CSR_MPM_SCRB_ST              12'hB06
 `define VX_CSR_MPM_SCRB_ST_H            12'hB86
-`define VX_CSR_MPM_ALU_ST               12'hB07
-`define VX_CSR_MPM_ALU_ST_H             12'hB87
-`define VX_CSR_MPM_LSU_ST               12'hB08
-`define VX_CSR_MPM_LSU_ST_H             12'hB88
-`define VX_CSR_MPM_FPU_ST               12'hB09
-`define VX_CSR_MPM_FPU_ST_H             12'hB89
-`define VX_CSR_MPM_SFU_ST               12'hB0A
-`define VX_CSR_MPM_SFU_ST_H             12'hB8A
-`define VX_CSR_MPM_SCRB_ALU             12'hB0B
-`define VX_CSR_MPM_SCRB_ALU_H           12'hB8B
-`define VX_CSR_MPM_SCRB_FPU             12'hB0C
-`define VX_CSR_MPM_SCRB_FPU_H           12'hB8C
-`define VX_CSR_MPM_SCRB_LSU             12'hB0D
-`define VX_CSR_MPM_SCRB_LSU_H           12'hB8D
-`define VX_CSR_MPM_SCRB_SFU             12'hB0E
-`define VX_CSR_MPM_SCRB_SFU_H           12'hB8E
+`define VX_CSR_MPM_SCRB_ALU             12'hB07
+`define VX_CSR_MPM_SCRB_ALU_H           12'hB87
+`define VX_CSR_MPM_SCRB_FPU             12'hB08
+`define VX_CSR_MPM_SCRB_FPU_H           12'hB88
+`define VX_CSR_MPM_SCRB_LSU             12'hB09
+`define VX_CSR_MPM_SCRB_LSU_H           12'hB89
+`define VX_CSR_MPM_SCRB_SFU             12'hB0A
+`define VX_CSR_MPM_SCRB_SFU_H           12'hB8A
 // PERF: memory
-`define VX_CSR_MPM_IFETCHES             12'hB0F
-`define VX_CSR_MPM_IFETCHES_H           12'hB8F
-`define VX_CSR_MPM_LOADS                12'hB10
-`define VX_CSR_MPM_LOADS_H              12'hB90
-`define VX_CSR_MPM_STORES               12'hB11
-`define VX_CSR_MPM_STORES_H             12'hB91
-`define VX_CSR_MPM_IFETCH_LT            12'hB12
-`define VX_CSR_MPM_IFETCH_LT_H          12'hB92
-`define VX_CSR_MPM_LOAD_LT              12'hB13 
-`define VX_CSR_MPM_LOAD_LT_H            12'hB93
+`define VX_CSR_MPM_IFETCHES             12'hB0B
+`define VX_CSR_MPM_IFETCHES_H           12'hB8B
+`define VX_CSR_MPM_LOADS                12'hB0C
+`define VX_CSR_MPM_LOADS_H              12'hB8C
+`define VX_CSR_MPM_STORES               12'hB0D
+`define VX_CSR_MPM_STORES_H             12'hB8D
+`define VX_CSR_MPM_IFETCH_LT            12'hB0E
+`define VX_CSR_MPM_IFETCH_LT_H          12'hB8E
+`define VX_CSR_MPM_LOAD_LT              12'hB0F 
+`define VX_CSR_MPM_LOAD_LT_H            12'hB8F
 
 // Machine Performance-monitoring memory counters
 // PERF: icache
diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv
index 5aba3075..4d3ce297 100644
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@@ -273,23 +273,23 @@ module VX_core import VX_gpu_pkg::*; #(
     wire [1:0] perf_icache_pending_read_cycle;
     wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
 
-    reg  [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
-    reg  [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
+    reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
+    reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
 
-    reg  [`PERF_CTR_BITS-1:0] perf_ifetches;
-    reg  [`PERF_CTR_BITS-1:0] perf_loads;
-    reg  [`PERF_CTR_BITS-1:0] perf_stores;
+    reg [`PERF_CTR_BITS-1:0] perf_ifetches;
+    reg [`PERF_CTR_BITS-1:0] perf_loads;
+    reg [`PERF_CTR_BITS-1:0] perf_stores;
 
-    wire  perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
-    wire  perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
+    wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
+    wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
 
     wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
     wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
     wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
 
     for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
-        assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
-        assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
+        assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
+        assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
         assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
     end
 
diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv
index 6d7c41f8..1b370260 100644
--- a/hw/rtl/core/VX_csr_data.sv
+++ b/hw/rtl/core/VX_csr_data.sv
@@ -195,19 +195,6 @@ import VX_fpu_pkg::*;
                         `VX_CSR_MPM_IBUF_ST_H      : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
                         `VX_CSR_MPM_SCRB_ST        : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
                         `VX_CSR_MPM_SCRB_ST_H      : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
-                        `VX_CSR_MPM_ALU_ST         : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
-                        `VX_CSR_MPM_ALU_ST_H       : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
-                        `VX_CSR_MPM_LSU_ST         : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
-                        `VX_CSR_MPM_LSU_ST_H       : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
-                    `ifdef EXT_F_ENABLE
-                        `VX_CSR_MPM_FPU_ST         : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
-                        `VX_CSR_MPM_FPU_ST_H       : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
-                    `else
-                        `VX_CSR_MPM_FPU_ST         : read_data_ro_r = '0;
-                        `VX_CSR_MPM_FPU_ST_H       : read_data_ro_r = '0;
-                    `endif
-                        `VX_CSR_MPM_SFU_ST         : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
-                        `VX_CSR_MPM_SFU_ST_H       : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
                         `VX_CSR_MPM_SCRB_ALU       : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
                         `VX_CSR_MPM_SCRB_ALU_H     : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
                     `ifdef EXT_F_ENABLE
@@ -220,7 +207,7 @@ import VX_fpu_pkg::*;
                         `VX_CSR_MPM_SCRB_LSU       : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
                         `VX_CSR_MPM_SCRB_LSU_H     : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
                         `VX_CSR_MPM_SCRB_SFU       : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
-                        `VX_CSR_MPM_SCRB_SFU_H     : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
+                        `VX_CSR_MPM_SCRB_SFU_H     : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];                        
                         // PERF: memory
                         `VX_CSR_MPM_IFETCHES       : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
                         `VX_CSR_MPM_IFETCHES_H     : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); 
diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv
index 8d0eaff6..912abc97 100644
--- a/hw/rtl/core/VX_issue.sv
+++ b/hw/rtl/core/VX_issue.sv
@@ -84,7 +84,7 @@ module VX_issue #(
         .clk            (clk), 
         .reset          (dispatch_reset),
     `ifdef PERF_ENABLE
-        .perf_stalls    (perf_issue_if.dsp_stalls),
+        `UNUSED_PIN     (perf_stalls),
     `endif
         .operands_if    (operands_if),
         .alu_dispatch_if(alu_dispatch_if),
diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv
index 1e0a09b8..5a57db4c 100644
--- a/hw/rtl/core/VX_lsu_unit.sv
+++ b/hw/rtl/core/VX_lsu_unit.sv
@@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
     // detect duplicate addresses
 
     wire lsu_is_dup;
-`ifdef LSU_DUP
+`ifdef LSU_DUP_ENABLE
     if (NUM_LANES > 1) begin    
         wire [NUM_LANES-2:0] addr_matches;
         for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
@@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
 
     assign mem_req_tag = {
         execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
-    `ifdef LSU_DUP
+    `ifdef LSU_DUP_ENABLE
         , lsu_is_dup
     `endif
     };
@@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
     wire [PID_WIDTH-1:0] rsp_pid;
     wire rsp_is_dup;
     
-`ifndef LSU_DUP
+`ifndef LSU_DUP_ENABLE
     assign rsp_is_dup = 0;
 `endif
 
     assign {
         rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
-    `ifdef LSU_DUP
+    `ifdef LSU_DUP_ENABLE
         , rsp_is_dup
     `endif
     } = mem_rsp_tag;
diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv
index 66225336..2ae0f678 100644
--- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv
+++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv
@@ -14,18 +14,17 @@
 `include "VX_define.vh"
 
 interface VX_pipeline_perf_if ();
-    wire [`PERF_CTR_BITS-1:0]   sched_idles;
-    wire [`PERF_CTR_BITS-1:0]   sched_stalls;
-    wire [`PERF_CTR_BITS-1:0]   ibf_stalls;
-    wire [`PERF_CTR_BITS-1:0]   scb_stalls;
-    wire [`PERF_CTR_BITS-1:0]   scb_uses [`NUM_EX_UNITS];
-    wire [`PERF_CTR_BITS-1:0]   dsp_stalls [`NUM_EX_UNITS];
+    wire [`PERF_CTR_BITS-1:0] sched_idles;
+    wire [`PERF_CTR_BITS-1:0] sched_stalls;
+    wire [`PERF_CTR_BITS-1:0] ibf_stalls;
+    wire [`PERF_CTR_BITS-1:0] scb_stalls;
+    wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
 
-    wire [`PERF_CTR_BITS-1:0]   ifetches;
-    wire [`PERF_CTR_BITS-1:0]   loads;
-    wire [`PERF_CTR_BITS-1:0]   stores;    
-    wire [`PERF_CTR_BITS-1:0]   ifetch_latency;
-    wire [`PERF_CTR_BITS-1:0]   load_latency;
+    wire [`PERF_CTR_BITS-1:0] ifetches;
+    wire [`PERF_CTR_BITS-1:0] loads;
+    wire [`PERF_CTR_BITS-1:0] stores;    
+    wire [`PERF_CTR_BITS-1:0] ifetch_latency;
+    wire [`PERF_CTR_BITS-1:0] load_latency;
 
     modport schedule (
         output sched_idles,
@@ -35,8 +34,7 @@ interface VX_pipeline_perf_if ();
     modport issue (
         output ibf_stalls,
         output scb_stalls,
-        output scb_uses,
-        output dsp_stalls
+        output scb_uses
     );
 
     modport slave (
@@ -45,7 +43,6 @@ interface VX_pipeline_perf_if ();
         input ibf_stalls,
         input scb_stalls,
         input scb_uses,
-        input dsp_stalls,
         input ifetches,
         input loads,
         input stores,
diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp
index c0199a86..5f472c84 100644
--- a/runtime/common/utils.cpp
+++ b/runtime/common/utils.cpp
@@ -204,10 +204,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
   uint64_t sched_stalls = 0;
   uint64_t ibuffer_stalls = 0;
   uint64_t scrb_stalls = 0;
-  uint64_t lsu_stalls = 0;
-  uint64_t fpu_stalls = 0;
-  uint64_t alu_stalls = 0;
-  uint64_t sfu_stalls = 0;
   uint64_t scrb_alu = 0;
   uint64_t scrb_fpu = 0;
   uint64_t scrb_lsu = 0;
@@ -310,34 +306,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
           calcAvgPercent(scrb_sfu_per_core, scrb_total));
         scrb_stalls += scrb_stalls_per_core;
       }
-      // alu_stalls
-      {
-        uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
-        alu_stalls += alu_stalls_per_core;      
-      }
-      // lsu_stalls
-      {
-        uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
-        lsu_stalls += lsu_stalls_per_core;
-      }
-      // fpu_stalls
-      {
-        uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
-        fpu_stalls += fpu_stalls_per_core;      
-      }
-      // sfu_stalls
-      {
-        uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
-        sfu_stalls += sfu_stalls_per_core;
-      }
       // PERF: memory
       // ifetches
       {
-        uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
+        uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES);
         if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
         ifetches += ifetches_per_core;
 
@@ -464,10 +436,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
       calcAvgPercent(scrb_fpu, scrb_total),
       calcAvgPercent(scrb_lsu, scrb_total),
       calcAvgPercent(scrb_sfu, scrb_total));
-    fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
-    fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
-    fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
-    fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls);
     fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
     fprintf(stream, "PERF: loads=%ld\n", loads);
     fprintf(stream, "PERF: stores=%ld\n", stores);    
diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp
index 3b4cb171..b7b9cdcb 100644
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@@ -87,7 +87,7 @@ private:
 class vx_device {    
 public:
     vx_device() 
-        : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS)
+        : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
         , ram_(RAM_PAGE_SIZE)
         , processor_(arch_)
         , global_mem_(
diff --git a/sim/simx/Makefile b/sim/simx/Makefile
index 42823205..bb67dbb5 100644
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -15,7 +15,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
 
 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
-SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
+SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
 
 # Debugigng
 ifdef DEBUG
diff --git a/sim/simx/arch.h b/sim/simx/arch.h
index ab6ac4a3..099fbedd 100644
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -28,6 +28,7 @@ private:
   uint16_t num_warps_;
   uint16_t num_cores_;  
   uint16_t num_clusters_;  
+  uint16_t socket_size_;
   uint16_t vsize_;
   uint16_t num_regs_;
   uint16_t num_csrs_;
@@ -35,11 +36,12 @@ private:
   uint16_t ipdom_size_;
   
 public:
-  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)   
+  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)   
     : num_threads_(num_threads)
     , num_warps_(num_warps)
     , num_cores_(num_cores)
-    , num_clusters_(num_clusters)
+    , num_clusters_(NUM_CLUSTERS)
+    , socket_size_(SOCKET_SIZE)
     , vsize_(16)
     , num_regs_(32)
     , num_csrs_(4096)
@@ -82,6 +84,10 @@ public:
   uint16_t num_clusters() const {
     return num_clusters_;
   }
+
+  uint16_t socket_size() const {
+    return socket_size_;
+  }
 };
 
 }
\ No newline at end of file
diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp
index d7104915..7f690fb6 100644
--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@@ -24,14 +24,38 @@ Cluster::Cluster(const SimContext& ctx,
   , mem_req_port(this)
   , mem_rsp_port(this)
   , cluster_id_(cluster_id)
-  , cores_(arch.num_cores())  
+  , sockets_(NUM_SOCKETS)  
   , barriers_(arch.num_barriers(), 0)
-  , sharedmems_(arch.num_cores())
   , processor_(processor)
+  , cores_per_socket_(arch.socket_size())
 {
-  auto num_cores = arch.num_cores();
-  
   char sname[100];
+
+  auto sockets_per_cluster = sockets_.size();
+
+  // create sockets
+
+  snprintf(sname, 100, "cluster%d-icache-arb", cluster_id);
+  auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
+
+  snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id);
+  auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
+
+  for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
+    uint32_t socket_id = cluster_id * sockets_per_cluster + i;
+    auto socket = Socket::Create(socket_id, this, arch, dcrs);
+
+    socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
+    icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
+
+    socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i));
+    dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port);
+
+    sockets_.at(i) = socket;
+  }
+
+  // Create l2cache
+  
   snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
   l2cache_ = CacheSim::Create(sname, CacheSim::Config{
     !L2_ENABLED,
@@ -42,7 +66,7 @@ Cluster::Cluster(const SimContext& ctx,
     log2ceil(L2_NUM_BANKS), // B
     XLEN,                   // address bits  
     1,                      // number of ports
-    5,                      // request size 
+    2,                      // request size 
     true,                   // write-through
     false,                  // write response
     L2_MSHR_SIZE,           // mshr
@@ -52,87 +76,11 @@ Cluster::Cluster(const SimContext& ctx,
   l2cache_->MemReqPort.bind(&this->mem_req_port);
   this->mem_rsp_port.bind(&l2cache_->MemRspPort);
 
-  snprintf(sname, 100, "cluster%d-icaches", cluster_id);
-  icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
-    !ICACHE_ENABLED,
-    log2ceil(ICACHE_SIZE),  // C
-    log2ceil(L1_LINE_SIZE), // L
-    log2ceil(sizeof(uint32_t)), // W
-    log2ceil(ICACHE_NUM_WAYS),// A
-    1,                      // B
-    XLEN,                   // address bits
-    1,                      // number of ports
-    1,                      // number of inputs
-    true,                   // write-through
-    false,                  // write response
-    (uint8_t)arch.num_warps(), // mshr
-    2,                      // pipeline latency
-  });
+  icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0));
+  l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0));
 
-  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
-  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
-
-  snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
-  dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
-    !DCACHE_ENABLED,
-    log2ceil(DCACHE_SIZE),  // C
-    log2ceil(L1_LINE_SIZE), // L
-    log2ceil(sizeof(Word)), // W
-    log2ceil(DCACHE_NUM_WAYS),// A
-    log2ceil(DCACHE_NUM_BANKS), // B
-    XLEN,                   // address bits
-    1,                      // number of ports
-    DCACHE_NUM_BANKS,       // number of inputs
-    true,                   // write-through
-    false,                  // write response
-    DCACHE_MSHR_SIZE,       // mshr
-    4,                      // pipeline latency
-  });
-
-  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
-  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
-
-  ///////////////////////////////////////////////////////////////////////////
-
-  // create shared memory blocks
-  for (uint32_t i = 0; i < num_cores; ++i) {
-    snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
-    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
-      (1 << SMEM_LOG_SIZE),
-      sizeof(Word),
-      NUM_LSU_LANES, 
-      NUM_LSU_LANES,
-      false
-    });
-  }
-
-  // create cores
-
-  for (uint32_t i = 0; i < num_cores; ++i) {  
-    uint32_t core_id = cluster_id * num_cores + i;
-    cores_.at(i) = Core::Create(core_id, 
-                                this, 
-                                arch, 
-                                dcrs, 
-                                sharedmems_.at(i));
-
-    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
-    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));      
-
-    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
-      snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
-      auto smem_demux = SMemDemux::Create(sname);
-      
-      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
-      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));        
-      
-      smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j));
-      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC);
-
-      smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j));
-      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM);
-    }
-  }
+  dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1));
+  l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0));
 }
 
 Cluster::~Cluster() {
@@ -150,14 +98,14 @@ void Cluster::tick() {
 }
 
 void Cluster::attach_ram(RAM* ram) {
-  for (auto core : cores_) {
-    core->attach_ram(ram);
+  for (auto& socket : sockets_) {
+    socket->attach_ram(ram);
   }
 }
 
 bool Cluster::running() const {
-  for (auto& core : cores_) {
-    if (core->running())
+  for (auto& socket : sockets_) {
+    if (socket->running())
       return true;
   }
   return false;
@@ -166,9 +114,9 @@ bool Cluster::running() const {
 bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
   bool done = true;
   Word exitcode_ = 0;
-  for (auto& core : cores_) {
+  for (auto& socket : sockets_) {
     Word ec;
-    if (core->check_exit(&ec, riscv_test)) {
+    if (socket->check_exit(&ec, riscv_test)) {
       exitcode_ |= ec;
     } else {
       done = false;
@@ -181,36 +129,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
 void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
   auto& barrier = barriers_.at(bar_id);
 
-  uint32_t local_core_id = core_id % cores_.size();
+  auto sockets_per_cluster = sockets_.size();
+  auto cores_per_socket = cores_per_socket_;
+
+  uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket;
+  uint32_t local_core_id = core_id % cores_per_cluster;
   barrier.set(local_core_id);
 
   DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
 
   if (barrier.count() == (size_t)count) {
       // resume all suspended cores
-      for (uint32_t i = 0; i < cores_.size(); ++i) {
-        if (barrier.test(i)) {
-          DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
-          cores_.at(i)->resume();
+      for (uint32_t s = 0; s < sockets_per_cluster; ++s) {
+        for (uint32_t c = 0; c < cores_per_socket; ++c) {
+          uint32_t i = s * cores_per_socket + c;
+          if (barrier.test(i)) {
+            DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
+            sockets_.at(s)->resume(c);
+          }
         }
       }
       barrier.reset();
     }
 }
 
-ProcessorImpl* Cluster::processor() const {
-  return processor_;
-}
-
 Cluster::PerfStats Cluster::perf_stats() const {
   Cluster::PerfStats perf;
-  perf.icache = icaches_->perf_stats();
-  perf.dcache = dcaches_->perf_stats();
   perf.l2cache = l2cache_->perf_stats();
-
-  for (auto sharedmem : sharedmems_) {
-    perf.sharedmem += sharedmem->perf_stats();
-  }
-  
   return perf;
 }
\ No newline at end of file
diff --git a/sim/simx/cluster.h b/sim/simx/cluster.h
index f91241e9..2547486d 100644
--- a/sim/simx/cluster.h
+++ b/sim/simx/cluster.h
@@ -17,8 +17,8 @@
 #include "dcrs.h"
 #include "arch.h"
 #include "cache_cluster.h"
-#include "shared_mem.h"
 #include "core.h"
+#include "socket.h"
 #include "constants.h"
 
 namespace vortex {
@@ -27,17 +27,11 @@ class ProcessorImpl;
 
 class Cluster : public SimObject<Cluster> {
 public:
-  struct PerfStats {
-    CacheSim::PerfStats   icache;
-    CacheSim::PerfStats   dcache;
-    SharedMem::PerfStats  sharedmem;
-    CacheSim::PerfStats   l2cache;
+  struct PerfStats {    
+    CacheSim::PerfStats l2cache;
 
     PerfStats& operator+=(const PerfStats& rhs) {
-      this->icache      += rhs.icache;
-      this->dcache      += rhs.dcache;
-      this->sharedmem   += rhs.sharedmem;
-      this->l2cache     += rhs.l2cache;
+      this->l2cache += rhs.l2cache;
       return *this;
     }
   };
@@ -53,6 +47,14 @@ public:
 
   ~Cluster();
 
+  uint32_t id() const {
+    return cluster_id_;
+  }
+
+  ProcessorImpl* processor() const {
+    return processor_;
+  }
+
   void reset();
 
   void tick();
@@ -65,22 +67,15 @@ public:
 
   void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
 
-  ProcessorImpl* processor() const;
-
   Cluster::PerfStats perf_stats() const;
   
 private:
-  uint32_t                     cluster_id_;  
-  std::vector<Core::Ptr>       cores_;  
-  std::vector<CoreMask>        barriers_;
-  CacheSim::Ptr                l2cache_;
-  CacheCluster::Ptr            icaches_;
-  CacheCluster::Ptr            dcaches_;
-  std::vector<SharedMem::Ptr>  sharedmems_;
-  CacheCluster::Ptr            tcaches_;
-  CacheCluster::Ptr            ocaches_;
-  CacheCluster::Ptr            rcaches_;
-  ProcessorImpl*               processor_;
+  uint32_t                  cluster_id_;  
+  std::vector<Socket::Ptr>  sockets_;  
+  std::vector<CoreMask>     barriers_;
+  CacheSim::Ptr             l2cache_;
+  ProcessorImpl*            processor_;
+  uint32_t                  cores_per_socket_;
 };
 
 } // namespace vortex
\ No newline at end of file
diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp
index 49c2ec35..7a549ebd 100644
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@@ -21,18 +21,14 @@
 #include "mem.h"
 #include "decode.h"
 #include "core.h"
+#include "socket.h"
 #include "debug.h"
 #include "constants.h"
 #include "processor_impl.h"
 
 using namespace vortex;
 
-Core::Core(const SimContext& ctx, 
-           uint32_t core_id, 
-           Cluster* cluster,
-           const Arch &arch, 
-           const DCRS &dcrs,
-           SharedMem::Ptr  sharedmem)
+Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs)
     : SimObject(ctx, "core")
     , icache_req_ports(1, this)
     , icache_rsp_ports(1, this)
@@ -50,12 +46,12 @@ Core::Core(const SimContext& ctx,
     , operands_(ISSUE_WIDTH)
     , dispatchers_((uint32_t)ExeType::ExeTypeCount)
     , exe_units_((uint32_t)ExeType::ExeTypeCount)
-    , sharedmem_(sharedmem)
+    , smem_demuxs_(NUM_LSU_LANES)
     , fetch_latch_("fetch")
     , decode_latch_("decode")
     , pending_icache_(arch_.num_warps())
     , csrs_(arch.num_warps())
-    , cluster_(cluster)
+    , socket_(socket)
     , commit_arbs_(ISSUE_WIDTH)
 {
   char sname[100];
@@ -72,6 +68,27 @@ Core::Core(const SimContext& ctx,
     operands_.at(i) = SimPlatform::instance().create_object<Operand>();
   }
 
+  // initialize shared memory (name it explicitly before creation)
+  snprintf(sname, 100, "core%d-shared_mem", core_id);
+  shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
+    (1 << SMEM_LOG_SIZE),
+    sizeof(Word),
+    NUM_LSU_LANES, NUM_LSU_LANES,
+    false
+  });
+  // one SMemDemux per LSU lane, bound to the core's dcache ports and to shared memory
+  for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
+    snprintf(sname, 100, "smem_demux%d_%d", core_id, i);
+    auto smem_demux = SMemDemux::Create(sname);
+    // dcache side of the demux <-> this core's external dcache ports
+    smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
+    dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
+    // shared-memory side of the demux <-> lane i of shared_mem_
+    smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
+    shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
+    smem_demuxs_.at(i) = smem_demux;
+  }
+
   // initialize dispatchers
   dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
   dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
@@ -241,13 +258,6 @@ void Core::decode() {
     stalled_warps_.reset(trace->wid);
   }
 
-  // update perf counters
-  uint32_t active_threads = trace->tmask.count();
-  if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
-    perf_stats_.loads += active_threads;
-  if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE) 
-    perf_stats_.stores += active_threads;
-
   DT(3, "pipeline-decode: " << *trace);
 
   // insert to ibuffer 
@@ -394,7 +404,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
   if (is_global) {
     // global barrier handling
     if (barrier.count() == active_warps_.count()) {
-      cluster_->barrier(bar_idx, count, core_id_);
+      socket_->barrier(bar_idx, count, core_id_);
       barrier.reset();
     }    
   } else {
@@ -431,7 +441,7 @@ AddrType Core::get_addr_type(uint64_t addr) {
 void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {  
   auto type = this->get_addr_type(addr);
   if (type == AddrType::Shared) {
-    sharedmem_->read(data, addr, size);
+    shared_mem_->read(data, addr, size);
   } else {  
     mmu_.read(data, addr, size, 0);
   }
@@ -446,7 +456,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
      this->writeToStdOut(data, addr, size);
   } else {
     if (type == AddrType::Shared) {
-      sharedmem_->write(data, addr, size);
+      shared_mem_->write(data, addr, size);
     } else {
       mmu_.write(data, addr, size, 0);
     }
@@ -554,16 +564,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
         case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
         case VX_CSR_MPM_IBUF_ST:   return perf_stats_.ibuf_stalls & 0xffffffff; 
         case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; 
-        case VX_CSR_MPM_SCRB_ST:   return perf_stats_.scrb_stalls & 0xffffffff; 
-        case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32; 
-        case VX_CSR_MPM_ALU_ST:    return perf_stats_.alu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_ALU_ST_H:  return perf_stats_.alu_stalls >> 32; 
-        case VX_CSR_MPM_LSU_ST:    return perf_stats_.lsu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_LSU_ST_H:  return perf_stats_.lsu_stalls >> 32;
-        case VX_CSR_MPM_FPU_ST:    return perf_stats_.fpu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_FPU_ST_H:  return perf_stats_.fpu_stalls >> 32; 
-        case VX_CSR_MPM_SFU_ST:    return perf_stats_.sfu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_SFU_ST_H:  return perf_stats_.sfu_stalls >> 32; 
+        case VX_CSR_MPM_SCRB_ST:   return perf_stats_.scrb_stalls & 0xffffffff;
+        case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
         case VX_CSR_MPM_SCRB_ALU:  return perf_stats_.scrb_alu & 0xffffffff;
         case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
         case VX_CSR_MPM_SCRB_FPU:  return perf_stats_.scrb_fpu & 0xffffffff;
@@ -572,7 +574,6 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
         case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
         case VX_CSR_MPM_SCRB_SFU:  return perf_stats_.scrb_sfu & 0xffffffff;
         case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
-        
         case VX_CSR_MPM_IFETCHES:  return perf_stats_.ifetches & 0xffffffff; 
         case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; 
         case VX_CSR_MPM_LOADS:     return perf_stats_.loads & 0xffffffff; 
@@ -586,27 +587,29 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
        }
       } break; 
       case VX_DCR_MPM_CLASS_MEM: {
-        auto proc_perf = cluster_->processor()->perf_stats();
+        auto proc_perf = socket_->cluster()->processor()->perf_stats();
+        auto socket_perf = socket_->perf_stats();
+        auto smem_perf = shared_mem_->perf_stats();
         switch (addr) {
-        case VX_CSR_MPM_ICACHE_READS:     return proc_perf.clusters.icache.reads & 0xffffffff; 
-        case VX_CSR_MPM_ICACHE_READS_H:   return proc_perf.clusters.icache.reads >> 32; 
-        case VX_CSR_MPM_ICACHE_MISS_R:    return proc_perf.clusters.icache.read_misses & 0xffffffff;
-        case VX_CSR_MPM_ICACHE_MISS_R_H:  return proc_perf.clusters.icache.read_misses >> 32;
-        case VX_CSR_MPM_ICACHE_MSHR_ST:   return proc_perf.clusters.icache.mshr_stalls & 0xffffffff; 
-        case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32;
+        case VX_CSR_MPM_ICACHE_READS:     return socket_perf.icache.reads & 0xffffffff; 
+        case VX_CSR_MPM_ICACHE_READS_H:   return socket_perf.icache.reads >> 32; 
+        case VX_CSR_MPM_ICACHE_MISS_R:    return socket_perf.icache.read_misses & 0xffffffff;
+        case VX_CSR_MPM_ICACHE_MISS_R_H:  return socket_perf.icache.read_misses >> 32;
+        case VX_CSR_MPM_ICACHE_MSHR_ST:   return socket_perf.icache.mshr_stalls & 0xffffffff; 
+        case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
         
-        case VX_CSR_MPM_DCACHE_READS:     return proc_perf.clusters.dcache.reads & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_READS_H:   return proc_perf.clusters.dcache.reads >> 32; 
-        case VX_CSR_MPM_DCACHE_WRITES:    return proc_perf.clusters.dcache.writes & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_WRITES_H:  return proc_perf.clusters.dcache.writes >> 32; 
-        case VX_CSR_MPM_DCACHE_MISS_R:    return proc_perf.clusters.dcache.read_misses & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_MISS_R_H:  return proc_perf.clusters.dcache.read_misses >> 32; 
-        case VX_CSR_MPM_DCACHE_MISS_W:    return proc_perf.clusters.dcache.write_misses & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_MISS_W_H:  return proc_perf.clusters.dcache.write_misses >> 32; 
-        case VX_CSR_MPM_DCACHE_BANK_ST:   return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32;
-        case VX_CSR_MPM_DCACHE_MSHR_ST:   return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32;
+        case VX_CSR_MPM_DCACHE_READS:     return socket_perf.dcache.reads & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_READS_H:   return socket_perf.dcache.reads >> 32; 
+        case VX_CSR_MPM_DCACHE_WRITES:    return socket_perf.dcache.writes & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_WRITES_H:  return socket_perf.dcache.writes >> 32; 
+        case VX_CSR_MPM_DCACHE_MISS_R:    return socket_perf.dcache.read_misses & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_MISS_R_H:  return socket_perf.dcache.read_misses >> 32; 
+        case VX_CSR_MPM_DCACHE_MISS_W:    return socket_perf.dcache.write_misses & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_MISS_W_H:  return socket_perf.dcache.write_misses >> 32; 
+        case VX_CSR_MPM_DCACHE_BANK_ST:   return socket_perf.dcache.bank_stalls & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
+        case VX_CSR_MPM_DCACHE_MSHR_ST:   return socket_perf.dcache.mshr_stalls & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
 
         case VX_CSR_MPM_L2CACHE_READS:    return proc_perf.clusters.l2cache.reads & 0xffffffff; 
         case VX_CSR_MPM_L2CACHE_READS_H:  return proc_perf.clusters.l2cache.reads >> 32; 
@@ -641,12 +644,12 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
         case VX_CSR_MPM_MEM_LT:           return proc_perf.mem_latency & 0xffffffff; 
         case VX_CSR_MPM_MEM_LT_H :        return proc_perf.mem_latency >> 32;
          
-        case VX_CSR_MPM_SMEM_READS:       return proc_perf.clusters.sharedmem.reads & 0xffffffff;
-        case VX_CSR_MPM_SMEM_READS_H:     return proc_perf.clusters.sharedmem.reads >> 32;
-        case VX_CSR_MPM_SMEM_WRITES:      return proc_perf.clusters.sharedmem.writes & 0xffffffff;
-        case VX_CSR_MPM_SMEM_WRITES_H:    return proc_perf.clusters.sharedmem.writes >> 32;
-        case VX_CSR_MPM_SMEM_BANK_ST:     return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; 
-        case VX_CSR_MPM_SMEM_BANK_ST_H:   return proc_perf.clusters.sharedmem.bank_stalls >> 32; 
+        case VX_CSR_MPM_SMEM_READS:       return smem_perf.reads & 0xffffffff;
+        case VX_CSR_MPM_SMEM_READS_H:     return smem_perf.reads >> 32;
+        case VX_CSR_MPM_SMEM_WRITES:      return smem_perf.writes & 0xffffffff;
+        case VX_CSR_MPM_SMEM_WRITES_H:    return smem_perf.writes >> 32;
+        case VX_CSR_MPM_SMEM_BANK_ST:     return smem_perf.bank_stalls & 0xffffffff; 
+        case VX_CSR_MPM_SMEM_BANK_ST_H:   return smem_perf.bank_stalls >> 32; 
         }
       } break;
       }
diff --git a/sim/simx/core.h b/sim/simx/core.h
index cef60e81..343fdb31 100644
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -40,7 +40,7 @@
 
 namespace vortex {
 
-class Cluster;
+class Socket;
 
 using TraceSwitch = Mux<pipeline_trace_t*>;
 
@@ -53,10 +53,6 @@ public:
     uint64_t sched_stalls;
     uint64_t ibuf_stalls;
     uint64_t scrb_stalls;
-    uint64_t alu_stalls;
-    uint64_t lsu_stalls;
-    uint64_t fpu_stalls;
-    uint64_t sfu_stalls;
     uint64_t scrb_alu;
     uint64_t scrb_fpu;
     uint64_t scrb_lsu;
@@ -74,10 +70,6 @@ public:
       , sched_stalls(0)
       , ibuf_stalls(0)
       , scrb_stalls(0)
-      , alu_stalls(0)
-      , lsu_stalls(0)
-      , fpu_stalls(0)
-      , sfu_stalls(0)
       , scrb_alu(0)
       , scrb_fpu(0)
       , scrb_lsu(0)
@@ -96,12 +88,7 @@ public:
   std::vector<SimPort<MemReq>> dcache_req_ports;
   std::vector<SimPort<MemRsp>> dcache_rsp_ports;
 
-  Core(const SimContext& ctx, 
-       uint32_t core_id, 
-       Cluster* cluster,
-       const Arch &arch, 
-       const DCRS &dcrs,
-       SharedMem::Ptr  sharedmem);
+  Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs);
 
   ~Core();
 
@@ -119,6 +106,10 @@ public:
     return core_id_;
   }
 
+  Socket* socket() const {
+    return socket_;
+  }
+
   const Arch& arch() const {
     return arch_;
   }
@@ -181,7 +172,8 @@ private:
   std::vector<Operand::Ptr> operands_;
   std::vector<Dispatcher::Ptr> dispatchers_;
   std::vector<ExeUnit::Ptr> exe_units_;
-  SharedMem::Ptr sharedmem_;
+  SharedMem::Ptr shared_mem_;
+  std::vector<SMemDemux::Ptr> smem_demuxs_;
 
   PipelineLatch fetch_latch_;
   PipelineLatch decode_latch_;
@@ -201,7 +193,7 @@ private:
   
   PerfStats perf_stats_;
   
-  Cluster* cluster_;
+  Socket* socket_;
 
   std::vector<TraceSwitch::Ptr> commit_arbs_;
 
diff --git a/sim/simx/exe_unit.cpp b/sim/simx/exe_unit.cpp
index 2f3e79e3..4b5cb356 100644
--- a/sim/simx/exe_unit.cpp
+++ b/sim/simx/exe_unit.cpp
@@ -51,8 +51,7 @@ void AluUnit::tick() {
             assert(core_->stalled_warps_.test(trace->wid));
             core_->stalled_warps_.reset(trace->wid);
         }
-        auto time = input.pop();
-        core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
+        input.pop();
     }
 }
 
@@ -87,8 +86,7 @@ void FpuUnit::tick() {
             std::abort();
         }    
         DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
-        auto time = input.pop();
-        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
+        input.pop();
     }
 }
 
@@ -114,7 +112,7 @@ void LsuUnit::tick() {
 
     // handle dcache response    
     for (uint32_t t = 0; t < num_lanes_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
+        auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn;
         if (dcache_rsp_port.empty())
             continue;
         auto& mem_rsp = dcache_rsp_port.front();
@@ -136,7 +134,7 @@ void LsuUnit::tick() {
 
     // handle shared memory response
     for (uint32_t t = 0; t < num_lanes_; ++t) {
-        auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
+        auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
         if (smem_rsp_port.empty())
             continue;
         auto& mem_rsp = smem_rsp_port.front();
@@ -184,8 +182,7 @@ void LsuUnit::tick() {
             fence_lock_ = true;        
             DT(3, "fence-lock: " << *trace);
             // remove input
-            auto time = input.pop(); 
-            core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+            input.pop(); 
             break;
         }
 
@@ -213,7 +210,9 @@ void LsuUnit::tick() {
                 auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
                 matches += (addr0 == mem_addr);
             }
+        #ifdef LSU_DUP_ENABLE
             is_dup = (matches == trace->tmask.count());
+        #endif
         }
 
         uint32_t addr_count;
@@ -229,7 +228,7 @@ void LsuUnit::tick() {
             if (!trace->tmask.test(t0 + t))
                 continue;
             
-            auto& dcache_req_port = core_->dcache_req_ports.at(t);
+            auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
             auto mem_addr = trace_data->mem_addrs.at(t);
             auto type = core_->get_addr_type(mem_addr.addr);
 
@@ -241,12 +240,16 @@ void LsuUnit::tick() {
             mem_req.cid   = trace->cid;
             mem_req.uuid  = trace->uuid;        
                 
-            dcache_req_port.send(mem_req, 2);
+            dcache_req_port.send(mem_req, 1);
             DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag 
                 << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
 
-            ++pending_loads_;
-            ++core_->perf_stats_.loads;        
+            if (is_write) {
+                ++core_->perf_stats_.stores;
+            } else {                
+                ++core_->perf_stats_.loads;
+                ++pending_loads_;
+            }
             if (is_dup)
                 break;
         }
@@ -254,13 +257,11 @@ void LsuUnit::tick() {
         // do not wait on writes
         if (is_write) {
             pending_rd_reqs_.release(tag);
-            output.send(trace, 1);
-            ++core_->perf_stats_.stores;
+            output.send(trace, 1);            
         }
 
         // remove input
-        auto time = input.pop();
-        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+        input.pop();
 
         break; // single block
     }
@@ -318,10 +319,7 @@ void SfuUnit::tick() {
             core_->stalled_warps_.reset(trace->wid);
         }
 
-        auto time = input.pop();
-        auto stalls = (SimPlatform::instance().cycles() - time);
-
-        core_->perf_stats_.sfu_stalls += stalls;
+        input.pop();
 
         break; // single block
     }
diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp
index 22d9c880..64031bb8 100644
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -34,14 +34,13 @@ static void show_usage() {
 uint32_t num_threads = NUM_THREADS;
 uint32_t num_warps = NUM_WARPS;
 uint32_t num_cores = NUM_CORES;
-uint32_t num_clusters = NUM_CLUSTERS;
 bool showStats = false;;
 bool riscv_test = false;
 const char* program = nullptr;
 
 static void parse_args(int argc, char **argv) {
   	int c;
-  	while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
+  	while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) {
     	switch (c) {
       case 't':
         num_threads = atoi(optarg);
@@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) {
         break;
 		  case 'c':
         num_cores = atoi(optarg);
-        break;
-		  case 'g':
-        num_clusters = atoi(optarg);
         break;
       case 'r':
         riscv_test = true;
@@ -88,7 +84,7 @@ int main(int argc, char **argv) {
 
   {
     // create processor configuation
-    Arch arch(num_threads, num_warps, num_cores, num_clusters);
+    Arch arch(num_threads, num_warps, num_cores);
 
     // create memory module
     RAM ram(RAM_PAGE_SIZE);
diff --git a/sim/simx/socket.cpp b/sim/simx/socket.cpp
new file mode 100644
index 00000000..fb620d62
--- /dev/null
+++ b/sim/simx/socket.cpp
@@ -0,0 +1,146 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "socket.h"
+#include "cluster.h"
+
+using namespace vortex;
+
+Socket::Socket(const SimContext& ctx,
+               uint32_t socket_id,
+               Cluster* cluster,
+               const Arch &arch,
+               const DCRS &dcrs)
+  : SimObject(ctx, "socket")
+  , icache_mem_req_port(this)
+  , icache_mem_rsp_port(this)
+  , dcache_mem_req_port(this)
+  , dcache_mem_rsp_port(this)
+  , socket_id_(socket_id)
+  , cores_(arch.socket_size())
+  , cluster_(cluster)
+{
+  auto cores_per_socket = cores_.size();
+  // scratch buffer for the names handed to the sim objects created below
+  char sname[100];
+  snprintf(sname, sizeof(sname), "socket%d-icaches", socket_id);
+  icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
+    !ICACHE_ENABLED,
+    log2ceil(ICACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // L
+    log2ceil(sizeof(uint32_t)), // W
+    log2ceil(ICACHE_NUM_WAYS),// A
+    1,                      // B
+    XLEN,                   // address bits
+    1,                      // number of ports
+    1,                      // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    (uint8_t)arch.num_warps(), // mshr
+    2,                      // pipeline latency
+  });
+  // expose the icache cluster's memory side through the socket ports
+  icaches_->MemReqPort.bind(&icache_mem_req_port);
+  icache_mem_rsp_port.bind(&icaches_->MemRspPort);
+
+  snprintf(sname, sizeof(sname), "socket%d-dcaches", socket_id);
+  dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
+    !DCACHE_ENABLED,
+    log2ceil(DCACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // L
+    log2ceil(sizeof(Word)), // W
+    log2ceil(DCACHE_NUM_WAYS),// A
+    log2ceil(DCACHE_NUM_BANKS), // B
+    XLEN,                   // address bits
+    1,                      // number of ports
+    DCACHE_NUM_BANKS,       // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    DCACHE_MSHR_SIZE,       // mshr
+    2,                      // pipeline latency
+  });
+  // expose the dcache cluster's memory side through the socket ports
+  dcaches_->MemReqPort.bind(&dcache_mem_req_port);
+  dcache_mem_rsp_port.bind(&dcaches_->MemRspPort);
+
+  // create cores and bind each one to its slot in the cache clusters
+  // (one icache port and NUM_LSU_LANES dcache ports per core)
+  for (uint32_t i = 0; i < cores_per_socket; ++i) {
+    uint32_t core_id = socket_id * cores_per_socket + i;
+    cores_.at(i) = Core::Create(core_id, this, arch, dcrs);
+
+    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
+    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
+
+    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
+      cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
+      dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
+    }
+  }
+}
+
+Socket::~Socket() {
+  // no explicit teardown is performed here
+}
+
+void Socket::reset() {  
+  // no-op: this method does not modify any socket state
+}
+
+void Socket::tick() {
+  // no-op: this method performs no per-tick work
+}
+
+void Socket::attach_ram(RAM* ram) {
+  for (auto& core : cores_) { // iterate by reference: avoids copying each Core::Ptr handle
+    core->attach_ram(ram);    // hand the same RAM pointer to every core of the socket
+  }
+}
+
+bool Socket::running() const {
+  bool active = false; // becomes true at the first core that reports running()
+  for (size_t i = 0; !active && i < cores_.size(); ++i) {
+    active = cores_.at(i)->running();
+  }
+  return active;
+}
+
+bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
+  // OR together the exit codes of the cores whose check_exit() succeeded;
+  // every core is queried, and the result is true only if all succeeded.
+  Word merged = 0;
+  bool all_exited = true;
+  for (auto& core : cores_) {
+    Word code;
+    const bool exited = core->check_exit(&code, riscv_test);
+    if (exited) merged |= code;
+    all_exited = all_exited && exited;
+  }
+  *exitcode = merged;
+  return all_exited;
+}
+
+void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
+  cluster_->barrier(bar_id, count, core_id); // core_id already carries the socket offset (socket_id * cores_per_socket + i, see constructor); adding it again would alias cores of different sockets
+}
+
+void Socket::resume(uint32_t core_index) {
+  cores_.at(core_index)->resume(); // core_index indexes cores_ (position within this socket); at() is bounds-checked
+}
+
+Socket::PerfStats Socket::perf_stats() const {
+  // Aggregate-initialize in member order (icache, dcache); braced
+  // initializers are evaluated left to right, so icaches_ is queried first.
+  return Socket::PerfStats{icaches_->perf_stats(),
+                           dcaches_->perf_stats()};
+}
\ No newline at end of file
diff --git a/sim/simx/socket.h b/sim/simx/socket.h
new file mode 100644
index 00000000..5c94c31f
--- /dev/null
+++ b/sim/simx/socket.h
@@ -0,0 +1,87 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "dcrs.h"
+#include "arch.h"
+#include "cache_cluster.h"
+#include "shared_mem.h"
+#include "core.h"
+#include "constants.h"
+
+namespace vortex {
+
+class Cluster;
+
+class Socket : public SimObject<Socket> {
+public:
+  struct PerfStats {
+    CacheSim::PerfStats icache;
+    CacheSim::PerfStats dcache;
+    // member-wise accumulation of another PerfStats
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->icache += rhs.icache;
+      this->dcache += rhs.dcache;
+      return *this;
+    }
+  };
+  // memory-side ports of the instruction cache cluster
+  SimPort<MemReq> icache_mem_req_port;
+  SimPort<MemRsp> icache_mem_rsp_port;
+  // memory-side ports of the data cache cluster
+  SimPort<MemReq> dcache_mem_req_port;
+  SimPort<MemRsp> dcache_mem_rsp_port;
+
+  Socket(const SimContext& ctx,
+         uint32_t socket_id,
+         Cluster* cluster,
+         const Arch &arch,
+         const DCRS &dcrs);
+
+  ~Socket();
+  // identifier passed at construction
+  uint32_t id() const {
+    return socket_id_;
+  }
+  // parent cluster passed at construction
+  Cluster* cluster() const {
+    return cluster_;
+  }
+
+  void reset();
+
+  void tick();
+  // attaches `ram` to every core of the socket
+  void attach_ram(RAM* ram);
+  // true while at least one core is running
+  bool running() const;
+  // ORs the exited cores' codes into *exitcode; true only if all cores exited
+  bool check_exit(Word* exitcode, bool riscv_test) const;
+  // forwards a barrier arrival to the parent cluster
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
+  // resumes the core at position core_index within this socket
+  void resume(uint32_t core_index);
+  // counters of the icache and dcache clusters
+  Socket::PerfStats perf_stats() const;
+
+private:
+  uint32_t                socket_id_;
+  std::vector<Core::Ptr>  cores_;
+  CacheCluster::Ptr       icaches_;
+  CacheCluster::Ptr       dcaches_;
+  Cluster*                cluster_;
+};
+
+} // namespace vortex
\ No newline at end of file
diff --git a/sim/simx/types.h b/sim/simx/types.h
index 6bba7f9c..d3fcfa1a 100644
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
   case RegType::Integer: os << "x"; break;  
   case RegType::Float:   os << "f"; break;
   case RegType::Vector:  os << "v"; break;
+  default: assert(false);
   }
   return os;
 }
@@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
   case AluType::SYSCALL: os << "SYSCALL"; break;
   case AluType::IMUL:    os << "IMUL"; break;
   case AluType::IDIV:    os << "IDIV"; break;
+  default: assert(false);
   }
   return os;
 }
@@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
   case LsuType::LOAD:  os << "LOAD"; break;
   case LsuType::STORE: os << "STORE"; break;
   case LsuType::FENCE: os << "FENCE"; break;
+  default: assert(false);
   }
   return os;
 }
@@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
   case AddrType::Global: os << "Global"; break;
   case AddrType::Shared: os << "Shared"; break;
   case AddrType::IO:     os << "IO"; break;
+  default: assert(false);
   }
   return os;
 }
@@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
   case FpuType::FDIV:  os << "FDIV"; break;
   case FpuType::FSQRT: os << "FSQRT"; break;
   case FpuType::FCVT:  os << "FCVT"; break;
+  default: assert(false);
   }
   return os;
 }
@@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
   case SfuType::CSRRS:  os << "CSRRS"; break;
   case SfuType::CSRRC:  os << "CSRRC"; break;
   case SfuType::CMOV:   os << "CMOV"; break;
+  default: assert(false);
   }
   return os;
 }
@@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
   switch (type) {
   case ArbiterType::Priority:   os << "Priority"; break;
   case ArbiterType::RoundRobin: os << "RoundRobin"; break;
+  default: assert(false);
   }
   return os;
 }