diff --git a/hw/VX_config.h b/hw/VX_config.h
index c89f5b4e..f809ab82 100644
--- a/hw/VX_config.h
+++ b/hw/VX_config.h
@@ -84,7 +84,7 @@
 #endif
 
 #ifndef NUM_CORES
-#define NUM_CORES 2
+#define NUM_CORES 4
 #endif
 
 #ifndef NUM_WARPS
@@ -96,7 +96,7 @@
 #endif
 
 #ifndef NUM_BARRIERS
-#define NUM_BARRIERS 4
+#define NUM_BARRIERS 8
 #endif
 
 #ifndef SOCKET_SIZE
diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 8905bd3d..69594848 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -83,7 +83,7 @@
 `endif
 
 `ifndef NUM_CORES
-`define NUM_CORES 2
+`define NUM_CORES 4
 `endif
 
 `ifndef NUM_WARPS
@@ -250,7 +250,7 @@
 `define NUM_ALU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_ALU_BLOCKS
-`define NUM_ALU_BLOCKS  4
+`define NUM_ALU_BLOCKS  2
 `endif
 
 // Number of FPU units
@@ -258,7 +258,7 @@
 `define NUM_FPU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_FPU_BLOCKS
-`define NUM_FPU_BLOCKS  2
+`define NUM_FPU_BLOCKS  1
 `endif
 
 // Number of LSU units
diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv
index 626a3cb1..1ac02d60 100644
--- a/hw/rtl/VX_core_wrapper.sv
+++ b/hw/rtl/VX_core_wrapper.sv
@@ -495,20 +495,30 @@ module Vortex import VX_gpu_pkg::*; #(
     //     .busy(busy)
     // );
 
+    logic [31:0] finish_counter;
+
+    always @(posedge clock) begin
+      if (reset) begin
+        finish_counter <= 32'd0;
+      end else begin
+        if (finished) begin
+          finish_counter <= finish_counter + 32'd1;
+        end
+      end
+    end
+
+    // give slack for other cores to finish: require more than 10000 cycles with finished asserted before assuming all cores are done
+    wire all_cores_finished = (finish_counter > 32'd10000);
+
+`ifdef SIMULATION
     always @(posedge clock) begin
         if (!reset) begin
+            if ((CORE_ID == '0) && all_cores_finished) begin
+                $display("simulation has probably ended. exiting");
+                $finish();
+            end
             if (finished) begin
-                `ifdef SIMULATION
-                    $display("---------------- core%2d has no more active warps ----------------", CORE_ID);
-                    $display("simulation has ended. exiting");
-                    $finish();
-                `endif
-                // `ifdef SIMULATION
-                // if ($time >= 60000) begin
-                //   $display("simulation has probably ended. exiting");
-                //   @(posedge clock) $finish();
-                // end
-                // `endif
+                $display("---------------- core%2d has no more active warps ----------------", CORE_ID);
                 // TODO: lane assumed to be 4
                 // `ifndef SYNTHESIS
                 // for (integer j = 0; j < `NUM_WARPS; j++) begin
@@ -525,6 +535,7 @@ module Vortex import VX_gpu_pkg::*; #(
             end
         end
     end
+`endif
 
 endmodule : Vortex
 
diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh
index 223b34f1..0dbce176 100644
--- a/hw/rtl/VX_platform.vh
+++ b/hw/rtl/VX_platform.vh
@@ -38,6 +38,7 @@
 
 `ifdef SYNTHESIS
 `define FPU_FPNEW
+// `define FIRESIM
 `endif // SYNTHESIS
 
 `ifdef SV_DPI
@@ -78,7 +79,7 @@
 `define UNUSED_PIN(x) . x ()
 `define UNUSED_ARG(x) x
 `define TRACE(level, args) $write args
-`else
+`else // !SYNTHESIS
 `ifdef VERILATOR
 `define SIMULATION
 `define TRACING_ON      /* verilator tracing_on */
@@ -207,6 +208,7 @@
                         x \
                         /* verilator lint_on UNUSED */
 `define TRACE(level, args) $write args
+// `define TRACE(level, args) dpi_trace(level, $sformatf args)
 `endif
 `endif
 
diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv
index 587176cf..a90efab3 100644
--- a/hw/rtl/core/VX_operands_dup.sv
+++ b/hw/rtl/core/VX_operands_dup.sv
@@ -30,6 +30,11 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
     localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
     localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
 
+`ifdef PERF_ENABLE
+    logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp;
+    logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_write_per_warp;
+`endif
+
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
         VX_stream_buffer #(
             .DATAW (DATAW)
@@ -150,6 +155,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
         end
     `endif
 
+`ifdef PERF_ENABLE
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs1_per_thread;
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs2_per_thread;
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs3_per_thread;
+`endif
+
         for (genvar j = 0; j < `NUM_THREADS; ++j) begin
             VX_dp_ram #(
                 .DATAW (`XLEN),
@@ -219,9 +230,61 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
                 .raddr (gpr_rd_addr_rs3),
                 .rdata (rs3_data[j])
             );
+
+`ifdef PERF_ENABLE
+            assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+            assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+            assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+`endif
+        end
+
+`ifdef PERF_ENABLE
+        // reads are counted for all threads on every scoreboard fire; writes are counted only for threads enabled in tmask
+        wire scoreboard_fire = scoreboard_if[i].valid && scoreboard_if[i].ready;
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs1_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs2_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs3_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        assign perf_rf_read_per_warp[i] = perf_read_rs1_per_warp + perf_read_rs2_per_warp + perf_read_rs3_per_warp;
+
+        always @(*) begin
+            perf_rf_write_per_warp[i] = '0;
+            for (integer t = 0; t < `NUM_THREADS; ++t) begin
+                perf_rf_write_per_warp[i] = perf_rf_write_per_warp[i] + 
+                                            perf_write_rs1_per_thread[t] +
+                                            perf_write_rs2_per_thread[t] +
+                                            perf_write_rs3_per_thread[t];
+            end
+        end
+`endif
+    end
+
+`ifdef PERF_ENABLE
+    logic [`PERF_CTR_BITS-1:0] perf_rf_read_per_cycle;
+    logic [`PERF_CTR_BITS-1:0] perf_rf_write_per_cycle;
+
+    always @(*) begin
+        perf_rf_read_per_cycle = '0;
+        perf_rf_write_per_cycle = '0;
+        for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
+            perf_rf_read_per_cycle = perf_rf_read_per_cycle + perf_rf_read_per_warp[i];
+            perf_rf_write_per_cycle = perf_rf_write_per_cycle + perf_rf_write_per_warp[i];
         end
     end
 
+    logic [`PERF_CTR_BITS-1:0] perf_rf_reads;
+    logic [`PERF_CTR_BITS-1:0] perf_rf_writes;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            perf_rf_reads <= '0;
+            perf_rf_writes <= '0;
+        end else begin
+            perf_rf_reads  <= perf_rf_reads  + perf_rf_read_per_cycle;
+            perf_rf_writes <= perf_rf_writes + perf_rf_write_per_cycle;
+        end
+    end
+`endif
+
 endmodule
 
 `endif
diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv
index 53a11ffb..1b3cd5ee 100644
--- a/hw/rtl/core/VX_schedule.sv
+++ b/hw/rtl/core/VX_schedule.sv
@@ -280,6 +280,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
     assign gbar_bus_if.req_valid   = gbar_req_valid;
     assign gbar_bus_if.req_id      = gbar_req_id;
     assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
+    // NOTE(hansung): since CORE_ID is global across multiple clusters, we
+    // need the modulo to get the per-cluster local core id
     assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
 `endif
 
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index efa74afd..cd1bc8c9 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -12,8 +12,9 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 );
     localparam BLOCK_SIZE = 1;
     localparam NUM_LANES  = `NUM_THREADS;
-    // localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
-    localparam PARTIAL_BW = 1;
+    // FIXME: @perf: PARTIAL_BW==1 increases power by instantiating
+    // ISSUE_WIDTH stream_buffers
+    localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
 
     VX_execute_if #(
         .NUM_LANES (NUM_LANES)
@@ -410,6 +411,7 @@ module VX_tensor_octet #(
         substeps_n = substeps;
         
         if (operands_first_in_pair_fire) begin
+          // NOTE: substeps is only used for debugging
           substeps_n[operands_wid_buf] = 1'b1; // ready for hmma
           A_buffer_n[operands_wid_buf] = halves_buf.A_half;
           B_buffer_n[operands_wid_buf] = halves_buf.B_half;
@@ -495,7 +497,7 @@ module VX_tensor_octet #(
     wire outbuf_enq = outbuf_ready_in && dpu_valid;
     wire outbuf_deq = result_valid && result_ready;
 
-    // buffer to stage the result D tile for 2 cycles until commit/writeback
+    // result buffer to stage the D tile for 2 cycles until commit/writeback
     // is complete.  This decouples the irregular dpu output traffic from the
     // regular, every-2-cycle commit traffic to ensure the commit pipeline is
     // used more efficiently.