diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv
index 28f1edf1..c5182ff7 100644
--- a/hw/rtl/core/VX_operands.sv
+++ b/hw/rtl/core/VX_operands.sv
@@ -13,6 +13,8 @@
 
 `include "VX_define.vh"
 
+`ifndef GPR_DUPLICATED
+
 module VX_operands import VX_gpu_pkg::*; #(
     parameter CORE_ID = 0,
     parameter CACHE_ENABLE = 0
@@ -197,9 +199,10 @@ module VX_operands import VX_gpu_pkg::*; #(
         assign stg_valid_in = scoreboard_if[i].valid && data_ready;
         assign scoreboard_if[i].ready = stg_ready_in && data_ready;        
 
-        // NOTE(hansung): toggle_buffer is 1-reg pipe without flow, halving
-        // throughput.  Wouldn't this cap overall IPC?  Or OK as long as
-        // ISSUE_WIDTH > 1?
+        // NOTE(hansung): Cannot use stream_buffer here for full throughput
+        // because the data registers (rs1_data, ...) are single-buffered.  This
+        // will probably cap IPC at 50%, notwithstanding the 1-operand-per-cycle
+        // limit.
         VX_toggle_buffer #(
             .DATAW (DATAW)
         ) staging_buffer (
@@ -295,3 +298,5 @@ module VX_operands import VX_gpu_pkg::*; #(
     end
 
 endmodule
+
+`endif
diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv
index 283011df..b43f0976 100644
--- a/hw/rtl/core/VX_operands_dup.sv
+++ b/hw/rtl/core/VX_operands_dup.sv
@@ -13,6 +13,8 @@
 
 `include "VX_define.vh"
 
+`ifdef GPR_DUPLICATED
+
 module VX_operands_dup import VX_gpu_pkg::*; #(
     parameter CORE_ID = 0,
     parameter CACHE_ENABLE = 0
@@ -224,3 +226,5 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
     end
 
 endmodule
+
+`endif