diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 28f1edf1..c5182ff7 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -13,6 +13,8 @@ `include "VX_define.vh" +`ifndef GPR_DUPLICATED + module VX_operands import VX_gpu_pkg::*; #( parameter CORE_ID = 0, parameter CACHE_ENABLE = 0 @@ -197,9 +199,10 @@ module VX_operands import VX_gpu_pkg::*; #( assign stg_valid_in = scoreboard_if[i].valid && data_ready; assign scoreboard_if[i].ready = stg_ready_in && data_ready; - // NOTE(hansung): toggle_buffer is 1-reg pipe without flow, halving - // throughput. Wouldn't this cap overall IPC? Or OK as long as - // ISSUE_WIDTH > 1? + // NOTE(hansung): Cannot use stream_buffer here for full throughput + // because data registers (rs1_data, ...) are single-buffered. This + // will probably cap IPC at 50% (notwithstanding the 1-operand-per-cycle + // limit). VX_toggle_buffer #( .DATAW (DATAW) ) staging_buffer ( @@ -295,3 +298,5 @@ module VX_operands import VX_gpu_pkg::*; #( end endmodule + +`endif diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv index 283011df..b43f0976 100644 --- a/hw/rtl/core/VX_operands_dup.sv +++ b/hw/rtl/core/VX_operands_dup.sv @@ -13,6 +13,8 @@ `include "VX_define.vh" +`ifdef GPR_DUPLICATED + module VX_operands_dup import VX_gpu_pkg::*; #( parameter CORE_ID = 0, parameter CACHE_ENABLE = 0 @@ -224,3 +226,5 @@ module VX_operands_dup import VX_gpu_pkg::*; #( end endmodule + +`endif