diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index ee0c493b..3747502f 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -47,8 +47,6 @@ module VX_operands import VX_gpu_pkg::*; #( reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0]; reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n; - reg valid_out_r; - reg [DATAW-1:0] data_out_r; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; @@ -60,7 +58,7 @@ module VX_operands import VX_gpu_pkg::*; #( reg rs3_ready, rs3_ready_n; reg data_ready, data_ready_n; - wire ready_out = operands_if[i].ready; + wire stg_valid_in, stg_ready_in; wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0); wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0); @@ -85,7 +83,7 @@ module VX_operands import VX_gpu_pkg::*; #( case (state) STATE_IDLE: begin - if (valid_out_r && ready_out) begin + if (operands_if[i].valid && operands_if[i].ready) begin data_ready_n = 0; end if (scoreboard_if[i].valid && data_ready_n == 0) begin @@ -173,37 +171,15 @@ module VX_operands import VX_gpu_pkg::*; #( end always @(posedge clk) begin - if (reset) begin + if (reset) begin state <= STATE_IDLE; cache_eop <= {ISSUE_RATIO{1'b1}}; data_ready <= 0; - valid_out_r <= 0; end else begin state <= state_n; cache_eop <= cache_eop_n; - data_ready <= data_ready_n; - if (~valid_out_r) begin - valid_out_r <= scoreboard_if[i].valid && data_ready; - end else if (ready_out) begin - valid_out_r <= 0; - end + data_ready <= data_ready_n; end - - if (~valid_out_r) begin - data_out_r <= {scoreboard_if[i].data.uuid, - scoreboard_if[i].data.wis, - scoreboard_if[i].data.tmask, - scoreboard_if[i].data.PC, - scoreboard_if[i].data.wb, - scoreboard_if[i].data.ex_type, - scoreboard_if[i].data.op_type, - scoreboard_if[i].data.op_mod, - scoreboard_if[i].data.use_PC, - scoreboard_if[i].data.use_imm, - scoreboard_if[i].data.imm, - scoreboard_if[i].data.rd}; - end - gpr_rd_rid <= gpr_rd_rid_n; gpr_rd_wis <= gpr_rd_wis_n; rs2_ready <= rs2_ready_n; @@ -216,10 +192,35 @@ module VX_operands import VX_gpu_pkg::*; #( cache_data <= cache_data_n; cache_reg <= cache_reg_n; cache_tmask <= cache_tmask_n; - end + end - assign operands_if[i].valid = valid_out_r; - assign {operands_if[i].data.uuid, + assign stg_valid_in = scoreboard_if[i].valid && data_ready; + assign scoreboard_if[i].ready = stg_ready_in && data_ready; + + VX_toggle_buffer #( + .DATAW (DATAW) + ) staging_buffer ( + .clk (clk), + .reset (reset), + .valid_in (stg_valid_in), + .data_in ({ + scoreboard_if[i].data.uuid, + scoreboard_if[i].data.wis, + scoreboard_if[i].data.tmask, + scoreboard_if[i].data.PC, + scoreboard_if[i].data.wb, + scoreboard_if[i].data.ex_type, + scoreboard_if[i].data.op_type, + scoreboard_if[i].data.op_mod, + scoreboard_if[i].data.use_PC, + scoreboard_if[i].data.use_imm, + scoreboard_if[i].data.imm, + scoreboard_if[i].data.rd + }), + .ready_in (stg_ready_in), + .valid_out (operands_if[i].valid), + .data_out ({ + operands_if[i].data.uuid, operands_if[i].data.wis, operands_if[i].data.tmask, operands_if[i].data.PC, @@ -230,13 +231,15 @@ module VX_operands import VX_gpu_pkg::*; #( operands_if[i].data.use_PC, operands_if[i].data.use_imm, operands_if[i].data.imm, - operands_if[i].data.rd} = data_out_r; + operands_if[i].data.rd + }), + .ready_out (operands_if[i].ready) + ); + assign operands_if[i].data.rs1_data = rs1_data; assign operands_if[i].data.rs2_data = rs2_data; assign operands_if[i].data.rs3_data = rs3_data; - assign scoreboard_if[i].ready = ~valid_out_r && data_ready; - // GPR banks reg [RAM_ADDRW-1:0] gpr_rd_addr; diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index a4792c8d..6b806dd0 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -152,51 +152,47 @@ module VX_scoreboard import VX_gpu_pkg::*; #( assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; `endif - reg [DATAW-1:0] data_out_r; - reg valid_out_r; - wire ready_out; + wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; + wire operands_ready = ~(| operands_busy); + + wire stg_valid_in, stg_ready_in; + assign stg_valid_in = ibuffer_if[i].valid && operands_ready; + assign ibuffer_if[i].ready = stg_ready_in && operands_ready; - wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; - wire deps_ready = (& ready_masks); - - wire valid_in = ibuffer_if[i].valid && deps_ready; - wire ready_in = ~valid_out_r && deps_ready; - wire [DATAW-1:0] data_in = ibuffer_if[i].data; - - assign ready_out = scoreboard_if[i].ready; + VX_stream_buffer #( + .DATAW (DATAW) + ) staging_buffer ( + .clk (clk), + .reset (reset), + .valid_in (stg_valid_in), + .data_in (ibuffer_if[i].data), + .ready_in (stg_ready_in), + .valid_out (scoreboard_if[i].valid), + .data_out (scoreboard_if[i].data), + .ready_out (scoreboard_if[i].ready) + ); always @(posedge clk) begin if (reset) begin - valid_out_r <= 0; inuse_regs <= '0; end else begin if (writeback_fire) begin inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; end - if (~valid_out_r) begin - valid_out_r <= valid_in; - end else if (ready_out) begin - if (scoreboard_if[i].data.wb) begin - inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1; - `ifdef PERF_ENABLE - inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type; - if (scoreboard_if[i].data.ex_type == `EX_SFU) begin - inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type; - end - `endif - end - valid_out_r <= 0; + if (stg_valid_in && stg_ready_in && ibuffer_if[i].data.wb) begin + inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1; end end - if (~valid_out_r) begin - data_out_r <= data_in; + `ifdef PERF_ENABLE + if (stg_valid_in && stg_ready_in && ibuffer_if[i].data.wb) begin + inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type; + if (ibuffer_if[i].data.ex_type == `EX_SFU) begin + inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= sfu_type; + end end + `endif end - assign ibuffer_if[i].ready = ready_in; - assign scoreboard_if[i].valid = valid_out_r; - assign scoreboard_if[i].data = data_out_r; - `ifdef SIMULATION reg [31:0] timeout_ctr; @@ -208,7 +204,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `ifdef DBG_TRACE_CORE_PIPELINE `TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr, - ~ready_masks, ibuffer_if[i].data.uuid)); + operands_busy, ibuffer_if[i].data.uuid)); `endif timeout_ctr <= timeout_ctr + 1; end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin @@ -220,7 +216,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), ("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr, - ~ready_masks, ibuffer_if[i].data.uuid)); + operands_busy, ibuffer_if[i].data.uuid)); `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0, ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",