diff --git a/hw/VX_config.h b/hw/VX_config.h
index c89f5b4e..f809ab82 100644
--- a/hw/VX_config.h
+++ b/hw/VX_config.h
@@ -84,7 +84,7 @@
 #endif
 
 #ifndef NUM_CORES
-#define NUM_CORES 2
+#define NUM_CORES 4
 #endif
 
 #ifndef NUM_WARPS
@@ -96,7 +96,7 @@
 #endif
 
 #ifndef NUM_BARRIERS
-#define NUM_BARRIERS 4
+#define NUM_BARRIERS 8
 #endif
 
 #ifndef SOCKET_SIZE
diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 8905bd3d..69594848 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -83,7 +83,7 @@
 `endif
 
 `ifndef NUM_CORES
-`define NUM_CORES 2
+`define NUM_CORES 4
 `endif
 
 `ifndef NUM_WARPS
@@ -250,7 +250,7 @@
 `define NUM_ALU_LANES `NUM_THREADS
 `endif
 `ifndef NUM_ALU_BLOCKS
-`define NUM_ALU_BLOCKS 4
+`define NUM_ALU_BLOCKS 2
 `endif
 
 // Number of FPU units
@@ -258,7 +258,7 @@
 `define NUM_FPU_LANES `NUM_THREADS
 `endif
 `ifndef NUM_FPU_BLOCKS
-`define NUM_FPU_BLOCKS 2
+`define NUM_FPU_BLOCKS 1
 `endif
 
 // Number of LSU units
diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv
index 626a3cb1..1ac02d60 100644
--- a/hw/rtl/VX_core_wrapper.sv
+++ b/hw/rtl/VX_core_wrapper.sv
@@ -495,20 +495,34 @@ module Vortex import VX_gpu_pkg::*; #(
     // .busy(busy)
     // );
 
+    logic [31:0] finish_counter;
+
+    always @(posedge clock) begin
+        if (reset) begin
+            finish_counter <= 32'd0;
+        end else begin
+            if (finished) begin
+                finish_counter <= finish_counter + 32'd1;
+            end
+        end
+    end
+
+    // give slack for other cores to finish
+    wire all_cores_finished = (finish_counter > 32'd10000);
+
+`ifdef SIMULATION
     always @(posedge clock) begin
         if (!reset) begin
+            if ((CORE_ID == '0) && all_cores_finished) begin
+                $display("simulation has probably ended. exiting");
+                $finish();
+            end
             if (finished) begin
-                `ifdef SIMULATION
-                $display("---------------- core%2d has no more active warps ----------------", CORE_ID);
-                $display("simulation has ended. exiting");
-                $finish();
-                `endif
-                // `ifdef SIMULATION
-                // if ($time >= 60000) begin
-                //     $display("simulation has probably ended. exiting");
-                //     @(posedge clock) $finish();
-                // end
-                // `endif
+                // finish_counter still reads 0 only on the first cycle that
+                // finished is sampled high, so this banner prints once per
+                // core instead of on every cycle until $finish is reached
+                if (finish_counter == 32'd0) begin
+                    $display("---------------- core%2d has no more active warps ----------------", CORE_ID);
+                end
                 // TODO: lane assumed to be 4
                 // `ifndef SYNTHESIS
                 // for (integer j = 0; j < `NUM_WARPS; j++) begin
@@ -525,6 +539,7 @@ module Vortex import VX_gpu_pkg::*; #(
             end
         end
     end
+`endif
 
 endmodule : Vortex
 
diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh
index 223b34f1..0dbce176 100644
--- a/hw/rtl/VX_platform.vh
+++ b/hw/rtl/VX_platform.vh
@@ -38,6 +38,7 @@
 
 `ifdef SYNTHESIS
 `define FPU_FPNEW
+// `define FIRESIM
 `endif // SYNTHESIS
 
 `ifdef SV_DPI
@@ -78,7 +79,7 @@
 `define UNUSED_PIN(x) . x ()
 `define UNUSED_ARG(x) x
 `define TRACE(level, args) $write args
-`else
+`else // !SYNTHESIS
 `ifdef VERILATOR
 `define SIMULATION
 `define TRACING_ON /* verilator tracing_on */
@@ -207,6 +208,7 @@
                         x \
                         /* verilator lint_on UNUSED */
 `define TRACE(level, args) $write args
+// `define TRACE(level, args) dpi_trace(level, $sformatf args)
 `endif
 
 `endif
diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv
index 587176cf..a90efab3 100644
--- a/hw/rtl/core/VX_operands_dup.sv
+++ b/hw/rtl/core/VX_operands_dup.sv
@@ -30,6 +30,11 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
     localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
     localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
 
+`ifdef PERF_ENABLE
+    logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp;
+    logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_write_per_warp;
+`endif
+
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
         VX_stream_buffer #(
             .DATAW (DATAW)
@@ -150,6 +155,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
         end
     `endif
 
+`ifdef PERF_ENABLE
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs1_per_thread;
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs2_per_thread;
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs3_per_thread;
+`endif
+
         for (genvar j = 0; j < `NUM_THREADS; ++j) begin
             VX_dp_ram #(
                 .DATAW (`XLEN),
@@ -219,9 +230,61 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
                 .raddr (gpr_rd_addr_rs3),
                 .rdata (rs3_data[j])
             );
+
+`ifdef PERF_ENABLE
+            assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+            assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+            assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+`endif
+        end
+
+`ifdef PERF_ENABLE
+        // read is done for all threads; write is masked
+        wire scoreboard_fire = scoreboard_if[i].valid && scoreboard_if[i].ready;
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs1_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs2_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs3_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        assign perf_rf_read_per_warp[i] = perf_read_rs1_per_warp + perf_read_rs2_per_warp + perf_read_rs3_per_warp;
+
+        always @(*) begin
+            perf_rf_write_per_warp[i] = '0;
+            for (integer t = 0; t < `NUM_THREADS; ++t) begin
+                perf_rf_write_per_warp[i] = perf_rf_write_per_warp[i]
+                                          + perf_write_rs1_per_thread[t]
+                                          + perf_write_rs2_per_thread[t]
+                                          + perf_write_rs3_per_thread[t];
+            end
+        end
+`endif
+    end
+
+`ifdef PERF_ENABLE
+    logic [`PERF_CTR_BITS-1:0] perf_rf_read_per_cycle;
+    logic [`PERF_CTR_BITS-1:0] perf_rf_write_per_cycle;
+
+    always @(*) begin
+        perf_rf_read_per_cycle = '0;
+        perf_rf_write_per_cycle = '0;
+        for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
+            perf_rf_read_per_cycle = perf_rf_read_per_cycle + perf_rf_read_per_warp[i];
+            perf_rf_write_per_cycle = perf_rf_write_per_cycle + perf_rf_write_per_warp[i];
         end
     end
 
+    logic [`PERF_CTR_BITS-1:0] perf_rf_reads;
+    logic [`PERF_CTR_BITS-1:0] perf_rf_writes;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            perf_rf_reads <= '0;
+            perf_rf_writes <= '0;
+        end else begin
+            perf_rf_reads <= perf_rf_reads + perf_rf_read_per_cycle;
+            perf_rf_writes <= perf_rf_writes + perf_rf_write_per_cycle;
+        end
+    end
+`endif
+
 endmodule
 
 `endif
diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv
index 53a11ffb..1b3cd5ee 100644
--- a/hw/rtl/core/VX_schedule.sv
+++ b/hw/rtl/core/VX_schedule.sv
@@ -280,6 +280,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
     assign gbar_bus_if.req_valid = gbar_req_valid;
     assign gbar_bus_if.req_id = gbar_req_id;
     assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
+    // NOTE(hansung): since CORE_ID is global across multiple clusters, we
+    // need the modulo to get the per-cluster local core id
     assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
 `endif
 
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index efa74afd..cd1bc8c9 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -12,8 +12,9 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 );
     localparam BLOCK_SIZE = 1;
     localparam NUM_LANES = `NUM_THREADS;
-    // localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
-    localparam PARTIAL_BW = 1;
+    // FIXME: @perf: PARTIAL_BW==1 increases power instantiating
+    // stream_buffers for ISSUE_WIDTH times
+    localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
 
     VX_execute_if #(
         .NUM_LANES (NUM_LANES)
@@ -410,6 +411,7 @@ module VX_tensor_octet #(
         substeps_n = substeps;
 
         if (operands_first_in_pair_fire) begin
+            // NOTE: substeps is only used for debugging
             substeps_n[operands_wid_buf] = 1'b1; // ready for hmma
             A_buffer_n[operands_wid_buf] = halves_buf.A_half;
             B_buffer_n[operands_wid_buf] = halves_buf.B_half;
@@ -495,7 +497,7 @@ module VX_tensor_octet #(
     wire outbuf_enq = outbuf_ready_in && dpu_valid;
     wire outbuf_deq = result_valid && result_ready;
 
-    // buffer to stage the result D tile for 2 cycles until commit/writeback
+    // result buffer to stage the D tile for 2 cycles until commit/writeback
     // is complete. This decouples the irregular dpu output traffic from the
     // regular, every-2-cycle commit traffic to ensure the commit pipeline is
     // used more efficiently.