Merge branch 'rtl' of https://github.com/hansungk/vortex-private into rtl

2024-07-20 23:37:58 -07:00
parent 7d422cc9b0 14b811f334
commit ed247e21bb
7 changed files with 100 additions and 20 deletions
--- a/hw/VX_config.h
+++ b/hw/VX_config.h
@@ -84,7 +84,7 @@
 #endif

 #ifndef NUM_CORES
-#define NUM_CORES 2
+#define NUM_CORES 4
 #endif

 #ifndef NUM_WARPS
@@ -96,7 +96,7 @@
 #endif

 #ifndef NUM_BARRIERS
-#define NUM_BARRIERS 4
+#define NUM_BARRIERS 8
 #endif

 #ifndef SOCKET_SIZE
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -83,7 +83,7 @@
 `endif

 `ifndef NUM_CORES
-`define NUM_CORES 2
+`define NUM_CORES 4
 `endif

 `ifndef NUM_WARPS
@@ -250,7 +250,7 @@
 `define NUM_ALU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_ALU_BLOCKS
-`define NUM_ALU_BLOCKS  4
+`define NUM_ALU_BLOCKS  2
 `endif

 // Number of FPU units
@@ -258,7 +258,7 @@
 `define NUM_FPU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_FPU_BLOCKS
-`define NUM_FPU_BLOCKS  2
+`define NUM_FPU_BLOCKS  1
 `endif

 // Number of LSU units
--- a/hw/rtl/VX_core_wrapper.sv
+++ b/hw/rtl/VX_core_wrapper.sv
@@ -495,20 +495,30 @@ module Vortex import VX_gpu_pkg::*; #(
    //     .busy(busy)
    // );

+    logic [31:0] finish_counter;
+
+    always @(posedge clock) begin
+      if (reset) begin
+        finish_counter <= 32'd0;
+      end else begin
+        if (finished) begin
+          finish_counter <= finish_counter + 32'd1;
+        end
+      end
+    end
+
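+    // heuristic: treat the run as done once this core's `finished` has been asserted for more than 10000 cycles (cumulative), giving slack for other cores to finish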
+    wire all_cores_finished = (finish_counter > 32'd10000);
+
+`ifdef SIMULATION
    always @(posedge clock) begin
        if (!reset) begin
+            if ((CORE_ID == '0) && all_cores_finished) begin
+                $display("simulation has probably ended. exiting");
+                $finish();
+            end
            if (finished) begin
-                `ifdef SIMULATION
-                    $display("---------------- core%2d has no more active warps ----------------", CORE_ID);
-                    $display("simulation has ended. exiting");
-                    $finish();
-                `endif
-                // `ifdef SIMULATION
-                // if ($time >= 60000) begin
-                //   $display("simulation has probably ended. exiting");
-                //   @(posedge clock) $finish();
-                // end
-                // `endif
+                $display("---------------- core%2d has no more active warps ----------------", CORE_ID);
                // TODO: lane assumed to be 4
                // `ifndef SYNTHESIS
                // for (integer j = 0; j < `NUM_WARPS; j++) begin
@@ -525,6 +535,7 @@ module Vortex import VX_gpu_pkg::*; #(
            end
        end
    end
+`endif

 endmodule : Vortex

--- a/hw/rtl/VX_platform.vh
+++ b/hw/rtl/VX_platform.vh
@@ -38,6 +38,7 @@

 `ifdef SYNTHESIS
 `define FPU_FPNEW
+// `define FIRESIM
 `endif // SYNTHESIS

 `ifdef SV_DPI
@@ -78,7 +79,7 @@
 `define UNUSED_PIN(x) . x ()
 `define UNUSED_ARG(x) x
 `define TRACE(level, args) $write args
-`else
+`else // !SYNTHESIS
 `ifdef VERILATOR
 `define SIMULATION
 `define TRACING_ON      /* verilator tracing_on */
@@ -207,6 +208,7 @@
                        x \
                        /* verilator lint_on UNUSED */
 `define TRACE(level, args) $write args
+// `define TRACE(level, args) dpi_trace(level, $sformatf args)
 `endif
 `endif

--- a/hw/rtl/core/VX_operands_dup.sv
+++ b/hw/rtl/core/VX_operands_dup.sv
@@ -30,6 +30,11 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
    localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
    localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);

+`ifdef PERF_ENABLE
+    logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp;
+    logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_write_per_warp;
+`endif
+
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        VX_stream_buffer #(
            .DATAW (DATAW)
@@ -150,6 +155,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
        end
    `endif

+`ifdef PERF_ENABLE
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs1_per_thread;
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs2_per_thread;
+        logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs3_per_thread;
+`endif
+
        for (genvar j = 0; j < `NUM_THREADS; ++j) begin
            VX_dp_ram #(
                .DATAW (`XLEN),
@@ -219,9 +230,61 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
                .raddr (gpr_rd_addr_rs3),
                .rdata (rs3_data[j])
            );
+
+`ifdef PERF_ENABLE
+            assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+            assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+            assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
+`endif
+        end
+
+`ifdef PERF_ENABLE
+        // reads are counted for all threads on every scoreboard handshake; writes are counted only for threads enabled in tmask
+        wire scoreboard_fire = scoreboard_if[i].valid && scoreboard_if[i].ready;
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs1_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs2_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        wire [`PERF_CTR_BITS-1:0] perf_read_rs3_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
+        assign perf_rf_read_per_warp[i] = perf_read_rs1_per_warp + perf_read_rs2_per_warp + perf_read_rs3_per_warp;
+
+        always @(*) begin
+            perf_rf_write_per_warp[i] = '0;
+            for (integer t = 0; t < `NUM_THREADS; ++t) begin
+                perf_rf_write_per_warp[i] = perf_rf_write_per_warp[i] + 
+                                            perf_write_rs1_per_thread[t] +
+                                            perf_write_rs2_per_thread[t] +
+                                            perf_write_rs3_per_thread[t];
+            end
+        end
+`endif
+    end
+
+`ifdef PERF_ENABLE
+    logic [`PERF_CTR_BITS-1:0] perf_rf_read_per_cycle;
+    logic [`PERF_CTR_BITS-1:0] perf_rf_write_per_cycle;
+
+    always @(*) begin
+        perf_rf_read_per_cycle = '0;
+        perf_rf_write_per_cycle = '0;
+        for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
+            perf_rf_read_per_cycle = perf_rf_read_per_cycle + perf_rf_read_per_warp[i];
+            perf_rf_write_per_cycle = perf_rf_write_per_cycle + perf_rf_write_per_warp[i];
        end
    end

+    logic [`PERF_CTR_BITS-1:0] perf_rf_reads;
+    logic [`PERF_CTR_BITS-1:0] perf_rf_writes;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            perf_rf_reads <= '0;
+            perf_rf_writes <= '0;
+        end else begin
+            perf_rf_reads  <= perf_rf_reads  + perf_rf_read_per_cycle;
+            perf_rf_writes <= perf_rf_writes + perf_rf_write_per_cycle;
+        end
+    end
+`endif
+
 endmodule

 `endif
--- a/hw/rtl/core/VX_schedule.sv
+++ b/hw/rtl/core/VX_schedule.sv
@@ -280,6 +280,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
    assign gbar_bus_if.req_valid   = gbar_req_valid;
    assign gbar_bus_if.req_id      = gbar_req_id;
    assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
+    // NOTE(hansung): since CORE_ID is global across multiple clusters, we
+    // need the modulo to get the per-cluster local core id
    assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
 `endif

--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -12,8 +12,9 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 );
    localparam BLOCK_SIZE = 1;
    localparam NUM_LANES  = `NUM_THREADS;
-    // localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
-    localparam PARTIAL_BW = 1;
+    // FIXME: @perf: PARTIAL_BW==1 increases power by instantiating
+    // ISSUE_WIDTH stream_buffers
+    localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);

    VX_execute_if #(
        .NUM_LANES (NUM_LANES)
@@ -410,6 +411,7 @@ module VX_tensor_octet #(
        substeps_n = substeps;
        
        if (operands_first_in_pair_fire) begin
+          // NOTE: substeps is only used for debugging
          substeps_n[operands_wid_buf] = 1'b1; // ready for hmma
          A_buffer_n[operands_wid_buf] = halves_buf.A_half;
          B_buffer_n[operands_wid_buf] = halves_buf.B_half;
@@ -495,7 +497,7 @@ module VX_tensor_octet #(
    wire outbuf_enq = outbuf_ready_in && dpu_valid;
    wire outbuf_deq = result_valid && result_ready;

-    // buffer to stage the result D tile for 2 cycles until commit/writeback
+    // result buffer to stage the D tile for 2 cycles until commit/writeback
    // is complete.  This decouples the irregular dpu output traffic from the
    // regular, every-2-cycle commit traffic to ensure the commit pipeline is
    // used more efficiently.