This commit is contained in:
Richard Yan
2024-07-20 23:37:58 -07:00
7 changed files with 100 additions and 20 deletions

View File

@@ -84,7 +84,7 @@
#endif
#ifndef NUM_CORES
#define NUM_CORES 2
#define NUM_CORES 4
#endif
#ifndef NUM_WARPS
@@ -96,7 +96,7 @@
#endif
#ifndef NUM_BARRIERS
#define NUM_BARRIERS 4
#define NUM_BARRIERS 8
#endif
#ifndef SOCKET_SIZE

View File

@@ -83,7 +83,7 @@
`endif
`ifndef NUM_CORES
`define NUM_CORES 2
`define NUM_CORES 4
`endif
`ifndef NUM_WARPS
@@ -250,7 +250,7 @@
`define NUM_ALU_LANES `NUM_THREADS
`endif
`ifndef NUM_ALU_BLOCKS
`define NUM_ALU_BLOCKS 4
`define NUM_ALU_BLOCKS 2
`endif
// Number of FPU units
@@ -258,7 +258,7 @@
`define NUM_FPU_LANES `NUM_THREADS
`endif
`ifndef NUM_FPU_BLOCKS
`define NUM_FPU_BLOCKS 2
`define NUM_FPU_BLOCKS 1
`endif
// Number of LSU units

View File

@@ -495,20 +495,30 @@ module Vortex import VX_gpu_pkg::*; #(
// .busy(busy)
// );
// Counts the clock cycles in which `finished` is high; cleared while `reset`
// is high. The 32-bit counter does not saturate, so it would wrap if
// `finished` stayed high for 2^32 cycles.
logic [31:0] finish_counter;
always @(posedge clock) begin
if (reset) begin
finish_counter <= 32'd0;
end else begin
if (finished) begin
finish_counter <= finish_counter + 32'd1;
end
end
end
// give slack for other cores to finish
// NOTE(review): this is a heuristic -- the flag goes high once this core has
// counted more than 10000 cycles of `finished`; nothing visible here observes
// the other cores' state. Confirm that 10000 cycles is enough slack.
wire all_cores_finished = (finish_counter > 32'd10000);
`ifdef SIMULATION
always @(posedge clock) begin
if (!reset) begin
if ((CORE_ID == '0) && all_cores_finished) begin
$display("simulation has probably ended. exiting");
$finish();
end
if (finished) begin
`ifdef SIMULATION
$display("---------------- core%2d has no more active warps ----------------", CORE_ID);
$display("simulation has ended. exiting");
$finish();
`endif
// `ifdef SIMULATION
// if ($time >= 60000) begin
// $display("simulation has probably ended. exiting");
// @(posedge clock) $finish();
// end
// `endif
$display("---------------- core%2d has no more active warps ----------------", CORE_ID);
// TODO: lane assumed to be 4
// `ifndef SYNTHESIS
// for (integer j = 0; j < `NUM_WARPS; j++) begin
@@ -525,6 +535,7 @@ module Vortex import VX_gpu_pkg::*; #(
end
end
end
`endif
endmodule : Vortex

View File

@@ -38,6 +38,7 @@
`ifdef SYNTHESIS
`define FPU_FPNEW
// `define FIRESIM
`endif // SYNTHESIS
`ifdef SV_DPI
@@ -78,7 +79,7 @@
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define TRACE(level, args) $write args
`else
`else // !SYNTHESIS
`ifdef VERILATOR
`define SIMULATION
`define TRACING_ON /* verilator tracing_on */
@@ -207,6 +208,7 @@
x \
/* verilator lint_on UNUSED */
`define TRACE(level, args) $write args
// `define TRACE(level, args) dpi_trace(level, $sformatf args)
`endif
`endif

View File

@@ -30,6 +30,11 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
`ifdef PERF_ENABLE
// Register-file access counts for the current cycle, one PERF_CTR_BITS-wide
// entry per issue slot (index range 0..ISSUE_WIDTH-1).
logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp;
logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_write_per_warp;
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_stream_buffer #(
.DATAW (DATAW)
@@ -150,6 +155,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
end
`endif
`ifdef PERF_ENABLE
// Per-thread-lane write indicators for the rs1/rs2/rs3 operand storage, one
// PERF_CTR_BITS-wide entry per lane (index range 0..NUM_THREADS-1).
logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs1_per_thread;
logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs2_per_thread;
logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs3_per_thread;
`endif
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
@@ -219,9 +230,61 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.raddr (gpr_rd_addr_rs3),
.rdata (rs3_data[j])
);
`ifdef PERF_ENABLE
// Each entry is 1 in a cycle where `wr_enabled` is high, the writeback for
// issue slot i is valid, and lane j is set in its thread mask; otherwise 0.
// The 1-bit result is zero-extended into the counter-wide entry.
// NOTE(review): all three signals use the same condition -- presumably
// because one writeback updates each of the rs1/rs2/rs3 RAM copies; confirm
// that counting it three times is intended.
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
`endif
end
`ifdef PERF_ENABLE
// read is done for all threads; write is masked
// A scoreboard handshake (valid && ready) on issue slot i is counted as
// NUM_THREADS reads on each of the rs1, rs2 and rs3 ports, regardless of the
// thread mask or of which source operands the instruction uses.
wire scoreboard_fire = scoreboard_if[i].valid && scoreboard_if[i].ready;
wire [`PERF_CTR_BITS-1:0] perf_read_rs1_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
wire [`PERF_CTR_BITS-1:0] perf_read_rs2_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
wire [`PERF_CTR_BITS-1:0] perf_read_rs3_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
// Reads attributed to issue slot i this cycle: 3 * NUM_THREADS on a
// handshake, otherwise 0.
assign perf_rf_read_per_warp[i] = perf_read_rs1_per_warp + perf_read_rs2_per_warp + perf_read_rs3_per_warp;
// Writes attributed to issue slot i this cycle: the sum over all thread lanes
// of the rs1/rs2/rs3 per-thread write indicators.
always @(*) begin
perf_rf_write_per_warp[i] = '0;
for (integer t = 0; t < `NUM_THREADS; ++t) begin
perf_rf_write_per_warp[i] = perf_rf_write_per_warp[i] +
perf_write_rs1_per_thread[t] +
perf_write_rs2_per_thread[t] +
perf_write_rs3_per_thread[t];
end
end
`endif
end
`ifdef PERF_ENABLE
// Combinational totals for the current cycle: the per-issue-slot read and
// write counts summed over all ISSUE_WIDTH slots.
logic [`PERF_CTR_BITS-1:0] perf_rf_read_per_cycle;
logic [`PERF_CTR_BITS-1:0] perf_rf_write_per_cycle;
always @(*) begin
perf_rf_read_per_cycle = '0;
perf_rf_write_per_cycle = '0;
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
perf_rf_read_per_cycle = perf_rf_read_per_cycle + perf_rf_read_per_warp[i];
perf_rf_write_per_cycle = perf_rf_write_per_cycle + perf_rf_write_per_warp[i];
end
end
// Running totals: cleared while `reset` is high, otherwise incremented on
// every clock edge by the per-cycle sums above.
// NOTE(review): nothing in the visible code reads perf_rf_reads or
// perf_rf_writes -- presumably they are probed from outside the module
// (waveform or hierarchical reference); confirm they are not optimized away.
logic [`PERF_CTR_BITS-1:0] perf_rf_reads;
logic [`PERF_CTR_BITS-1:0] perf_rf_writes;
always @(posedge clk) begin
if (reset) begin
perf_rf_reads <= '0;
perf_rf_writes <= '0;
end else begin
perf_rf_reads <= perf_rf_reads + perf_rf_read_per_cycle;
perf_rf_writes <= perf_rf_writes + perf_rf_write_per_cycle;
end
end
`endif
endmodule
`endif

View File

@@ -280,6 +280,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_id = gbar_req_id;
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
// NOTE(hansung): since CORE_ID is global across multiple clusters, we
// need the modulo to get the per-cluster local core id
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif

View File

@@ -12,8 +12,9 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
);
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_THREADS;
// localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
localparam PARTIAL_BW = 1;
// FIXME: @perf: PARTIAL_BW==1 increases power by instantiating
// stream_buffers ISSUE_WIDTH times
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
@@ -410,6 +411,7 @@ module VX_tensor_octet #(
substeps_n = substeps;
if (operands_first_in_pair_fire) begin
// NOTE: substeps is only used for debugging
substeps_n[operands_wid_buf] = 1'b1; // ready for hmma
A_buffer_n[operands_wid_buf] = halves_buf.A_half;
B_buffer_n[operands_wid_buf] = halves_buf.B_half;
@@ -495,7 +497,7 @@ module VX_tensor_octet #(
wire outbuf_enq = outbuf_ready_in && dpu_valid;
wire outbuf_deq = result_valid && result_ready;
// buffer to stage the result D tile for 2 cycles until commit/writeback
// result buffer to stage the D tile for 2 cycles until commit/writeback
// is complete. This decouples the irregular dpu output traffic from the
// regular, every-2-cycle commit traffic to ensure the commit pipeline is
// used more efficiently.