Merge branch 'rtl' of https://github.com/hansungk/vortex-private into rtl
This commit is contained in:
@@ -84,7 +84,7 @@
|
||||
#endif
|
||||
|
||||
#ifndef NUM_CORES
|
||||
#define NUM_CORES 2
|
||||
#define NUM_CORES 4
|
||||
#endif
|
||||
|
||||
#ifndef NUM_WARPS
|
||||
@@ -96,7 +96,7 @@
|
||||
#endif
|
||||
|
||||
#ifndef NUM_BARRIERS
|
||||
#define NUM_BARRIERS 4
|
||||
#define NUM_BARRIERS 8
|
||||
#endif
|
||||
|
||||
#ifndef SOCKET_SIZE
|
||||
|
||||
@@ -83,7 +83,7 @@
|
||||
`endif
|
||||
|
||||
`ifndef NUM_CORES
|
||||
`define NUM_CORES 2
|
||||
`define NUM_CORES 4
|
||||
`endif
|
||||
|
||||
`ifndef NUM_WARPS
|
||||
@@ -250,7 +250,7 @@
|
||||
`define NUM_ALU_LANES `NUM_THREADS
|
||||
`endif
|
||||
`ifndef NUM_ALU_BLOCKS
|
||||
`define NUM_ALU_BLOCKS 4
|
||||
`define NUM_ALU_BLOCKS 2
|
||||
`endif
|
||||
|
||||
// Number of FPU units
|
||||
@@ -258,7 +258,7 @@
|
||||
`define NUM_FPU_LANES `NUM_THREADS
|
||||
`endif
|
||||
`ifndef NUM_FPU_BLOCKS
|
||||
`define NUM_FPU_BLOCKS 2
|
||||
`define NUM_FPU_BLOCKS 1
|
||||
`endif
|
||||
|
||||
// Number of LSU units
|
||||
|
||||
@@ -495,20 +495,30 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
// .busy(busy)
|
||||
// );
|
||||
|
||||
logic [31:0] finish_counter;
|
||||
|
||||
always @(posedge clock) begin
|
||||
if (reset) begin
|
||||
finish_counter <= 32'd0;
|
||||
end else begin
|
||||
if (finished) begin
|
||||
finish_counter <= finish_counter + 32'd1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// give slack for other cores to finish
|
||||
wire all_cores_finished = (finish_counter > 32'd10000);
|
||||
|
||||
`ifdef SIMULATION
|
||||
always @(posedge clock) begin
|
||||
if (!reset) begin
|
||||
if ((CORE_ID == '0) && all_cores_finished) begin
|
||||
$display("simulation has probably ended. exiting");
|
||||
$finish();
|
||||
end
|
||||
if (finished) begin
|
||||
`ifdef SIMULATION
|
||||
$display("---------------- core%2d has no more active warps ----------------", CORE_ID);
|
||||
$display("simulation has ended. exiting");
|
||||
$finish();
|
||||
`endif
|
||||
// `ifdef SIMULATION
|
||||
// if ($time >= 60000) begin
|
||||
// $display("simulation has probably ended. exiting");
|
||||
// @(posedge clock) $finish();
|
||||
// end
|
||||
// `endif
|
||||
$display("---------------- core%2d has no more active warps ----------------", CORE_ID);
|
||||
// TODO: lane assumed to be 4
|
||||
// `ifndef SYNTHESIS
|
||||
// for (integer j = 0; j < `NUM_WARPS; j++) begin
|
||||
@@ -525,6 +535,7 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule : Vortex
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
|
||||
`ifdef SYNTHESIS
|
||||
`define FPU_FPNEW
|
||||
// `define FIRESIM
|
||||
`endif // SYNTHESIS
|
||||
|
||||
`ifdef SV_DPI
|
||||
@@ -78,7 +79,7 @@
|
||||
`define UNUSED_PIN(x) . x ()
|
||||
`define UNUSED_ARG(x) x
|
||||
`define TRACE(level, args) $write args
|
||||
`else
|
||||
`else // !SYNTHESIS
|
||||
`ifdef VERILATOR
|
||||
`define SIMULATION
|
||||
`define TRACING_ON /* verilator tracing_on */
|
||||
@@ -207,6 +208,7 @@
|
||||
x \
|
||||
/* verilator lint_on UNUSED */
|
||||
`define TRACE(level, args) $write args
|
||||
// `define TRACE(level, args) dpi_trace(level, $sformatf args)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
||||
@@ -30,6 +30,11 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp;
|
||||
logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_write_per_warp;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
VX_stream_buffer #(
|
||||
.DATAW (DATAW)
|
||||
@@ -150,6 +155,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs1_per_thread;
|
||||
logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs2_per_thread;
|
||||
logic [`NUM_THREADS-1:0][`PERF_CTR_BITS-1:0] perf_write_rs3_per_thread;
|
||||
`endif
|
||||
|
||||
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
|
||||
VX_dp_ram #(
|
||||
.DATAW (`XLEN),
|
||||
@@ -219,9 +230,61 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
.raddr (gpr_rd_addr_rs3),
|
||||
.rdata (rs3_data[j])
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
`endif
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
// read is done for all threads; write is masked
|
||||
wire scoreboard_fire = scoreboard_if[i].valid && scoreboard_if[i].ready;
|
||||
wire [`PERF_CTR_BITS-1:0] perf_read_rs1_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
|
||||
wire [`PERF_CTR_BITS-1:0] perf_read_rs2_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
|
||||
wire [`PERF_CTR_BITS-1:0] perf_read_rs3_per_warp = (scoreboard_fire ? `NUM_THREADS : `PERF_CTR_BITS'b0);
|
||||
assign perf_rf_read_per_warp[i] = perf_read_rs1_per_warp + perf_read_rs2_per_warp + perf_read_rs3_per_warp;
|
||||
|
||||
always @(*) begin
|
||||
perf_rf_write_per_warp[i] = '0;
|
||||
for (integer t = 0; t < `NUM_THREADS; ++t) begin
|
||||
perf_rf_write_per_warp[i] = perf_rf_write_per_warp[i] +
|
||||
perf_write_rs1_per_thread[t] +
|
||||
perf_write_rs2_per_thread[t] +
|
||||
perf_write_rs3_per_thread[t];
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
logic [`PERF_CTR_BITS-1:0] perf_rf_read_per_cycle;
|
||||
logic [`PERF_CTR_BITS-1:0] perf_rf_write_per_cycle;
|
||||
|
||||
always @(*) begin
|
||||
perf_rf_read_per_cycle = '0;
|
||||
perf_rf_write_per_cycle = '0;
|
||||
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
perf_rf_read_per_cycle = perf_rf_read_per_cycle + perf_rf_read_per_warp[i];
|
||||
perf_rf_write_per_cycle = perf_rf_write_per_cycle + perf_rf_write_per_warp[i];
|
||||
end
|
||||
end
|
||||
|
||||
logic [`PERF_CTR_BITS-1:0] perf_rf_reads;
|
||||
logic [`PERF_CTR_BITS-1:0] perf_rf_writes;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_rf_reads <= '0;
|
||||
perf_rf_writes <= '0;
|
||||
end else begin
|
||||
perf_rf_reads <= perf_rf_reads + perf_rf_read_per_cycle;
|
||||
perf_rf_writes <= perf_rf_writes + perf_rf_write_per_cycle;
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
`endif
|
||||
|
||||
@@ -280,6 +280,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
assign gbar_bus_if.req_valid = gbar_req_valid;
|
||||
assign gbar_bus_if.req_id = gbar_req_id;
|
||||
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
|
||||
// NOTE(hansung): since CORE_ID is global across multiple clusters, we
|
||||
// need the modulo to get the per-cluster local core id
|
||||
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
`endif
|
||||
|
||||
|
||||
@@ -12,8 +12,9 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
);
|
||||
localparam BLOCK_SIZE = 1;
|
||||
localparam NUM_LANES = `NUM_THREADS;
|
||||
// localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
localparam PARTIAL_BW = 1;
|
||||
// FIXME: @perf: PARTIAL_BW==1 increases power by instantiating
|
||||
// stream_buffers ISSUE_WIDTH times
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
@@ -410,6 +411,7 @@ module VX_tensor_octet #(
|
||||
substeps_n = substeps;
|
||||
|
||||
if (operands_first_in_pair_fire) begin
|
||||
// NOTE: substeps is only used for debugging
|
||||
substeps_n[operands_wid_buf] = 1'b1; // ready for hmma
|
||||
A_buffer_n[operands_wid_buf] = halves_buf.A_half;
|
||||
B_buffer_n[operands_wid_buf] = halves_buf.B_half;
|
||||
@@ -495,7 +497,7 @@ module VX_tensor_octet #(
|
||||
wire outbuf_enq = outbuf_ready_in && dpu_valid;
|
||||
wire outbuf_deq = result_valid && result_ready;
|
||||
|
||||
// buffer to stage the result D tile for 2 cycles until commit/writeback
|
||||
// result buffer to stage the D tile for 2 cycles until commit/writeback
|
||||
// is complete. This decouples the irregular dpu output traffic from the
|
||||
// regular, every-2-cycle commit traffic to ensure the commit pipeline is
|
||||
// used more efficiently.
|
||||
|
||||
Reference in New Issue
Block a user