Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactory

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor udpate

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -1,7 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module Vortex (
`SCOPE_IO_Vortex
module Vortex import VX_gpu_pkg::*; (
`SCOPE_IO_DECL
// Clock
input wire clk,
@@ -22,204 +35,186 @@ module Vortex (
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// DCR write request
input wire dcr_wr_valid,
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
// Status
output wire busy
);
`STATIC_ASSERT((`L3_ENABLE == 0 || `NUM_CLUSTERS > 1), ("invalid parameter"))
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_req_data;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_ready;
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
cache_perf_t perf_l3cache;
mem_perf_t mem_perf;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_valid;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_rsp_data;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready;
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.l3cache = perf_l3cache;
assign mem_perf_if.smem = 'x;
assign mem_perf_if.mem = mem_perf;
`endif
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
VX_mem_bus_if #(
.DATA_SIZE (`L2_LINE_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin
VX_mem_bus_if #(
.DATA_SIZE (`L3_LINE_SIZE),
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
) mem_bus_if();
`RESET_RELAY (cluster_reset);
`RESET_RELAY (l3_reset, reset);
VX_cluster #(
.CLUSTER_ID(i)
) cluster (
`SCOPE_BIND_Vortex_cluster(i)
VX_cache_wrap #(
.INSTANCE_ID ("l3cache"),
.CACHE_SIZE (`L3_CACHE_SIZE),
.LINE_SIZE (`L3_LINE_SIZE),
.NUM_BANKS (`L3_NUM_BANKS),
.NUM_WAYS (`L3_NUM_WAYS),
.WORD_SIZE (L3_WORD_SIZE),
.NUM_REQS (L3_NUM_REQS),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2),
.MEM_OUT_REG (2),
.NC_ENABLE (1),
.PASSTHRU (!`L3_ENABLED)
) l3cache (
.clk (clk),
.reset (l3_reset),
.clk (clk),
.reset (cluster_reset),
.mem_req_valid (per_cluster_mem_req_valid [i]),
.mem_req_rw (per_cluster_mem_req_rw [i]),
.mem_req_byteen (per_cluster_mem_req_byteen[i]),
.mem_req_addr (per_cluster_mem_req_addr [i]),
.mem_req_data (per_cluster_mem_req_data [i]),
.mem_req_tag (per_cluster_mem_req_tag [i]),
.mem_req_ready (per_cluster_mem_req_ready [i]),
.mem_rsp_valid (per_cluster_mem_rsp_valid [i]),
.mem_rsp_data (per_cluster_mem_rsp_data [i]),
.mem_rsp_tag (per_cluster_mem_rsp_tag [i]),
.mem_rsp_ready (per_cluster_mem_rsp_ready [i]),
.busy (per_cluster_busy [i])
);
end
assign busy = (| per_cluster_busy);
if (`L3_ENABLE) begin
`ifdef PERF_ENABLE
VX_perf_cache_if perf_l3cache_if();
.cache_perf (perf_l3cache),
`endif
`RESET_RELAY (l3_reset);
.core_bus_if (per_cluster_mem_bus_if),
.mem_bus_if (mem_bus_if)
);
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen= mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
wire sim_ebreak /* verilator public */;
wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */;
wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak;
wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value;
assign sim_ebreak = per_cluster_sim_ebreak[0];
assign sim_wb_value = per_cluster_sim_wb_value[0];
`UNUSED_VAR (per_cluster_sim_ebreak)
`UNUSED_VAR (per_cluster_sim_wb_value)
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_wr_valid;
assign dcr_bus_if.write_addr = dcr_wr_addr;
assign dcr_bus_if.write_data = dcr_wr_data;
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS)
// Generate all clusters
for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin
`RESET_RELAY (cluster_reset, reset);
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
VX_cluster #(
.CLUSTER_ID (i)
) cluster (
`SCOPE_IO_BIND (i)
VX_cache #(
.CACHE_ID (`L3_CACHE_ID),
.CACHE_SIZE (`L3_CACHE_SIZE),
.CACHE_LINE_SIZE (`L3_CACHE_LINE_SIZE),
.NUM_BANKS (`L3_NUM_BANKS),
.NUM_PORTS (`L3_NUM_PORTS),
.WORD_SIZE (`L3_WORD_SIZE),
.NUM_REQS (`L3_NUM_REQS),
.CREQ_SIZE (`L3_CREQ_SIZE),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`L2_MEM_TAG_WIDTH),
.CORE_TAG_ID_BITS (0),
.MEM_TAG_WIDTH (`L3_MEM_TAG_WIDTH),
.NC_ENABLE (1)
) l3cache (
`SCOPE_BIND_Vortex_l3cache
.clk (clk),
.reset (l3_reset),
.reset (cluster_reset),
`ifdef PERF_ENABLE
.perf_cache_if (perf_l3cache_if),
.mem_perf_if (mem_perf_if),
`endif
// Core request
.core_req_valid (per_cluster_mem_req_valid),
.core_req_rw (per_cluster_mem_req_rw),
.core_req_byteen (per_cluster_mem_req_byteen),
.core_req_addr (per_cluster_mem_req_addr),
.core_req_data (per_cluster_mem_req_data),
.core_req_tag (per_cluster_mem_req_tag),
.core_req_ready (per_cluster_mem_req_ready),
// Core response
.core_rsp_valid (per_cluster_mem_rsp_valid),
.core_rsp_data (per_cluster_mem_rsp_data),
.core_rsp_tag (per_cluster_mem_rsp_tag),
.core_rsp_ready (per_cluster_mem_rsp_ready),
`UNUSED_PIN (core_rsp_tmask),
// Memory request
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
// Memory response
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready)
);
end else begin
`RESET_RELAY (mem_arb_reset);
VX_mem_arb #(
.NUM_REQS (`NUM_CLUSTERS),
.DATA_WIDTH (`L3_MEM_DATA_WIDTH),
.ADDR_WIDTH (`L3_MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`L2_MEM_TAG_WIDTH),
.TYPE ("R"),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
// Core request
.req_valid_in (per_cluster_mem_req_valid),
.req_rw_in (per_cluster_mem_req_rw),
.req_byteen_in (per_cluster_mem_req_byteen),
.req_addr_in (per_cluster_mem_req_addr),
.req_data_in (per_cluster_mem_req_data),
.req_tag_in (per_cluster_mem_req_tag),
.req_ready_in (per_cluster_mem_req_ready),
// Memory request
.req_valid_out (mem_req_valid),
.req_rw_out (mem_req_rw),
.req_byteen_out (mem_req_byteen),
.req_addr_out (mem_req_addr),
.req_data_out (mem_req_data),
.req_tag_out (mem_req_tag),
.req_ready_out (mem_req_ready),
// Core response
.rsp_valid_out (per_cluster_mem_rsp_valid),
.rsp_data_out (per_cluster_mem_rsp_data),
.rsp_tag_out (per_cluster_mem_rsp_tag),
.rsp_ready_out (per_cluster_mem_rsp_ready),
// Memory response
.rsp_valid_in (mem_rsp_valid),
.rsp_tag_in (mem_rsp_tag),
.rsp_data_in (mem_rsp_data),
.rsp_ready_in (mem_rsp_ready)
);
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[i]),
.sim_ebreak (per_cluster_sim_ebreak[i]),
.sim_wb_value (per_cluster_sim_wb_value[i]),
.busy (per_cluster_busy[i])
);
end
`SCOPE_ASSIGN (reset, reset);
`SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready);
`SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr));
`SCOPE_ASSIGN (mem_req_rw, mem_req_rw);
`SCOPE_ASSIGN (mem_req_byteen, mem_req_byteen);
`SCOPE_ASSIGN (mem_req_data, mem_req_data);
`SCOPE_ASSIGN (mem_req_tag, mem_req_tag);
`SCOPE_ASSIGN (mem_rsp_fire, mem_rsp_valid && mem_rsp_ready);
`SCOPE_ASSIGN (mem_rsp_data, mem_rsp_data);
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
`SCOPE_ASSIGN (busy, busy);
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
always @(posedge clk) begin
if (reset) begin
perf_mem_pending_reads <= '0;
end else begin
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
end
end
always @(posedge clk) begin
if (reset) begin
mem_perf <= '0;
end else begin
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1);
end
if (mem_req_fire && mem_bus_if.req_data.rw) begin
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
end
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end
end
`endif
`ifdef DBG_TRACE_CORE_MEM
always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin
if (mem_req_fire) begin
if (mem_req_rw)
dpi_trace("%d: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data);
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else
dpi_trace("%d: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen);
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
end
if (mem_rsp_valid && mem_rsp_ready) begin
dpi_trace("%d: MEM Rsp: tag=%0h, data=%0h\n", $time, mem_rsp_tag, mem_rsp_data);
if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data));
end
end
`endif
`ifndef NDEBUG
`ifdef SIMULATION
always @(posedge clk) begin
$fflush(); // flush stdout buffer
end
`endif
endmodule
endmodule