multiple fixes: parallel printf, fixed cycle in cache, opencl refactored vecadd and sgemm, regen opencl kernels with hard-float, fixed vortex io bus interface, fixed dpi floats API to support multicore mode, make vlsim multicore default, make rtlsim multi-core default, removed POCL binaries from repository, updated Makefiles to use external POCL

This commit is contained in:
Blaise Tine
2020-09-19 14:45:42 -04:00
parent 80f929eb61
commit f6f95e0c46
146 changed files with 116779 additions and 194258 deletions

View File

@@ -107,4 +107,7 @@ make -C top clean && make -C top > top/build.log 2>&1 &
if slack = +1.664 ns -> minimal period = 5 - 1.664 = 3.336 ns -> fmax = 1/3.336 ns ≈ 300 MHz
# build rtlsim from driver tests
make -C ../../rtlsim clean && reset && make -C ../../rtlsim
make -C ../../rtlsim clean && reset && make -C ../../rtlsim
# split tar into multiple parts
split -b 50M home.tar.bz2 "home.tar.bz2.part"

View File

@@ -41,11 +41,11 @@ module VX_cluster #(
input wire snp_rsp_ready,
// I/O request
output wire io_req_valid,
output wire [`NUM_THREADS-1:0] io_req_valid,
output wire io_req_rw,
output wire [3:0] io_req_byteen,
output wire [29:0] io_req_addr,
output wire [31:0] io_req_data,
output wire [`NUM_THREADS-1:0][3:0] io_req_byteen,
output wire [`NUM_THREADS-1:0][29:0] io_req_addr,
output wire [`NUM_THREADS-1:0][31:0] io_req_data,
output wire [`L2CORE_TAG_WIDTH-1:0] io_req_tag,
input wire io_req_ready,
@@ -108,11 +108,11 @@ module VX_cluster #(
wire [`NUM_CORES-1:0][`DSNP_TAG_WIDTH-1:0] per_core_snp_rsp_tag;
wire [`NUM_CORES-1:0] per_core_snp_rsp_ready;
wire [`NUM_CORES-1:0] per_core_io_req_valid;
wire [`NUM_CORES-1:0][`NUM_THREADS-1:0] per_core_io_req_valid;
wire [`NUM_CORES-1:0] per_core_io_req_rw;
wire [`NUM_CORES-1:0][3:0] per_core_io_req_byteen;
wire [`NUM_CORES-1:0][29:0] per_core_io_req_addr;
wire [`NUM_CORES-1:0][31:0] per_core_io_req_data;
wire [`NUM_CORES-1:0][`NUM_THREADS-1:0][3:0] per_core_io_req_byteen;
wire [`NUM_CORES-1:0][`NUM_THREADS-1:0][29:0] per_core_io_req_addr;
wire [`NUM_CORES-1:0][`NUM_THREADS-1:0][31:0] per_core_io_req_data;
wire [`NUM_CORES-1:0][`DCORE_TAG_WIDTH-1:0] per_core_io_req_tag;
wire [`NUM_CORES-1:0] per_core_io_req_ready;
@@ -209,7 +209,7 @@ module VX_cluster #(
);
end
VX_mem_arb #(
VX_io_arb #(
.NUM_REQUESTS (`NUM_CORES),
.WORD_SIZE (4),
.TAG_IN_WIDTH (`DCORE_TAG_WIDTH),
@@ -219,34 +219,34 @@ module VX_cluster #(
.reset (reset),
// input requests
.in_mem_req_valid (per_core_io_req_valid),
.in_mem_req_rw (per_core_io_req_rw),
.in_mem_req_byteen (per_core_io_req_byteen),
.in_mem_req_addr (per_core_io_req_addr),
.in_mem_req_data (per_core_io_req_data),
.in_mem_req_tag (per_core_io_req_tag),
.in_mem_req_ready (per_core_io_req_ready),
.in_io_req_valid (per_core_io_req_valid),
.in_io_req_rw (per_core_io_req_rw),
.in_io_req_byteen (per_core_io_req_byteen),
.in_io_req_addr (per_core_io_req_addr),
.in_io_req_data (per_core_io_req_data),
.in_io_req_tag (per_core_io_req_tag),
.in_io_req_ready (per_core_io_req_ready),
// input responses
.in_mem_rsp_valid (per_core_io_rsp_valid),
.in_mem_rsp_data (per_core_io_rsp_data),
.in_mem_rsp_tag (per_core_io_rsp_tag),
.in_mem_rsp_ready (per_core_io_rsp_ready),
.in_io_rsp_valid (per_core_io_rsp_valid),
.in_io_rsp_data (per_core_io_rsp_data),
.in_io_rsp_tag (per_core_io_rsp_tag),
.in_io_rsp_ready (per_core_io_rsp_ready),
// output request
.out_mem_req_valid (io_req_valid),
.out_mem_req_rw (io_req_rw),
.out_mem_req_byteen (io_req_byteen),
.out_mem_req_addr (io_req_addr),
.out_mem_req_data (io_req_data),
.out_mem_req_tag (io_req_tag),
.out_mem_req_ready (io_req_ready),
.out_io_req_valid (io_req_valid),
.out_io_req_rw (io_req_rw),
.out_io_req_byteen (io_req_byteen),
.out_io_req_addr (io_req_addr),
.out_io_req_data (io_req_data),
.out_io_req_tag (io_req_tag),
.out_io_req_ready (io_req_ready),
// output response
.out_mem_rsp_valid (io_rsp_valid),
.out_mem_rsp_tag (io_rsp_tag),
.out_mem_rsp_data (io_rsp_data),
.out_mem_rsp_ready (io_rsp_ready)
.out_io_rsp_valid (io_rsp_valid),
.out_io_rsp_tag (io_rsp_tag),
.out_io_rsp_data (io_rsp_data),
.out_io_rsp_ready (io_rsp_ready)
);
VX_csr_io_arb #(
@@ -369,9 +369,7 @@ module VX_cluster #(
.SNRQ_SIZE (`L2SNRQ_SIZE),
.CWBQ_SIZE (`L2CWBQ_SIZE),
.DWBQ_SIZE (`L2DWBQ_SIZE),
.DFQQ_SIZE (`L2DFQQ_SIZE),
.PRFQ_SIZE (`L2PRFQ_SIZE),
.PRFQ_STRIDE (`L2PRFQ_STRIDE),
.DFQQ_SIZE (`L2DFQQ_SIZE),
.DRAM_ENABLE (1),
.WRITE_ENABLE (1),
.SNOOP_FORWARDING (1),

View File

@@ -226,15 +226,6 @@
`define DDFQQ_SIZE `DCREQ_SIZE
`endif
// Prefetcher
`ifndef DPRFQ_SIZE
`define DPRFQ_SIZE 8
`endif
`ifndef DPRFQ_STRIDE
`define DPRFQ_STRIDE 0
`endif
// Icache Configurable Knobs ==================================================
// Size of cache in bytes
@@ -287,15 +278,6 @@
`define IDFQQ_SIZE `ICREQ_SIZE
`endif
// Prefetcher
`ifndef IPRFQ_SIZE
`define IPRFQ_SIZE 8
`endif
`ifndef IPRFQ_STRIDE
`define IPRFQ_STRIDE 0
`endif
// SM Configurable Knobs ======================================================
// Size of cache in bytes
@@ -385,15 +367,6 @@
`define L2DFQQ_SIZE `L2CREQ_SIZE
`endif
// Prefetcher
`ifndef L2PRFQ_SIZE
`define L2PRFQ_SIZE 8
`endif
`ifndef L2PRFQ_STRIDE
`define L2PRFQ_STRIDE 0
`endif
// L3cache Configurable Knobs =================================================
// Size of cache in bytes
@@ -451,13 +424,4 @@
`define L3DFQQ_SIZE `L3CREQ_SIZE
`endif
// Prefetcher
`ifndef L3PRFQ_SIZE
`define L3PRFQ_SIZE 8
`endif
`ifndef L3PRFQ_STRIDE
`define L3PRFQ_STRIDE 0
`endif
`endif

View File

@@ -55,11 +55,11 @@ module VX_core #(
input wire snp_rsp_ready,
// I/O request
output wire io_req_valid,
output wire [`NUM_THREADS-1:0] io_req_valid,
output wire io_req_rw,
output wire [3:0] io_req_byteen,
output wire [29:0] io_req_addr,
output wire [31:0] io_req_data,
output wire [`NUM_THREADS-1:0][3:0] io_req_byteen,
output wire [`NUM_THREADS-1:0][29:0] io_req_addr,
output wire [`NUM_THREADS-1:0][31:0] io_req_data,
output wire [`DCORE_TAG_WIDTH-1:0] io_req_tag,
input wire io_req_ready,
@@ -123,12 +123,12 @@ module VX_core #(
.CORE_TAG_ID_BITS(`DCORE_TAG_ID_BITS)
) core_dcache_rsp_if(), arb_dcache_rsp_if(), arb_io_rsp_if();
assign io_req_valid = arb_io_req_if.valid[0];
assign io_req_rw = arb_io_req_if.rw[0];
assign io_req_byteen = arb_io_req_if.byteen[0];
assign io_req_addr = arb_io_req_if.addr[0];
assign io_req_data = arb_io_req_if.data[0];
assign io_req_tag = arb_io_req_if.tag[0];
assign io_req_valid = arb_io_req_if.valid;
assign io_req_rw = arb_io_req_if.rw;
assign io_req_byteen = arb_io_req_if.byteen;
assign io_req_addr = arb_io_req_if.addr;
assign io_req_data = arb_io_req_if.data;
assign io_req_tag = arb_io_req_if.tag;
assign arb_io_req_if.ready = io_req_ready;
assign arb_io_rsp_if.valid = {{(`NUM_THREADS-1){1'b0}}, io_rsp_valid};

View File

@@ -102,8 +102,8 @@ module VX_csr_unit #(
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_LTID) ? i :
(csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
csr_read_data_s1;
(csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
csr_read_data_s1;
end
// can accept new request?

107
hw/rtl/VX_io_arb.v Normal file
View File

@@ -0,0 +1,107 @@
`include "VX_define.vh"
// VX_io_arb: arbitrates the I/O bus between NUM_REQUESTS requesters.
//
// Each requester presents `NUM_THREADS lanes of valid/byteen/addr/data that
// share a single rw bit and a single tag. One requester at a time is forwarded
// to the output request port; responses are routed back using the requester
// index that this module embeds in the low bits of the outgoing tag.
//
// Parameters:
//   NUM_REQUESTS  - number of input requesters
//   WORD_SIZE     - word size in bytes (sets byteen width and WORD_WIDTH)
//   TAG_IN_WIDTH  - tag width on the requester side
//   TAG_OUT_WIDTH - tag width on the output side
//                   NOTE(review): the multi-requester path concatenates the input
//                   tag with REQS_BITS of index, so this presumably must equal
//                   TAG_IN_WIDTH + REQS_BITS when NUM_REQUESTS > 1 - confirm at
//                   the instantiation sites.
//   WORD_WIDTH, ADDR_WIDTH, REQS_BITS - derived from the above; not meant to be
//                   overridden (presumably).
module VX_io_arb #(
parameter NUM_REQUESTS = 1,
parameter WORD_SIZE = 1,
parameter TAG_IN_WIDTH = 1,
parameter TAG_OUT_WIDTH = 1,
parameter WORD_WIDTH = WORD_SIZE * 8,
parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE),
parameter REQS_BITS = `CLOG2(NUM_REQUESTS)
) (
input wire clk,
input wire reset,
// input requests
input wire [NUM_REQUESTS-1:0][`NUM_THREADS-1:0] in_io_req_valid,
input wire [NUM_REQUESTS-1:0] in_io_req_rw,
input wire [NUM_REQUESTS-1:0][`NUM_THREADS-1:0][WORD_SIZE-1:0] in_io_req_byteen,
input wire [NUM_REQUESTS-1:0][`NUM_THREADS-1:0][ADDR_WIDTH-1:0] in_io_req_addr,
input wire [NUM_REQUESTS-1:0][`NUM_THREADS-1:0][WORD_WIDTH-1:0] in_io_req_data,
input wire [NUM_REQUESTS-1:0][TAG_IN_WIDTH-1:0] in_io_req_tag,
output wire [NUM_REQUESTS-1:0] in_io_req_ready,
// input response
output wire [NUM_REQUESTS-1:0] in_io_rsp_valid,
output wire [NUM_REQUESTS-1:0][WORD_WIDTH-1:0] in_io_rsp_data,
output wire [NUM_REQUESTS-1:0][TAG_IN_WIDTH-1:0] in_io_rsp_tag,
input wire [NUM_REQUESTS-1:0] in_io_rsp_ready,
// output request
output wire [`NUM_THREADS-1:0] out_io_req_valid,
output wire out_io_req_rw,
output wire [`NUM_THREADS-1:0][WORD_SIZE-1:0] out_io_req_byteen,
output wire [`NUM_THREADS-1:0][ADDR_WIDTH-1:0] out_io_req_addr,
output wire [`NUM_THREADS-1:0][WORD_WIDTH-1:0] out_io_req_data,
output wire [TAG_OUT_WIDTH-1:0] out_io_req_tag,
input wire out_io_req_ready,
// output response
input wire out_io_rsp_valid,
input wire [WORD_WIDTH-1:0] out_io_rsp_data,
input wire [TAG_OUT_WIDTH-1:0] out_io_rsp_tag,
output wire out_io_rsp_ready
);
// Single requester: no arbitration needed, wire inputs straight through to
// outputs (the tag is passed unmodified, clk/reset are unused).
if (NUM_REQUESTS == 1) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign out_io_req_valid = in_io_req_valid;
assign out_io_req_rw = in_io_req_rw;
assign out_io_req_byteen = in_io_req_byteen;
assign out_io_req_addr = in_io_req_addr;
assign out_io_req_data = in_io_req_data;
assign out_io_req_tag = in_io_req_tag;
assign in_io_req_ready = out_io_req_ready;
assign in_io_rsp_valid = out_io_rsp_valid;
assign in_io_rsp_data = out_io_rsp_data;
assign in_io_rsp_tag = out_io_rsp_tag;
assign out_io_rsp_ready = in_io_rsp_ready;
end else begin
// index of the requester currently granted the output request port
reg [REQS_BITS-1:0] bus_req_sel;
// a requester is considered active when any of its thread lanes is valid
wire [NUM_REQUESTS-1:0] valid_requests;
for (genvar i = 0; i < NUM_REQUESTS; i++) begin
assign valid_requests[i] = (| in_io_req_valid[i]);
end
// Picks one active requester; only grant_index is used here.
// NOTE(review): presumably round-robin, going by the module name - the
// arbiter implementation is not visible in this file.
VX_rr_arbiter #(
.N(NUM_REQUESTS)
) arbiter (
.clk (clk),
.reset (reset),
.requests (valid_requests),
.grant_index (bus_req_sel),
`UNUSED_PIN (grant_valid),
`UNUSED_PIN (grant_onehot)
);
// forward the selected requester's lanes to the output port
assign out_io_req_valid = in_io_req_valid [bus_req_sel];
assign out_io_req_rw = in_io_req_rw [bus_req_sel];
assign out_io_req_byteen = in_io_req_byteen [bus_req_sel];
assign out_io_req_addr = in_io_req_addr [bus_req_sel];
assign out_io_req_data = in_io_req_data [bus_req_sel];
// append the requester index in the low REQS_BITS of the tag so that the
// response can be routed back to its source
assign out_io_req_tag = {in_io_req_tag [bus_req_sel], REQS_BITS'(bus_req_sel)};
// only the selected requester sees the downstream ready
for (genvar i = 0; i < NUM_REQUESTS; i++) begin
assign in_io_req_ready[i] = out_io_req_ready && (bus_req_sel == REQS_BITS'(i));
end
// recover the destination requester from the low bits of the response tag
wire [REQS_BITS-1:0] bus_rsp_sel = out_io_rsp_tag[REQS_BITS-1:0];
// data and the stripped tag are broadcast to every requester; only the
// destination requester gets valid asserted
for (genvar i = 0; i < NUM_REQUESTS; i++) begin
assign in_io_rsp_valid[i] = out_io_rsp_valid && (bus_rsp_sel == REQS_BITS'(i));
assign in_io_rsp_data[i] = out_io_rsp_data;
assign in_io_rsp_tag[i] = out_io_rsp_tag[REQS_BITS +: TAG_IN_WIDTH];
end
// response back-pressure comes from the destination requester only
assign out_io_rsp_ready = in_io_rsp_ready[bus_rsp_sel];
end
endmodule

View File

@@ -137,7 +137,7 @@ module VX_lsu_unit #(
// Core Request
assign dcache_req_if.valid = {`NUM_THREADS{valid_in && ~lsuq_full && ~store_stall}} & req_tmask;
assign dcache_req_if.rw = {`NUM_THREADS{req_rw}};
assign dcache_req_if.rw = req_rw;
assign dcache_req_if.byteen = req_byteen;
assign dcache_req_if.addr = req_addr;
assign dcache_req_if.data = req_data;

View File

@@ -70,8 +70,6 @@ module VX_mem_unit # (
.CWBQ_SIZE (`SCWBQ_SIZE),
.DWBQ_SIZE (1),
.DFQQ_SIZE (1),
.PRFQ_SIZE (1),
.PRFQ_STRIDE (0),
.SNOOP_FORWARDING (0),
.DRAM_ENABLE (0),
.WRITE_ENABLE (1),
@@ -153,8 +151,6 @@ module VX_mem_unit # (
.CWBQ_SIZE (`DCWBQ_SIZE),
.DWBQ_SIZE (`DDWBQ_SIZE),
.DFQQ_SIZE (`DDFQQ_SIZE),
.PRFQ_SIZE (`DPRFQ_SIZE),
.PRFQ_STRIDE (`DPRFQ_STRIDE),
.SNOOP_FORWARDING (0),
.DRAM_ENABLE (1),
.WRITE_ENABLE (1),
@@ -237,8 +233,6 @@ module VX_mem_unit # (
.CWBQ_SIZE (`ICWBQ_SIZE),
.DWBQ_SIZE (`IDWBQ_SIZE),
.DFQQ_SIZE (`IDFQQ_SIZE),
.PRFQ_SIZE (`IPRFQ_SIZE),
.PRFQ_STRIDE (`IPRFQ_STRIDE),
.SNOOP_FORWARDING (0),
.DRAM_ENABLE (1),
.WRITE_ENABLE (0),

View File

@@ -14,7 +14,7 @@ module VX_pipeline #(
// Dcache core request
output wire [`NUM_THREADS-1:0] dcache_req_valid,
output wire [`NUM_THREADS-1:0] dcache_req_rw,
output wire dcache_req_rw,
output wire [`NUM_THREADS-1:0][3:0] dcache_req_byteen,
output wire [`NUM_THREADS-1:0][29:0] dcache_req_addr,
output wire [`NUM_THREADS-1:0][31:0] dcache_req_data,

View File

@@ -39,11 +39,11 @@ module Vortex (
input wire snp_rsp_ready,
// I/O request
output wire io_req_valid,
output wire [`NUM_THREADS-1:0] io_req_valid,
output wire io_req_rw,
output wire [3:0] io_req_byteen,
output wire [29:0] io_req_addr,
output wire [31:0] io_req_data,
output wire [`NUM_THREADS-1:0][3:0] io_req_byteen,
output wire [`NUM_THREADS-1:0][29:0] io_req_addr,
output wire [`NUM_THREADS-1:0][31:0] io_req_data,
output wire [`VX_CORE_TAG_WIDTH-1:0] io_req_tag,
input wire io_req_ready,
@@ -160,11 +160,11 @@ module Vortex (
wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] per_cluster_snp_rsp_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_snp_rsp_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_valid;
wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0] per_cluster_io_req_valid;
wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_rw;
wire [`NUM_CLUSTERS-1:0][3:0] per_cluster_io_req_byteen;
wire [`NUM_CLUSTERS-1:0][29:0] per_cluster_io_req_addr;
wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_io_req_data;
wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0][3:0] per_cluster_io_req_byteen;
wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0][29:0] per_cluster_io_req_addr;
wire [`NUM_CLUSTERS-1:0][`NUM_THREADS-1:0][31:0] per_cluster_io_req_data;
wire [`NUM_CLUSTERS-1:0][`L2CORE_TAG_WIDTH-1:0] per_cluster_io_req_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_io_req_ready;
@@ -254,7 +254,7 @@ module Vortex (
);
end
VX_mem_arb #(
VX_io_arb #(
.NUM_REQUESTS (`NUM_CLUSTERS),
.WORD_SIZE (4),
.TAG_IN_WIDTH (`L2CORE_TAG_WIDTH),
@@ -264,34 +264,34 @@ module Vortex (
.reset (reset),
// input requests
.in_mem_req_valid (per_cluster_io_req_valid),
.in_mem_req_rw (per_cluster_io_req_rw),
.in_mem_req_byteen (per_cluster_io_req_byteen),
.in_mem_req_addr (per_cluster_io_req_addr),
.in_mem_req_data (per_cluster_io_req_data),
.in_mem_req_tag (per_cluster_io_req_tag),
.in_mem_req_ready (per_cluster_io_req_ready),
.in_io_req_valid (per_cluster_io_req_valid),
.in_io_req_rw (per_cluster_io_req_rw),
.in_io_req_byteen (per_cluster_io_req_byteen),
.in_io_req_addr (per_cluster_io_req_addr),
.in_io_req_data (per_cluster_io_req_data),
.in_io_req_tag (per_cluster_io_req_tag),
.in_io_req_ready (per_cluster_io_req_ready),
// input responses
.in_mem_rsp_valid (per_cluster_io_rsp_valid),
.in_mem_rsp_data (per_cluster_io_rsp_data),
.in_mem_rsp_tag (per_cluster_io_rsp_tag),
.in_mem_rsp_ready (per_cluster_io_rsp_ready),
.in_io_rsp_valid (per_cluster_io_rsp_valid),
.in_io_rsp_data (per_cluster_io_rsp_data),
.in_io_rsp_tag (per_cluster_io_rsp_tag),
.in_io_rsp_ready (per_cluster_io_rsp_ready),
// output request
.out_mem_req_valid (io_req_valid),
.out_mem_req_rw (io_req_rw),
.out_mem_req_byteen (io_req_byteen),
.out_mem_req_addr (io_req_addr),
.out_mem_req_data (io_req_data),
.out_mem_req_tag (io_req_tag),
.out_mem_req_ready (io_req_ready),
.out_io_req_valid (io_req_valid),
.out_io_req_rw (io_req_rw),
.out_io_req_byteen (io_req_byteen),
.out_io_req_addr (io_req_addr),
.out_io_req_data (io_req_data),
.out_io_req_tag (io_req_tag),
.out_io_req_ready (io_req_ready),
// output response
.out_mem_rsp_valid (io_rsp_valid),
.out_mem_rsp_tag (io_rsp_tag),
.out_mem_rsp_data (io_rsp_data),
.out_mem_rsp_ready (io_rsp_ready)
.out_io_rsp_valid (io_rsp_valid),
.out_io_rsp_tag (io_rsp_tag),
.out_io_rsp_data (io_rsp_data),
.out_io_rsp_ready (io_rsp_ready)
);
VX_csr_io_arb #(
@@ -397,8 +397,6 @@ module Vortex (
.CWBQ_SIZE (`L3CWBQ_SIZE),
.DWBQ_SIZE (`L3DWBQ_SIZE),
.DFQQ_SIZE (`L3DFQQ_SIZE),
.PRFQ_SIZE (`L3PRFQ_SIZE),
.PRFQ_STRIDE (`L3PRFQ_STRIDE),
.DRAM_ENABLE (1),
.WRITE_ENABLE (1),
.SNOOP_FORWARDING (1),

View File

@@ -57,7 +57,7 @@ module VX_bank #(
// Core Request
input wire [NUM_REQUESTS-1:0] core_req_valid,
input wire [NUM_REQUESTS-1:0] core_req_rw,
input wire [`CORE_REQ_TAG_COUNT-1:0] core_req_rw,
input wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] core_req_data,

View File

@@ -18,7 +18,7 @@ module VX_bank_core_req_arb #(
// Enqueue Data
input wire reqq_push,
input wire [NUM_REQUESTS-1:0] bank_valids,
input wire [NUM_REQUESTS-1:0] bank_rw,
input wire [`CORE_REQ_TAG_COUNT-1:0] bank_rw,
input wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] bank_byteen,
input wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] bank_writedata,
input wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] bank_addr,
@@ -40,21 +40,21 @@ module VX_bank_core_req_arb #(
);
wire [NUM_REQUESTS-1:0] out_per_valids;
wire [NUM_REQUESTS-1:0] out_per_rw;
wire [`CORE_REQ_TAG_COUNT-1:0] out_per_rw;
wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] out_per_byteen;
wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] out_per_addr;
wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] out_per_writedata;
wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] out_per_tag;
reg [NUM_REQUESTS-1:0] use_per_valids;
reg [NUM_REQUESTS-1:0] use_per_rw;
reg [`CORE_REQ_TAG_COUNT-1:0] use_per_rw;
reg [NUM_REQUESTS-1:0][WORD_SIZE-1:0] use_per_byteen;
reg [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] use_per_addr;
reg [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] use_per_writedata;
reg [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] use_per_tag;
wire [NUM_REQUESTS-1:0] qual_valids;
wire [NUM_REQUESTS-1:0] qual_rw;
wire [`CORE_REQ_TAG_COUNT-1:0] qual_rw;
wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] qual_byteen;
wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] qual_addr;
wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] qual_writedata;
@@ -108,16 +108,17 @@ module VX_bank_core_req_arb #(
assign reqq_empty = !qual_has_request;
assign reqq_req_st0 = qual_has_request;
assign reqq_req_tid_st0 = qual_request_index;
assign reqq_req_rw_st0 = qual_rw[qual_request_index];
assign reqq_req_tid_st0 = qual_request_index;
assign reqq_req_byteen_st0 = qual_byteen[qual_request_index];
assign reqq_req_addr_st0 = qual_addr[qual_request_index];
assign reqq_req_writedata_st0 = qual_writedata[qual_request_index];
if (CORE_TAG_ID_BITS != 0) begin
assign reqq_req_tag_st0 = qual_tag;
assign reqq_req_rw_st0 = qual_rw;
end else begin
assign reqq_req_tag_st0 = qual_tag[qual_request_index];
assign reqq_req_tag_st0 = qual_tag[qual_request_index];
assign reqq_req_rw_st0 = qual_rw[qual_request_index];
end
`DEBUG_BLOCK(

View File

@@ -41,10 +41,6 @@ module VX_cache #(
// Enable snoop forwarding
parameter SNOOP_FORWARDING = 0,
// Prefetcher
parameter PRFQ_SIZE = 1,
parameter PRFQ_STRIDE = 0,
// core request tag size
parameter CORE_TAG_WIDTH = 42,
@@ -70,7 +66,7 @@ module VX_cache #(
// Core request
input wire [NUM_REQUESTS-1:0] core_req_valid,
input wire [NUM_REQUESTS-1:0] core_req_rw,
input wire [`CORE_REQ_TAG_COUNT-1:0] core_req_rw,
input wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] core_req_data,
@@ -246,7 +242,7 @@ module VX_cache #(
for (genvar i = 0; i < NUM_BANKS; i++) begin
wire [NUM_REQUESTS-1:0] curr_bank_core_req_valid;
wire [NUM_REQUESTS-1:0] curr_bank_core_req_rw;
wire [`CORE_REQ_TAG_COUNT-1:0] curr_bank_core_req_rw;
wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] curr_bank_core_req_byteen;
wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] curr_bank_core_req_addr;
wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] curr_bank_core_req_tag;
@@ -427,9 +423,7 @@ module VX_cache #(
.BANK_LINE_SIZE (BANK_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.DFQQ_SIZE (DFQQ_SIZE),
.PRFQ_SIZE (PRFQ_SIZE),
.PRFQ_STRIDE (PRFQ_STRIDE)
.DFQQ_SIZE (DFQQ_SIZE)
) cache_dram_req_arb (
.clk (clk),
.reset (reset),

View File

@@ -8,10 +8,7 @@ module VX_cache_dram_req_arb #(
// Size of a word in bytes
parameter WORD_SIZE = 0,
// Dram Fill Req Queue Size
parameter DFQQ_SIZE = 0,
// Prefetcher
parameter PRFQ_SIZE = 1,
parameter PRFQ_STRIDE = 0
parameter DFQQ_SIZE = 0
) (
input wire clk,
input wire reset,
@@ -38,32 +35,9 @@ module VX_cache_dram_req_arb #(
input wire dram_req_ready
);
wire pref_pop;
wire pref_valid;
wire[`DRAM_ADDR_WIDTH-1:0] pref_addr;
wire dwb_valid;
wire dfqq_req;
wire dwb_valid;
wire dfqq_req;
assign pref_pop = !dwb_valid && !dfqq_req && dram_req_ready && pref_valid;
VX_prefetcher #(
.PRFQ_SIZE (PRFQ_SIZE),
.PRFQ_STRIDE (PRFQ_STRIDE),
.BANK_LINE_SIZE(BANK_LINE_SIZE),
.WORD_SIZE (WORD_SIZE)
) prfqq (
.clk (clk),
.reset (reset),
.dram_req (dram_req_valid && !dram_req_rw),
.dram_req_addr(dram_req_addr),
.pref_pop (pref_pop),
.pref_valid (pref_valid),
.pref_addr (pref_addr)
);
wire[`DRAM_ADDR_WIDTH-1:0] dfqq_req_addr;
`DEBUG_BEGIN
@@ -110,10 +84,10 @@ module VX_cache_dram_req_arb #(
assign per_bank_dram_wb_req_ready[i] = dram_req_ready && (dwb_bank == `BANK_BITS'(i));
end
assign dram_req_valid = dwb_valid || dfqq_req || pref_pop;
assign dram_req_valid = dwb_valid || dfqq_req;
assign dram_req_rw = dwb_valid;
assign dram_req_byteen = dwb_valid ? per_bank_dram_wb_req_byteen[dwb_bank] : {BANK_LINE_SIZE{1'b1}};
assign dram_req_addr = dwb_valid ? per_bank_dram_wb_req_addr[dwb_bank] : (dfqq_req ? dfqq_req_addr : pref_addr);
assign dram_req_addr = dwb_valid ? per_bank_dram_wb_req_addr[dwb_bank] : dfqq_req_addr;
assign {dram_req_data} = dwb_valid ? per_bank_dram_wb_req_data[dwb_bank] : 0;
endmodule

View File

@@ -1,71 +0,0 @@
`include "VX_cache_config.vh"
// VX_prefetcher: records DRAM request addresses in a queue and derives
// follow-on addresses (previous address + BANK_LINE_SIZE) from them.
//
// NOTE: pref_valid is hard-wired to 0 below (see the TODO), so as written this
// module never advertises a prefetch address and the "pref_valid && pref_pop"
// update branch never executes.
module VX_prefetcher #(
// Size of line inside a bank in bytes
parameter BANK_LINE_SIZE = 0,
// Size of a word in bytes
parameter WORD_SIZE = 0,
// Depth of the queue holding observed DRAM request addresses
parameter PRFQ_SIZE = 1,
// Value loaded into the use_valid down-counter each time a new base address
// is taken from the queue
parameter PRFQ_STRIDE = 0
) (
input wire clk,
input wire reset,
// observed DRAM request and its address (pushed into the queue)
input wire dram_req,
input wire[`DRAM_ADDR_WIDTH-1:0] dram_req_addr,
// consumer accepted the current prefetch address
input wire pref_pop,
// prefetch address available / its value
output wire pref_valid,
output wire[`DRAM_ADDR_WIDTH-1:0] pref_addr
);
// remaining count for the current base address (reloaded from PRFQ_STRIDE)
reg[`LOG2UP(PRFQ_STRIDE):0] use_valid;
// current prefetch address
reg[`DRAM_ADDR_WIDTH-1:0] use_addr;
// head-of-queue status
wire current_valid;
wire[`DRAM_ADDR_WIDTH-1:0] current_addr;
wire current_full;
wire current_empty;
assign current_valid = !current_empty;
// take a new base address from the queue when the counter is 0 or 1 and the
// queue has an entry
wire update_use = ((use_valid == 0) || ((use_valid-1) == 0)) && current_valid;
VX_generic_queue #(
.DATAW(`DRAM_ADDR_WIDTH),
.SIZE(PRFQ_SIZE)
) pfq_queue (
.clk (clk),
.reset (reset),
// drop the incoming address when the queue is full or a prefetch is being popped
.push (dram_req && !current_full && !pref_pop),
.data_in (dram_req_addr),
.pop (update_use),
.data_out(current_addr),
.empty (current_empty),
.full (current_full),
`UNUSED_PIN (size)
);
assign pref_valid = 0; // TODO use_valid != 0;
assign pref_addr = use_addr;
always @(posedge clk) begin
if (reset) begin
use_valid <= 0;
use_addr <= 0;
end else begin
if (update_use) begin
// new base: reload the counter and point at the address after the queued one
// NOTE(review): adds BANK_LINE_SIZE (bytes) to a `DRAM_ADDR_WIDTH-bit address -
// confirm the address units match.
use_valid <= PRFQ_STRIDE;
use_addr <= current_addr + BANK_LINE_SIZE;
end else if (pref_valid && pref_pop) begin
// prefetch consumed: count down and advance to the next address
use_valid <= use_valid - 1;
use_addr <= use_addr + BANK_LINE_SIZE;
end
end
end
endmodule

View File

@@ -161,10 +161,16 @@ module VX_fp_addmul #(
defparam mac_fp_mul.adder_input_clock = "none";
defparam mac_fp_mul.accum_adder_clock = "none";
`else
integer fadd_h, fsub_h, fmul_h;
initial begin
fadd_h = dpi_register();
fsub_h = dpi_register();
fmul_h = dpi_register();
end
always @(posedge clk) begin
dpi_fadd(0*LANES+i, enable, dataa[i], datab[i], result_add);
dpi_fsub(1*LANES+i, enable, dataa[i], datab[i], result_sub);
dpi_fmul(2*LANES+i, enable, dataa[i], datab[i], result_mul);
dpi_fadd(fadd_h, enable, dataa[i], datab[i], result_add);
dpi_fsub(fsub_h, enable, dataa[i], datab[i], result_sub);
dpi_fmul(fmul_h, enable, dataa[i], datab[i], result_mul);
end
`endif

View File

@@ -39,8 +39,12 @@ module VX_fp_div #(
.q (result[i])
);
`else
integer fdiv_h;
initial begin
fdiv_h = dpi_register();
end
always @(posedge clk) begin
dpi_fdiv(8*LANES+i, enable, dataa[i], datab[i], result[i]);
dpi_fdiv(fdiv_h, enable, dataa[i], datab[i], result[i]);
end
`endif
end

View File

@@ -53,9 +53,14 @@ module VX_fp_ftoi #(
.q (result_u)
);
`else
integer ftoi_h, ftou_h;
initial begin
ftoi_h = dpi_register();
ftou_h = dpi_register();
end
always @(posedge clk) begin
dpi_ftoi(10*LANES+i, enable, dataa[i], result_s);
dpi_ftou(11*LANES+i, enable, dataa[i], result_u);
dpi_ftoi(ftoi_h, enable, dataa[i], result_s);
dpi_ftou(ftou_h, enable, dataa[i], result_u);
end
`endif

View File

@@ -53,9 +53,14 @@ module VX_fp_itof #(
.q (result_u)
);
`else
integer itof_h, utof_h;
initial begin
itof_h = dpi_register();
utof_h = dpi_register();
end
always @(posedge clk) begin
dpi_itof(12*LANES+i, enable, dataa[i], result_s);
dpi_utof(13*LANES+i, enable, dataa[i], result_u);
dpi_itof(itof_h, enable, dataa[i], result_s);
dpi_utof(utof_h, enable, dataa[i], result_u);
end
`endif

View File

@@ -121,9 +121,14 @@ module VX_fp_madd #(
defparam mac_fp_msub.adder_input_clock = "0";
defparam mac_fp_msub.accum_adder_clock = "none";
`else
integer fmadd_h, fmsub_h;
initial begin
fmadd_h = dpi_register();
fmsub_h = dpi_register();
end
always @(posedge clk) begin
dpi_fmadd(3*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
dpi_fmsub(4*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
dpi_fmadd(fmadd_h, enable, dataa[i], datab[i], datac[i], result_madd);
dpi_fmsub(fmsub_h, enable, dataa[i], datab[i], datac[i], result_msub);
end
`endif

View File

@@ -37,8 +37,12 @@ module VX_fp_sqrt #(
.q (result[i])
);
`else
integer fsqrt_h;
initial begin
fsqrt_h = dpi_register();
end
always @(posedge clk) begin
dpi_fsqrt(9*LANES+i, enable, dataa[i], result[i]);
dpi_fsqrt(fsqrt_h, enable, dataa[i], result[i]);
end
`endif
end

View File

@@ -3,11 +3,13 @@
#include <unordered_map>
#include <vector>
#include <mutex>
#include <iostream>
#include "svdpi.h"
#include "verilated_vpi.h"
#include "VX_config.h"
extern "C" {
int dpi_register();
void dpi_fadd(int inst, bool enable, int a, int b, int* result);
void dpi_fsub(int inst, bool enable, int a, int b, int* result);
void dpi_fmul(int inst, bool enable, int a, int b, int* result);
@@ -66,19 +68,28 @@ union Float_t {
class Instances {
public:
ShiftRegister& get(int inst) {
mutex_.lock();
ShiftRegister& sr = instances_[inst];
return instances_.at(inst);
}
int allocate() {
mutex_.lock();
int inst = instances_.size();
instances_.resize(inst + 1);
mutex_.unlock();
return sr;
return inst;
}
private:
std::unordered_map<int, ShiftRegister> instances_;
std::vector<ShiftRegister> instances_;
std::mutex mutex_;
};
Instances instances;
// DPI entry point: returns the result of instances.allocate(), which the
// caller keeps and passes as the `inst` argument of the other dpi_* calls.
int dpi_register() {
return instances.allocate();
}
void dpi_fadd(int inst, bool enable, int a, int b, int* result) {
ShiftRegister& sr = instances.get(inst);

View File

@@ -1,6 +1,8 @@
`ifndef FLOAT_DPI
`define FLOAT_DPI
import "DPI-C" context function int dpi_register();
import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, output int result);

View File

@@ -11,7 +11,7 @@ interface VX_cache_core_req_if #(
) ();
wire [NUM_REQUESTS-1:0] valid;
wire [NUM_REQUESTS-1:0] rw;
wire [`CORE_REQ_TAG_COUNT-1:0] rw;
wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] byteen;
wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] addr;
wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] data;

View File

@@ -42,7 +42,7 @@ gen-s:
verilator $(VF) -DNDEBUG $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG $(SINGLECORE)'
gen-sd:
verilator $(VF) $(SINGLECORE) -CFLAGS '$(CF) -g -O0 $(DBG) $(SINGLECORE)' --trace $(DBG)
verilator $(VF) -O0 $(SINGLECORE) -CFLAGS '$(CF) -O0 -g $(DBG) $(SINGLECORE)' --trace $(DBG)
gen-st:
verilator $(VF) -DNDEBUG $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(SINGLECORE)' --threads $(THREADS)
@@ -51,7 +51,7 @@ gen-m:
verilator $(VF) -DNDEBUG $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG $(MULTICORE)'
gen-md:
verilator $(VF) $(MULTICORE) -CFLAGS '$(CF) -g -O0 $(DBG) $(MULTICORE)' --trace $(DBG)
verilator $(VF) $(MULTICORE) -CFLAGS '$(CF) -O0 -g $(DBG) $(MULTICORE)' --trace $(DBG)
gen-mt:
verilator $(VF) -DNDEBUG $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(MULTICORE)' --threads $(THREADS)
@@ -60,7 +60,7 @@ build-s: gen-s
(cd obj_dir && make -j -f VVortex.mk)
build-sd: gen-sd
(cd obj_dir && make -j -f VVortex.mk)
(cd obj_dir && OPT_FAST="-O0 -g" make -j -f VVortex.mk)
build-st: gen-st
(cd obj_dir && make -j -f VVortex.mk)
@@ -69,7 +69,7 @@ build-m: gen-m
(cd obj_dir && make -j -f VVortex.mk)
build-md: gen-md
(cd obj_dir && make -j -f VVortex.mk)
(cd obj_dir && OPT_FAST="-O0 -g" make -j -f VVortex.mk)
build-mt: gen-mt
(cd obj_dir && make -j -f VVortex.mk)
@@ -79,7 +79,7 @@ run-s: build-s
(cd obj_dir && ./VVortex)
run-sd: build-sd
(cd obj_dir && ./VVortex)
(cd obj_dir && valgrind ./VVortex)
run-st: build-st
(cd obj_dir && ./VVortex)

View File

@@ -8,6 +8,9 @@
#define DRAM_RQ_SIZE 16
#define DRAM_STALLS_MODULO 16
#define VL_WDATA_GETW(lwp, i, n, w) \
VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)
uint64_t timestamp = 0;
double sc_time_stamp() {
@@ -35,9 +38,18 @@ Simulator::Simulator() {
vortex_->trace(trace_, 99);
trace_->open("trace.vcd");
#endif
// reset the device
this->reset();
}
Simulator::~Simulator() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (str.size()) {
std::cout << "#" << buf.first << ": " << buf.second.str() << std::endl;
}
}
#ifdef VCD_OUTPUT
trace_->close();
#endif
@@ -158,12 +170,20 @@ void Simulator::eval_dram_bus() {
}
void Simulator::eval_io_bus() {
if (vortex_->io_req_valid
&& vortex_->io_req_rw
&& ((vortex_->io_req_addr << 2) == IO_BUS_ADDR_COUT)) {
uint32_t data_write = (uint32_t)vortex_->io_req_data;
char c = (char)data_write;
std::cout << c;
for (int i = 0; i < NUM_THREADS; ++i) {
if (((vortex_->io_req_valid >> i) & 0x1)
&& ((VL_WDATA_GETW(vortex_->io_req_addr, i, NUM_THREADS, 30) << 2) == IO_BUS_ADDR_COUT)) {
assert(vortex_->io_req_rw);
int data = vortex_->io_req_data[i];
int tid = data >> 16;
char c = data & 0xff;
auto& ss_buf = print_bufs_[tid];
ss_buf << c;
if (c == '\n') {
std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush;
ss_buf.str("");
}
}
}
vortex_->io_req_ready = 1;
vortex_->io_rsp_valid = 0;
@@ -229,9 +249,15 @@ void Simulator::wait(uint32_t cycles) {
}
bool Simulator::is_busy() const {
return vortex_->busy
|| snp_req_active_
|| csr_req_active_;
return vortex_->busy;
}
// Returns the current value of the snp_req_active_ flag.
bool Simulator::snp_req_active() const {
return snp_req_active_;
}
// Returns the current value of the csr_req_active_ flag.
bool Simulator::csr_req_active() const {
return csr_req_active_;
}
void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) {
@@ -290,10 +316,7 @@ void Simulator::get_csr(int core_id, int addr, unsigned *value) {
void Simulator::run() {
#ifndef NDEBUG
std::cout << timestamp << ": [sim] run()" << std::endl;
#endif
// reset the device
this->reset();
#endif
// execute program
while (vortex_->busy

View File

@@ -13,6 +13,8 @@
#include <ostream>
#include <vector>
#include <sstream>
#include <unordered_map>
class Simulator {
public:
@@ -25,7 +27,10 @@ public:
void load_bin(const char* program_file);
void load_ihex(const char* program_file);
bool is_busy() const;
bool is_busy() const;
bool snp_req_active() const;
bool csr_req_active() const;
void reset();
void step();
@@ -48,6 +53,8 @@ private:
unsigned tag;
} dram_req_t;
std::unordered_map<int, std::stringstream> print_bufs_;
void eval();
void eval_dram_bus();