Merge branch 'master' into graphics

This commit is contained in:
Blaise Tine
2021-08-02 23:57:53 -07:00
61 changed files with 723 additions and 860462 deletions

View File

@@ -337,9 +337,10 @@
// SM Configurable Knobs //////////////////////////////////////////////////////
// per thread stack size
`ifndef STACK_SIZE
`define STACK_SIZE 1024
`ifndef STACK_LOG2_SIZE
`define STACK_LOG2_SIZE 10
`endif
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
// Size of cache in bytes
`ifndef SMEM_SIZE

View File

@@ -91,8 +91,8 @@ module VX_ibuffer #(
///////////////////////////////////////////////////////////////////////////
reg [`NUM_WARPS-1:0] valid_table, valid_table_n;
reg [`NUM_WARPS-1:0] schedule_table, schedule_table_n;
reg [`NW_BITS-1:0] deq_wid, deq_wid_n;
reg [`NW_BITS-1:0] deq_wid_rr, deq_wid_rr_n;
reg deq_valid, deq_valid_n;
reg [DATAW-1:0] deq_instr, deq_instr_n;
reg [NWARPSW-1:0] num_warps;
@@ -108,34 +108,34 @@ module VX_ibuffer #(
end
end
// round-robin warp scheduling
VX_rr_arbiter #(
.NUM_REQS (`NUM_WARPS)
) rr_arbiter (
.clk (clk),
.reset (reset),
.enable (ibuffer_if.ready),
.requests (valid_table_n),
.grant_index (deq_wid_rr_n),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (grant_valid)
);
// schedule the next instruction to issue
always @(*) begin
deq_valid_n = enq_fire;
deq_wid_n = decode_if.wid;
deq_instr_n = q_data_in;
if (num_warps > 1) begin
deq_valid_n = 1;
for (integer i = `NUM_WARPS-1; i >= 0; --i) begin
if (schedule_table[i]) begin
deq_wid_n = `NW_BITS'(i);
deq_instr_n = q_data_out[i];
end
end
deq_wid_n = deq_wid_rr;
deq_instr_n = q_data_out[deq_wid_rr];
end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin
deq_valid_n = 1;
deq_wid_n = deq_wid;
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
end
end
// do round-robin scheduling with multiple active warps
always @(*) begin
if ($countones(schedule_table) <= 1) begin
schedule_table_n = valid_table_n;
end else begin
schedule_table_n = schedule_table;
deq_valid_n = enq_fire;
deq_wid_n = decode_if.wid;
deq_instr_n = q_data_in;
end
schedule_table_n[deq_wid_n] = 0;
end
wire warp_added = enq_fire && q_empty[decode_if.wid];
@@ -143,14 +143,14 @@ module VX_ibuffer #(
always @(posedge clk) begin
if (reset) begin
valid_table <= 0;
deq_valid <= 0;
num_warps <= 0;
schedule_table <= 0;
valid_table <= 0;
deq_valid <= 0;
num_warps <= 0;
deq_wid_rr <= 0;
end else begin
valid_table <= valid_table_n;
deq_valid <= deq_valid_n;
schedule_table <= schedule_table_n;
valid_table <= valid_table_n;
deq_valid <= deq_valid_n;
deq_wid_rr <= deq_wid_rr_n;
if (warp_added && !warp_removed) begin
num_warps <= num_warps + NWARPSW'(1);

View File

@@ -38,7 +38,8 @@ module VX_instr_demux (
wire alu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32))
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) alu_buffer (
.clk (clk),
.reset (reset),
@@ -56,7 +57,8 @@ module VX_instr_demux (
wire lsu_is_fence = `LSU_IS_FENCE(ibuffer_if.op_mod);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32))
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) lsu_buffer (
.clk (clk),
.reset (reset),
@@ -73,7 +75,8 @@ module VX_instr_demux (
wire csr_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32),
.OUTPUT_REG (1)
) csr_buffer (
.clk (clk),
.reset (reset),
@@ -91,7 +94,8 @@ module VX_instr_demux (
wire fpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32))
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) fpu_buffer (
.clk (clk),
.reset (reset),
@@ -112,7 +116,8 @@ module VX_instr_demux (
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) //update number of bits
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) gpu_buffer (
.clk (clk),
.reset (reset),

View File

@@ -339,17 +339,17 @@ module VX_lsu_unit #(
if (dcache_req_if.rw[0]) begin
$write("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
$write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`PRINT_ARRAY1D(req_addr_type, `NUM_THREADS);
$write(", data=");
`PRINT_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
$write("\n");
$write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`PRINT_ARRAY1D(req_addr_type, `NUM_THREADS);
$write(", data=");
`PRINT_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
$write("\n");
end else begin
$write("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
$write("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
$write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`PRINT_ARRAY1D(req_addr_type, `NUM_THREADS);
$write(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup);
$write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`PRINT_ARRAY1D(req_addr_type, `NUM_THREADS);
$write(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup);
end
end
if (dcache_rsp_fire) begin

View File

@@ -287,13 +287,14 @@ module VX_mem_unit # (
assign dcache_req_tmp_if.byteen = dcache_req_if.byteen;
assign dcache_req_tmp_if.data = dcache_req_if.data;
assign dcache_req_tmp_if.tag = dcache_req_if.tag;
assign dcache_req_tmp_if.ready = dcache_req_if.ready;
assign dcache_req_if.ready = dcache_req_tmp_if.ready;
// D-cache to core reponse
assign dcache_rsp_if.valid = dcache_rsp_tmp_if.valid;
assign dcache_rsp_if.tmask = dcache_rsp_tmp_if.tmask;
assign dcache_rsp_if.tag = dcache_rsp_tmp_if.tag;
assign dcache_rsp_if.data = dcache_rsp_tmp_if.data;
assign dcache_rsp_if.ready = dcache_rsp_tmp_if.ready;
assign dcache_rsp_tmp_if.ready = dcache_rsp_if.ready;
end
wire [`DMEM_TAG_WIDTH-1:0] icache_mem_req_tag = `DMEM_TAG_WIDTH'(icache_mem_req_if.tag);

View File

@@ -92,7 +92,7 @@ module VX_cache #(
`STATIC_ASSERT(NUM_PORTS <= NUM_BANKS, ("invalid value"))
localparam CORE_TAG_X_WIDTH = CORE_TAG_WIDTH - NC_ENABLE;
localparam CORE_TAG_ID_X_BITS = CORE_TAG_ID_BITS - NC_ENABLE;
localparam CORE_TAG_ID_X_BITS = (CORE_TAG_ID_BITS != 0) ? (CORE_TAG_ID_BITS - NC_ENABLE) : CORE_TAG_ID_BITS;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
@@ -125,13 +125,13 @@ module VX_cache #(
wire [CACHE_LINE_SIZE-1:0] mem_req_byteen_nc;
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc;
wire [`CACHE_LINE_WIDTH-1:0] mem_req_data_nc;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_nc;
wire [`MEM_ADDR_WIDTH-1:0] mem_req_tag_nc;
wire mem_req_ready_nc;
// Memory response
wire mem_rsp_valid_nc;
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_nc;
wire [`MEM_ADDR_WIDTH-1:0] mem_rsp_tag_nc;
wire mem_rsp_ready_nc;
if (NC_ENABLE) begin
@@ -146,7 +146,8 @@ module VX_cache #(
.MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH),
.MEM_DATA_SIZE (CACHE_LINE_SIZE),
.MEM_TAG_WIDTH (MEM_TAG_WIDTH)
.MEM_TAG_IN_WIDTH (`MEM_ADDR_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH)
) nc_bypass (
.clk (clk),
.reset (reset),
@@ -245,12 +246,9 @@ module VX_cache #(
///////////////////////////////////////////////////////////////////////////
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_qual;
wire [`MEM_ADDR_WIDTH-1:0] mem_rsp_tag_nc_a, mem_rsp_tag_qual;
wire [`MEM_ADDR_WIDTH-1:0] mem_rsp_tag_qual;
wire mrsq_out_valid, mrsq_out_ready;
// trim out non-cacheable flags
assign mem_rsp_tag_nc_a = mem_rsp_tag_nc[NC_ENABLE +: `MEM_ADDR_WIDTH];
VX_elastic_buffer #(
.DATAW (`MEM_ADDR_WIDTH + `CACHE_LINE_WIDTH),
@@ -261,7 +259,7 @@ module VX_cache #(
.reset (reset),
.ready_in (mem_rsp_ready_nc),
.valid_in (mem_rsp_valid_nc),
.data_in ({mem_rsp_tag_nc_a, mem_rsp_data_nc}),
.data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}),
.data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}),
.ready_out (mrsq_out_ready),
.valid_out (mrsq_out_valid)
@@ -545,12 +543,7 @@ module VX_cache #(
.ready_out (mem_req_ready_nc)
);
// build memory tag adding non-cacheable flag
if (NC_ENABLE) begin
assign mem_req_tag_nc = MEM_TAG_WIDTH'({mem_req_addr_nc, 1'b0});
end else begin
assign mem_req_tag_nc = MEM_TAG_WIDTH'(mem_req_addr_nc);
end
assign mem_req_tag_nc = mem_req_addr_nc;
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes

View File

@@ -69,4 +69,4 @@
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}
`endif
`endif

View File

@@ -11,7 +11,8 @@ module VX_nc_bypass #(
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_DATA_SIZE = 1,
parameter MEM_TAG_WIDTH = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
localparam CORE_DATA_WIDTH = CORE_DATA_SIZE * 8,
localparam MEM_DATA_WIDTH = MEM_DATA_SIZE * 8,
@@ -58,7 +59,7 @@ module VX_nc_bypass #(
input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in,
input wire [MEM_TAG_WIDTH-1:0] mem_req_tag_in,
input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
// Memory request out
@@ -67,19 +68,19 @@ module VX_nc_bypass #(
output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag_out,
output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
// Memory response in
input wire mem_rsp_valid_in,
input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_in,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_in,
input wire [MEM_TAG_OUT_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_ready_in,
// Memory response out
output wire mem_rsp_valid_out,
output wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_out,
output wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_out,
output wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_out,
input wire mem_rsp_ready_out
);
`STATIC_ASSERT((NUM_RSP_TAGS == 1 || NUM_RSP_TAGS == NUM_REQS), ("invalid paramter"))
@@ -129,7 +130,7 @@ module VX_nc_bypass #(
.N (CORE_TAG_IN_WIDTH),
.S (1),
.POS (NC_TAG_BIT)
) bits_remove (
) core_req_tag_remove (
.data_in (core_req_tag_in[i]),
.data_out (core_req_tag_out[i])
);
@@ -150,6 +151,18 @@ module VX_nc_bypass #(
assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid;
assign mem_req_ready_in = mem_req_ready_out;
wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_nc;
VX_bits_insert #(
.N (MEM_TAG_IN_WIDTH),
.S (1),
.POS (NC_TAG_BIT)
) mem_req_tag_insert (
.data_in (mem_req_tag_in),
.sel_in ('0),
.data_out (mem_req_tag_in_nc)
);
if (NUM_REQS > 1) begin
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel;
@@ -188,10 +201,10 @@ module VX_nc_bypass #(
mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in_sel;
end
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_tag_out = mem_req_valid_in ? mem_req_tag_in : MEM_TAG_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel});
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel});
end else begin
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel;
assign mem_req_tag_out = mem_req_valid_in ? mem_req_tag_in : MEM_TAG_WIDTH'({core_req_nc_tid, core_req_tag_in_sel});
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel});
end
end else begin
`UNUSED_VAR (core_req_nc_tid)
@@ -212,10 +225,10 @@ module VX_nc_bypass #(
mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in;
end
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_tag_out = mem_req_valid_in ? mem_req_tag_in : MEM_TAG_WIDTH'({req_addr_idx, core_req_tag_in});
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({req_addr_idx, core_req_tag_in});
end else begin
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in;
assign mem_req_tag_out = mem_req_valid_in ? mem_req_tag_in : MEM_TAG_WIDTH'(core_req_tag_in);
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'(core_req_tag_in);
end
end
@@ -230,7 +243,7 @@ module VX_nc_bypass #(
.N (CORE_TAG_OUT_WIDTH),
.S (1),
.POS (NC_TAG_BIT)
) bits_remove (
) core_rsp_tag_insert (
.data_in (core_rsp_tag_in[i]),
.sel_in ('0),
.data_out (core_rsp_tag_out_unqual[i])
@@ -298,7 +311,15 @@ module VX_nc_bypass #(
assign mem_rsp_valid_out = mem_rsp_valid_in && ~mem_rsp_tag_in[NC_TAG_BIT];
assign mem_rsp_data_out = mem_rsp_data_in;
assign mem_rsp_tag_out = mem_rsp_tag_in;
VX_bits_remove #(
.N (MEM_TAG_IN_WIDTH+1),
.S (1),
.POS (NC_TAG_BIT)
) mem_rsp_tag_remove (
.data_in (mem_rsp_tag_in[(MEM_TAG_IN_WIDTH+1)-1:0]),
.data_out (mem_rsp_tag_out)
);
if (NUM_RSP_TAGS > 1) begin
wire [CORE_REQ_TIDW-1:0] rsp_tid = mem_rsp_tag_in[(CORE_TAG_IN_WIDTH + D) +: CORE_REQ_TIDW];
@@ -307,4 +328,4 @@ module VX_nc_bypass #(
assign mem_rsp_ready_in = is_mem_rsp_nc ? (~core_rsp_valid_in && core_rsp_ready_out) : mem_rsp_ready_out;
end
endmodule
endmodule

View File

@@ -241,7 +241,7 @@ def expand_text(text, params):
while True:
if iter > 65536:
raise Exception("Macro recursion!")
raise Exception("Macro recursion!")
has_func = False
while True:
params_updated = False